/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>

#include <sys/vmparam.h>

#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <libkern/OSAtomic.h>
kern_return_t thread_terminate(thread_t); /* XXX */
#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP			6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP			3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP			(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC		4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC		2

#define NFS_BUF_FREEUP() \
	do { \
		/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
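/*
 * Illustrative sketch (hypothetical usage, not taken verbatim from this file):
 * a caller that has just released a buffer lets the macro decide whether a
 * freeup pass is worthwhile.  With nfsbufcnt == 1024, nfs_buf_freeup(0) only
 * runs once the regular free list exceeds 1024/LRU_FREEUP_MIN_FRAC == 256
 * entries (or the meta free list exceeds 1024/META_FREEUP_MIN_FRAC == 512),
 * and only while freeing TOTAL_TO_FREEUP buffers would still leave more than
 * nfsbufmin buffers allocated:
 *
 *	nfs_buf_release(bp, 1);
 *	NFS_BUF_FREEUP();
 */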
/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;

	nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}
/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
		NFSBUF_FREE_PERIOD * 1000);
}
/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred))
			kauth_cred_unref(&fbp->nb_rcred);
		if (IS_VALID_CRED(fbp->nb_wcred))
			kauth_cred_unref(&fbp->nb_wcred);
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
			kfree(fbp->nb_data, fbp->nb_bufsize);
		FREE(fbp, M_TEMP);
	}
}
/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST)
		panic("nfsbuf not on free list");
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
}
/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;

	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno))
		rv = TRUE;
	else
		rv = FALSE;
	lck_mtx_unlock(nfs_buf_mutex);
	return (rv);
}
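/*
 * Usage sketch (illustrative only; "np" and "biosize" are assumed to be a
 * valid nfsnode and its mount's I/O size, they are not defined here): callers
 * can probe whether the block covering a file offset is already cached before
 * deciding how to service a request:
 *
 *	daddr64_t lbn = (daddr64_t)(offset / biosize);
 *	if (nfs_buf_is_incore(np, lbn)) {
 *		... the data may be served from the cached nfsbuf ...
 *	}
 */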
/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next)
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return (bp);
			}
		}
	return (NULL);
}
/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (!nmp)
		return (ENXIO);

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp)
		goto out;
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if (bp->nb_dirtyend <= start ||
		    bp->nb_dirtyoff >= (start + PAGE_SIZE))
			error = 0;
		else
			error = EBUSY;
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return (error);
}
/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
				&upl, NULL, upl_flags);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return (EINVAL);
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return (EIO);
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return (0);
}
/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize)
		SET(bp->nb_flags, NB_CACHE);
	else
		CLR(bp->nb_flags, NB_CACHE);

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize)
			break;
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp, i);
		if (upl_dirty_page(pl, i))
			NBPGDIRTY_SET(bp, i);
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize)
			bp->nb_validend = filesize - fileoffset;
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}
/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data)
		return (0);
	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (EINVAL);

	kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS)
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	if (bp->nb_data == 0)
		panic("ubc_upl_map mapped 0");
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return (0);
}
/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	int pg, npg;

	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff/PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp,pg))
		pg--;
	bp->nb_validoff = (pg+1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize/PAGE_SIZE;
	pg = bp->nb_validend/PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp,pg))
		pg++;
	bp->nb_validend = pg * PAGE_SIZE;
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
		bp->nb_validend = np->n_size % bp->nb_bufsize;
}
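/*
 * Worked example (illustrative, assuming 4KB pages and an 8KB buffer): if the
 * read/write code leaves nb_validoff == 5000 and nb_validend == 7000 while
 * page 1 (buffer offsets 4096..8191) is the only valid page, the loops above
 * pull nb_validoff back to 4096 and push nb_validend forward to 8192, so the
 * valid range exactly covers the contiguous run of valid pages.  If the
 * buffer extends past np->n_size, nb_validend is then clipped to the EOF
 * offset within the buffer.
 */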
/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
		if (error)
			break;
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
			nfs_buf_check_write_verifier(np, bp);
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
}
/*
 * thread to service the delayed write queue when asked
 */
void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { 30, 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	thread_terminate(current_thread());
}
/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri))
		return;
	if (!locked)
		lck_mtx_lock(nfs_buf_mutex);
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd)
		wakeup(&nfsbufdelwrithd);
	else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS)
		thread_deallocate(nfsbufdelwrithd);
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd)
		nfs_buf_delwri_service();
	if (!locked)
		lck_mtx_unlock(nfs_buf_mutex);
}
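/*
 * Usage sketch (illustrative, mirroring the write path later in this file):
 * code that notices too many delayed writes piling up can nudge the service
 * thread without holding nfs_buf_mutex and then briefly yield so it can make
 * progress:
 *
 *	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
 *		nfs_buf_delwri_push(0);
 *		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
 *	}
 */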
610 * Returns errno on error, 0 otherwise.
611 * Any buffer is returned in *bpp.
613 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
614 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
616 * Check for existence of buffer in cache.
617 * Or attempt to reuse a buffer from one of the free lists.
618 * Or allocate a new buffer if we haven't already hit max allocation.
619 * Or wait for a free buffer.
621 * If available buffer found, prepare it, and return it.
623 * If the calling process is interrupted by a signal for
624 * an interruptible mount point, return EINTR.
635 vnode_t vp
= NFSTOV(np
);
636 struct nfsmount
*nmp
= VTONMP(vp
);
639 int slpflag
= PCATCH
;
640 int operation
= (flags
& NBLK_OPMASK
);
644 FSDBG_TOP(541, np
, blkno
, size
, flags
);
648 if (bufsize
> NFS_MAXBSIZE
)
649 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
652 FSDBG_BOT(541, np
, blkno
, 0, ENXIO
);
656 if (!UBCINFOEXISTS(vp
)) {
657 operation
= NBLK_META
;
658 } else if (bufsize
< nmp
->nm_biosize
) {
659 /* reg files should always have biosize blocks */
660 bufsize
= nmp
->nm_biosize
;
663 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
664 if ((operation
== NBLK_WRITE
) && (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
)) {
665 FSDBG_TOP(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
667 /* poke the delwri list */
668 nfs_buf_delwri_push(0);
670 /* sleep to let other threads run... */
671 tsleep(&nfs_nbdwrite
, PCATCH
, "nfs_nbdwrite", 1);
672 FSDBG_BOT(542, np
, blkno
, nfs_nbdwrite
, NFS_A_LOT_OF_DELAYED_WRITES
);
676 lck_mtx_lock(nfs_buf_mutex
);
678 /* check for existence of nfsbuf in cache */
679 if ((bp
= nfs_buf_incore(np
, blkno
))) {
680 /* if busy, set wanted and wait */
681 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
682 if (flags
& NBLK_NOWAIT
) {
683 lck_mtx_unlock(nfs_buf_mutex
);
684 FSDBG_BOT(541, np
, blkno
, bp
, 0xbcbcbcbc);
687 FSDBG_TOP(543, np
, blkno
, bp
, bp
->nb_flags
);
688 SET(bp
->nb_lflags
, NBL_WANTED
);
692 error
= msleep(bp
, nfs_buf_mutex
, slpflag
|(PRIBIO
+1)|PDROP
,
693 "nfsbufget", (slpflag
== PCATCH
) ? NULL
: &ts
);
694 if (error
== EWOULDBLOCK
)
697 FSDBG_BOT(543, np
, blkno
, bp
, bp
->nb_flags
);
698 if (error
|| ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0)))) {
699 FSDBG_BOT(541, np
, blkno
, 0, error
);
704 if (bp
->nb_bufsize
!= bufsize
)
705 panic("nfsbuf size mismatch");
706 SET(bp
->nb_lflags
, NBL_BUSY
);
707 SET(bp
->nb_flags
, NB_CACHE
);
709 /* additional paranoia: */
710 if (ISSET(bp
->nb_flags
, NB_PAGELIST
))
711 panic("pagelist buffer was not busy");
715 if (flags
& NBLK_ONLYVALID
) {
716 lck_mtx_unlock(nfs_buf_mutex
);
717 FSDBG_BOT(541, np
, blkno
, 0, 0x0000cace);
722 * where to get a free buffer:
723 * - if meta and maxmeta reached, must reuse meta
724 * - alloc new if we haven't reached min bufs
725 * - if free lists are NOT empty
726 * - if free list is stale, use it
727 * - else if freemeta list is stale, use it
728 * - else if max bufs allocated, use least-time-to-stale
729 * - alloc new if we haven't reached max allowed
730 * - start clearing out delwri list and try again
733 if ((operation
== NBLK_META
) && (nfsbufmetacnt
>= nfsbufmetamax
)) {
734 /* if we've hit max meta buffers, must reuse a meta buffer */
735 bp
= TAILQ_FIRST(&nfsbuffreemeta
);
736 } else if ((nfsbufcnt
> nfsbufmin
) &&
737 (!TAILQ_EMPTY(&nfsbuffree
) || !TAILQ_EMPTY(&nfsbuffreemeta
))) {
738 /* try to pull an nfsbuf off a free list */
739 struct nfsbuf
*lrubp
, *metabp
;
743 /* if the next LRU or META buffer is invalid or stale, use it */
744 lrubp
= TAILQ_FIRST(&nfsbuffree
);
745 if (lrubp
&& (!NBUFSTAMPVALID(lrubp
) ||
746 ((lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
) < now
.tv_sec
)))
748 metabp
= TAILQ_FIRST(&nfsbuffreemeta
);
749 if (!bp
&& metabp
&& (!NBUFSTAMPVALID(metabp
) ||
750 ((metabp
->nb_timestamp
+ NFSBUF_META_STALE
) < now
.tv_sec
)))
753 if (!bp
&& (nfsbufcnt
>= nfsbufmax
)) {
754 /* we've already allocated all bufs, so */
755 /* choose the buffer that'll go stale first */
761 int32_t lru_stale_time
, meta_stale_time
;
762 lru_stale_time
= lrubp
->nb_timestamp
+ NFSBUF_LRU_STALE
;
763 meta_stale_time
= metabp
->nb_timestamp
+ NFSBUF_META_STALE
;
764 if (lru_stale_time
<= meta_stale_time
)
773 /* we have a buffer to reuse */
774 FSDBG(544, np
, blkno
, bp
, bp
->nb_flags
);
776 if (ISSET(bp
->nb_flags
, NB_DELWRI
))
777 panic("nfs_buf_get: delwri");
778 SET(bp
->nb_lflags
, NBL_BUSY
);
779 /* disassociate buffer from previous nfsnode */
781 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
782 LIST_REMOVE(bp
, nb_vnbufs
);
783 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
787 LIST_REMOVE(bp
, nb_hash
);
788 /* nuke any creds we're holding */
789 if (IS_VALID_CRED(bp
->nb_rcred
))
790 kauth_cred_unref(&bp
->nb_rcred
);
791 if (IS_VALID_CRED(bp
->nb_wcred
))
792 kauth_cred_unref(&bp
->nb_wcred
);
793 /* if buf will no longer be NB_META, dump old buffer */
794 if (operation
== NBLK_META
) {
795 if (!ISSET(bp
->nb_flags
, NB_META
))
797 } else if (ISSET(bp
->nb_flags
, NB_META
)) {
799 kfree(bp
->nb_data
, bp
->nb_bufsize
);
804 /* re-init buf fields */
806 bp
->nb_validoff
= bp
->nb_validend
= -1;
807 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
812 /* no buffer to reuse */
813 if ((nfsbufcnt
< nfsbufmax
) &&
814 ((operation
!= NBLK_META
) || (nfsbufmetacnt
< nfsbufmetamax
))) {
815 /* just alloc a new one */
816 MALLOC(bp
, struct nfsbuf
*, sizeof(struct nfsbuf
), M_TEMP
, M_WAITOK
);
818 lck_mtx_unlock(nfs_buf_mutex
);
819 FSDBG_BOT(541, np
, blkno
, 0, error
);
825 * If any excess bufs, make sure the timer
826 * is running to free them up later.
828 if (nfsbufcnt
> nfsbufmin
&& !nfs_buf_timer_on
) {
829 nfs_buf_timer_on
= 1;
830 nfs_interval_timer_start(nfs_buf_timer_call
,
831 NFSBUF_FREE_PERIOD
* 1000);
834 if (operation
== NBLK_META
)
838 bzero(bp
, sizeof(*bp
));
839 bp
->nb_free
.tqe_next
= NFSNOLIST
;
840 bp
->nb_validoff
= bp
->nb_validend
= -1;
841 FSDBG(545, np
, blkno
, bp
, 0);
843 /* too many bufs... wait for buffers to free up */
844 FSDBG_TOP(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
846 /* poke the delwri list */
847 nfs_buf_delwri_push(1);
850 error
= msleep(&nfsneedbuffer
, nfs_buf_mutex
, PCATCH
|PDROP
, "nfsbufget", NULL
);
851 FSDBG_BOT(546, np
, blkno
, nfsbufcnt
, nfsbufmax
);
852 if (error
|| ((error
= nfs_sigintr(VTONMP(vp
), NULL
, thd
, 0)))) {
853 FSDBG_BOT(541, np
, blkno
, 0, error
);
861 bp
->nb_lflags
= NBL_BUSY
;
863 bp
->nb_lblkno
= blkno
;
864 /* insert buf in hash */
865 LIST_INSERT_HEAD(NFSBUFHASH(np
, blkno
), bp
, nb_hash
);
866 /* associate buffer with new nfsnode */
868 LIST_INSERT_HEAD(&np
->n_cleanblkhd
, bp
, nb_vnbufs
);
873 lck_mtx_unlock(nfs_buf_mutex
);
877 SET(bp
->nb_flags
, NB_META
);
878 if ((bp
->nb_bufsize
!= bufsize
) && bp
->nb_data
) {
879 kfree(bp
->nb_data
, bp
->nb_bufsize
);
881 bp
->nb_validoff
= bp
->nb_validend
= -1;
882 bp
->nb_dirtyoff
= bp
->nb_dirtyend
= 0;
885 CLR(bp
->nb_flags
, NB_CACHE
);
888 bp
->nb_data
= kalloc(bufsize
);
890 /* Ack! couldn't allocate the data buffer! */
891 /* clean up buffer and return error */
892 lck_mtx_lock(nfs_buf_mutex
);
893 LIST_REMOVE(bp
, nb_vnbufs
);
894 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
896 /* invalidate usage timestamp to allow immediate freeing */
897 NBUFSTAMPINVALIDATE(bp
);
898 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
)
899 panic("nfsbuf on freelist");
900 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
902 lck_mtx_unlock(nfs_buf_mutex
);
903 FSDBG_BOT(541, np
, blkno
, 0xb00, ENOMEM
);
906 bp
->nb_bufsize
= bufsize
;
912 * Set or clear NB_READ now to let the UPL subsystem know
913 * if we intend to modify the pages or not.
915 if (operation
== NBLK_READ
) {
916 SET(bp
->nb_flags
, NB_READ
);
918 CLR(bp
->nb_flags
, NB_READ
);
920 if (bufsize
< PAGE_SIZE
)
922 bp
->nb_bufsize
= bufsize
;
923 bp
->nb_validoff
= bp
->nb_validend
= -1;
925 if (UBCINFOEXISTS(vp
)) {
927 if (nfs_buf_upl_setup(bp
)) {
928 /* unable to create upl */
929 /* vm object must no longer exist */
930 /* clean up buffer and return error */
931 lck_mtx_lock(nfs_buf_mutex
);
932 LIST_REMOVE(bp
, nb_vnbufs
);
933 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
935 /* invalidate usage timestamp to allow immediate freeing */
936 NBUFSTAMPINVALIDATE(bp
);
937 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
)
938 panic("nfsbuf on freelist");
939 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
941 lck_mtx_unlock(nfs_buf_mutex
);
942 FSDBG_BOT(541, np
, blkno
, 0x2bc, EIO
);
945 nfs_buf_upl_check(bp
);
950 panic("nfs_buf_get: %d unknown operation", operation
);
955 FSDBG_BOT(541, np
, blkno
, bp
, bp
->nb_flags
);
961 nfs_buf_release(struct nfsbuf
*bp
, int freeup
)
963 nfsnode_t np
= bp
->nb_np
;
966 int wakeup_needbuffer
, wakeup_buffer
, wakeup_nbdwrite
;
968 FSDBG_TOP(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
969 FSDBG(548, bp
->nb_validoff
, bp
->nb_validend
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
);
970 FSDBG(548, bp
->nb_valid
, 0, bp
->nb_dirty
, 0);
972 vp
= np
? NFSTOV(np
) : NULL
;
973 if (vp
&& UBCINFOEXISTS(vp
) && bp
->nb_bufsize
) {
978 if (!ISSET(bp
->nb_flags
, NB_PAGELIST
) && !ISSET(bp
->nb_flags
, NB_INVAL
)) {
979 rv
= nfs_buf_upl_setup(bp
);
981 printf("nfs_buf_release: upl create failed %d\n", rv
);
983 nfs_buf_upl_check(bp
);
985 upl
= bp
->nb_pagelist
;
987 goto pagelist_cleanup_done
;
989 if (ubc_upl_unmap(upl
) != KERN_SUCCESS
)
990 panic("ubc_upl_unmap failed");
994 * Abort the pages on error or: if this is an invalid or
995 * non-needcommit nocache buffer AND no pages are dirty.
997 if (ISSET(bp
->nb_flags
, NB_ERROR
) || (!bp
->nb_dirty
&& (ISSET(bp
->nb_flags
, NB_INVAL
) ||
998 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
)))))) {
999 if (ISSET(bp
->nb_flags
, (NB_READ
| NB_INVAL
| NB_NOCACHE
)))
1000 upl_flags
= UPL_ABORT_DUMP_PAGES
;
1003 ubc_upl_abort(upl
, upl_flags
);
1004 goto pagelist_cleanup_done
;
1006 for (i
=0; i
<= (bp
->nb_bufsize
- 1)/PAGE_SIZE
; i
++) {
1007 if (!NBPGVALID(bp
,i
))
1008 ubc_upl_abort_range(upl
,
1009 i
*PAGE_SIZE
, PAGE_SIZE
,
1010 UPL_ABORT_DUMP_PAGES
|
1011 UPL_ABORT_FREE_ON_EMPTY
);
1013 if (NBPGDIRTY(bp
,i
))
1014 upl_flags
= UPL_COMMIT_SET_DIRTY
;
1016 upl_flags
= UPL_COMMIT_CLEAR_DIRTY
;
1017 ubc_upl_commit_range(upl
,
1018 i
*PAGE_SIZE
, PAGE_SIZE
,
1020 UPL_COMMIT_INACTIVATE
|
1021 UPL_COMMIT_FREE_ON_EMPTY
);
1024 pagelist_cleanup_done
:
1025 /* was this the last buffer in the file? */
1026 if (NBOFF(bp
) + bp
->nb_bufsize
> (off_t
)(np
->n_size
)) {
1027 /* if so, invalidate all pages of last buffer past EOF */
1029 start
= trunc_page_64(np
->n_size
) + PAGE_SIZE_64
;
1030 end
= trunc_page_64(NBOFF(bp
) + bp
->nb_bufsize
);
1032 if (!(rv
= ubc_sync_range(vp
, start
, end
, UBC_INVALIDATE
)))
1033 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1036 CLR(bp
->nb_flags
, NB_PAGELIST
);
1037 bp
->nb_pagelist
= NULL
;
1040 lck_mtx_lock(nfs_buf_mutex
);
1042 wakeup_needbuffer
= wakeup_buffer
= wakeup_nbdwrite
= 0;
1044 /* Wake up any processes waiting for any buffer to become free. */
1045 if (nfsneedbuffer
) {
1047 wakeup_needbuffer
= 1;
1049 /* Wake up any processes waiting for _this_ buffer to become free. */
1050 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1051 CLR(bp
->nb_lflags
, NBL_WANTED
);
1055 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1056 if (ISSET(bp
->nb_flags
, NB_ERROR
) ||
1057 (ISSET(bp
->nb_flags
, NB_NOCACHE
) && !ISSET(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_DELWRI
))))
1058 SET(bp
->nb_flags
, NB_INVAL
);
1060 if ((bp
->nb_bufsize
<= 0) || ISSET(bp
->nb_flags
, NB_INVAL
)) {
1061 /* If it's invalid or empty, dissociate it from its nfsnode */
1062 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
) {
1063 LIST_REMOVE(bp
, nb_vnbufs
);
1064 bp
->nb_vnbufs
.le_next
= NFSNOLIST
;
1067 /* if this was a delayed write, wakeup anyone */
1068 /* waiting for delayed writes to complete */
1069 if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1070 CLR(bp
->nb_flags
, NB_DELWRI
);
1073 wakeup_nbdwrite
= 1;
1075 /* invalidate usage timestamp to allow immediate freeing */
1076 NBUFSTAMPINVALIDATE(bp
);
1077 /* put buffer at head of free list */
1078 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
)
1079 panic("nfsbuf on freelist");
1080 SET(bp
->nb_flags
, NB_INVAL
);
1081 if (ISSET(bp
->nb_flags
, NB_META
)) {
1082 TAILQ_INSERT_HEAD(&nfsbuffreemeta
, bp
, nb_free
);
1083 nfsbuffreemetacnt
++;
1085 TAILQ_INSERT_HEAD(&nfsbuffree
, bp
, nb_free
);
1088 } else if (ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1089 /* put buffer at end of delwri list */
1090 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
)
1091 panic("nfsbuf on freelist");
1092 TAILQ_INSERT_TAIL(&nfsbufdelwri
, bp
, nb_free
);
1096 /* update usage timestamp */
1098 bp
->nb_timestamp
= now
.tv_sec
;
1099 /* put buffer at end of free list */
1100 if (bp
->nb_free
.tqe_next
!= NFSNOLIST
)
1101 panic("nfsbuf on freelist");
1102 if (ISSET(bp
->nb_flags
, NB_META
)) {
1103 TAILQ_INSERT_TAIL(&nfsbuffreemeta
, bp
, nb_free
);
1104 nfsbuffreemetacnt
++;
1106 TAILQ_INSERT_TAIL(&nfsbuffree
, bp
, nb_free
);
1113 /* Unlock the buffer. */
1114 CLR(bp
->nb_flags
, (NB_ASYNC
| NB_STABLE
));
1115 CLR(bp
->nb_lflags
, NBL_BUSY
);
1117 FSDBG_BOT(548, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_data
);
1119 lck_mtx_unlock(nfs_buf_mutex
);
1121 if (wakeup_needbuffer
)
1122 wakeup(&nfsneedbuffer
);
1125 if (wakeup_nbdwrite
)
1126 wakeup(&nfs_nbdwrite
);
1132 * Wait for operations on the buffer to complete.
1133 * When they do, extract and return the I/O's error value.
1136 nfs_buf_iowait(struct nfsbuf
*bp
)
1138 FSDBG_TOP(549, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1140 lck_mtx_lock(nfs_buf_mutex
);
1142 while (!ISSET(bp
->nb_flags
, NB_DONE
))
1143 msleep(bp
, nfs_buf_mutex
, PRIBIO
+ 1, "nfs_buf_iowait", NULL
);
1145 lck_mtx_unlock(nfs_buf_mutex
);
1147 FSDBG_BOT(549, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1149 /* check for interruption of I/O, then errors. */
1150 if (ISSET(bp
->nb_flags
, NB_EINTR
)) {
1151 CLR(bp
->nb_flags
, NB_EINTR
);
1153 } else if (ISSET(bp
->nb_flags
, NB_ERROR
))
1154 return (bp
->nb_error
? bp
->nb_error
: EIO
);
1159 * Mark I/O complete on a buffer.
1162 nfs_buf_iodone(struct nfsbuf
*bp
)
1165 FSDBG_TOP(550, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1167 if (ISSET(bp
->nb_flags
, NB_DONE
))
1168 panic("nfs_buf_iodone already");
1170 if (!ISSET(bp
->nb_flags
, NB_READ
)) {
1171 CLR(bp
->nb_flags
, NB_WRITEINPROG
);
1173 * vnode_writedone() takes care of waking up
1174 * any throttled write operations
1176 vnode_writedone(NFSTOV(bp
->nb_np
));
1178 if (ISSET(bp
->nb_flags
, NB_ASYNC
)) { /* if async, release it */
1179 SET(bp
->nb_flags
, NB_DONE
); /* note that it's done */
1180 nfs_buf_release(bp
, 1);
1181 } else { /* or just wakeup the buffer */
1182 lck_mtx_lock(nfs_buf_mutex
);
1183 SET(bp
->nb_flags
, NB_DONE
); /* note that it's done */
1184 CLR(bp
->nb_lflags
, NBL_WANTED
);
1185 lck_mtx_unlock(nfs_buf_mutex
);
1189 FSDBG_BOT(550, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1193 nfs_buf_write_delayed(struct nfsbuf
*bp
)
1195 nfsnode_t np
= bp
->nb_np
;
1197 FSDBG_TOP(551, bp
, NBOFF(bp
), bp
->nb_flags
, 0);
1198 FSDBG(551, bp
, bp
->nb_dirtyoff
, bp
->nb_dirtyend
, bp
->nb_dirty
);
1201 * If the block hasn't been seen before:
1202 * (1) Mark it as having been seen,
1203 * (2) Make sure it's on its node's correct block list,
1205 if (!ISSET(bp
->nb_flags
, NB_DELWRI
)) {
1206 SET(bp
->nb_flags
, NB_DELWRI
);
1207 /* move to dirty list */
1208 lck_mtx_lock(nfs_buf_mutex
);
1211 if (bp
->nb_vnbufs
.le_next
!= NFSNOLIST
)
1212 LIST_REMOVE(bp
, nb_vnbufs
);
1213 LIST_INSERT_HEAD(&np
->n_dirtyblkhd
, bp
, nb_vnbufs
);
1214 lck_mtx_unlock(nfs_buf_mutex
);
1218 * If the vnode has "too many" write operations in progress
1219 * wait for them to finish the IO
1221 vnode_waitforwrites(NFSTOV(np
), VNODE_ASYNC_THROTTLE
, 0, 0, "nfs_buf_write_delayed");
1223 /* the file is in a modified state, so make sure the flag's set */
1224 nfs_lock(np
, NFS_NODE_LOCK_FORCE
);
1225 np
->n_flag
|= NMODIFIED
;
1229 * If we have too many delayed write buffers,
1230 * just fall back to doing the async write.
1232 if (nfs_nbdwrite
< 0)
1233 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1234 if (nfs_nbdwrite
> NFS_A_LOT_OF_DELAYED_WRITES
) {
1235 /* issue async write */
1236 SET(bp
->nb_flags
, NB_ASYNC
);
1238 FSDBG_BOT(551, bp
, NBOFF(bp
), bp
->nb_flags
, bp
->nb_error
);
1242 /* Otherwise, the "write" is done, so mark and release the buffer. */
1243 SET(bp
->nb_flags
, NB_DONE
);
1244 nfs_buf_release(bp
, 1);
1245 FSDBG_BOT(551, bp
, NBOFF(bp
), bp
->nb_flags
, 0);
1250 * Check that a "needcommit" buffer can still be committed.
1251 * If the write verifier has changed, we need to clear the
1252 * the needcommit flag.
1255 nfs_buf_check_write_verifier(nfsnode_t np
, struct nfsbuf
*bp
)
1257 struct nfsmount
*nmp
;
1259 if (!ISSET(bp
->nb_flags
, NB_NEEDCOMMIT
))
1265 if (!ISSET(bp
->nb_flags
, NB_STALEWVERF
) && (bp
->nb_verf
== nmp
->nm_verf
))
1268 /* write verifier changed, clear commit/wverf flags */
1269 CLR(bp
->nb_flags
, (NB_NEEDCOMMIT
| NB_STALEWVERF
));
1271 nfs_lock(np
, NFS_NODE_LOCK_FORCE
);
1272 np
->n_needcommitcnt
--;
1273 CHECK_NEEDCOMMITCNT(np
);
1278 * add a reference to a buffer so it doesn't disappear while being used
1279 * (must be called with nfs_buf_mutex held)
1282 nfs_buf_refget(struct nfsbuf
*bp
)
1287 * release a reference on a buffer
1288 * (must be called with nfs_buf_mutex held)
1291 nfs_buf_refrele(struct nfsbuf
*bp
)
1297 * mark a particular buffer as BUSY
1298 * (must be called with nfs_buf_mutex held)
1301 nfs_buf_acquire(struct nfsbuf
*bp
, int flags
, int slpflag
, int slptimeo
)
1306 if (ISSET(bp
->nb_lflags
, NBL_BUSY
)) {
1308 * since the mutex_lock may block, the buffer
1309 * may become BUSY, so we need to recheck for
1312 if (flags
& NBAC_NOWAIT
)
1314 SET(bp
->nb_lflags
, NBL_WANTED
);
1316 ts
.tv_sec
= (slptimeo
/100);
1317 /* the hz value is 100; which leads to 10ms */
1318 ts
.tv_nsec
= (slptimeo
% 100) * 10 * NSEC_PER_USEC
* 1000;
1320 error
= msleep(bp
, nfs_buf_mutex
, slpflag
| (PRIBIO
+ 1),
1321 "nfs_buf_acquire", &ts
);
1326 if (flags
& NBAC_REMOVE
)
1327 nfs_buf_remfree(bp
);
1328 SET(bp
->nb_lflags
, NBL_BUSY
);
1334 * simply drop the BUSY status of a buffer
1335 * (must be called with nfs_buf_mutex held)
1338 nfs_buf_drop(struct nfsbuf
*bp
)
1340 int need_wakeup
= 0;
1342 if (!ISSET(bp
->nb_lflags
, NBL_BUSY
))
1343 panic("nfs_buf_drop: buffer not busy!");
1344 if (ISSET(bp
->nb_lflags
, NBL_WANTED
)) {
1345 /* delay the actual wakeup until after we clear NBL_BUSY */
1348 /* Unlock the buffer. */
1349 CLR(bp
->nb_lflags
, (NBL_BUSY
| NBL_WANTED
));
1356 * prepare for iterating over an nfsnode's buffer list
1357 * this lock protects the queue manipulation
1358 * (must be called with nfs_buf_mutex held)
1361 nfs_buf_iterprepare(nfsnode_t np
, struct nfsbuflists
*iterheadp
, int flags
)
1363 struct nfsbuflists
*listheadp
;
1365 if (flags
& NBI_DIRTY
)
1366 listheadp
= &np
->n_dirtyblkhd
;
1368 listheadp
= &np
->n_cleanblkhd
;
1370 if ((flags
& NBI_NOWAIT
) && (np
->n_bufiterflags
& NBI_ITER
)) {
1371 LIST_INIT(iterheadp
);
1372 return(EWOULDBLOCK
);
1375 while (np
->n_bufiterflags
& NBI_ITER
) {
1376 np
->n_bufiterflags
|= NBI_ITERWANT
;
1377 msleep(&np
->n_bufiterflags
, nfs_buf_mutex
, 0, "nfs_buf_iterprepare", NULL
);
1379 if (LIST_EMPTY(listheadp
)) {
1380 LIST_INIT(iterheadp
);
1383 np
->n_bufiterflags
|= NBI_ITER
;
1385 iterheadp
->lh_first
= listheadp
->lh_first
;
1386 listheadp
->lh_first
->nb_vnbufs
.le_prev
= &iterheadp
->lh_first
;
1387 LIST_INIT(listheadp
);
1393 * clean up after iterating over an nfsnode's buffer list
1394 * this lock protects the queue manipulation
1395 * (must be called with nfs_buf_mutex held)
1398 nfs_buf_itercomplete(nfsnode_t np
, struct nfsbuflists
*iterheadp
, int flags
)
1400 struct nfsbuflists
* listheadp
;
1403 if (flags
& NBI_DIRTY
)
1404 listheadp
= &np
->n_dirtyblkhd
;
1406 listheadp
= &np
->n_cleanblkhd
;
1408 while (!LIST_EMPTY(iterheadp
)) {
1409 bp
= LIST_FIRST(iterheadp
);
1410 LIST_REMOVE(bp
, nb_vnbufs
);
1411 LIST_INSERT_HEAD(listheadp
, bp
, nb_vnbufs
);
1414 np
->n_bufiterflags
&= ~NBI_ITER
;
1415 if (np
->n_bufiterflags
& NBI_ITERWANT
) {
1416 np
->n_bufiterflags
&= ~NBI_ITERWANT
;
1417 wakeup(&np
->n_bufiterflags
);
1423 * Read an NFS buffer for a file.
1426 nfs_buf_read(struct nfsbuf
*bp
)
1434 cred
= bp
->nb_rcred
;
1435 if (IS_VALID_CRED(cred
))
1436 kauth_cred_ref(cred
);
1437 thd
= ISSET(bp
->nb_flags
, NB_ASYNC
) ? NULL
: current_thread();
1440 if (!ISSET(bp
->nb_flags
, NB_READ
))
1441 panic("nfs_buf_read: !NB_READ");
1442 if (ISSET(bp
->nb_flags
, NB_DONE
))
1443 CLR(bp
->nb_flags
, NB_DONE
);
1447 OSAddAtomic(1, (SInt32
*)&nfsstats
.read_bios
);
1449 error
= nfs_buf_read_rpc(bp
, thd
, cred
);
1451 * For async I/O, the callbacks will finish up the
1452 * read. Otherwise, the read has already been finished.
1455 if (IS_VALID_CRED(cred
))
1456 kauth_cred_unref(&cred
);
1461 * finish the reading of a buffer
1464 nfs_buf_read_finish(struct nfsbuf
*bp
)
1466 nfsnode_t np
= bp
->nb_np
;
1467 struct nfsmount
*nmp
;
1469 if (!ISSET(bp
->nb_flags
, NB_ERROR
)) {
1470 /* update valid range */
1471 bp
->nb_validoff
= 0;
1472 bp
->nb_validend
= bp
->nb_endio
;
1473 if (bp
->nb_endio
< bp
->nb_bufsize
) {
1475 * The read may be short because we have unflushed writes
1476 * that are extending the file size and the reads hit the
1477 * (old) EOF on the server. So, just make sure nb_validend
1478 * correctly tracks EOF.
1479 * Note that the missing data should have already been zeroed
1480 * in nfs_buf_read_rpc_finish().
1482 off_t boff
= NBOFF(bp
);
1483 if ((off_t
)np
->n_size
>= (boff
+ bp
->nb_bufsize
))
1484 bp
->nb_validend
= bp
->nb_bufsize
;
1485 else if ((off_t
)np
->n_size
>= boff
)
1486 bp
->nb_validend
= np
->n_size
- boff
;
1488 bp
->nb_validend
= 0;
1490 if ((nmp
= NFSTONMP(np
)) && (nmp
->nm_vers
== NFS_VER2
) &&
1491 ((NBOFF(bp
) + bp
->nb_validend
) > 0x100000000LL
))
1492 bp
->nb_validend
= 0x100000000LL
- NBOFF(bp
);
1493 bp
->nb_valid
= (1 << (round_page_32(bp
->nb_validend
) / PAGE_SIZE
)) - 1;
1494 if (bp
->nb_validend
& PAGE_MASK
) {
1495 /* zero-fill remainder of last page */
1496 bzero(bp
->nb_data
+ bp
->nb_validend
, bp
->nb_bufsize
- bp
->nb_validend
);
1503 * initiate the NFS READ RPC(s) for a buffer
1506 nfs_buf_read_rpc(struct nfsbuf
*bp
, thread_t thd
, kauth_cred_t cred
)
1508 struct nfsmount
*nmp
;
1509 nfsnode_t np
= bp
->nb_np
;
1510 int error
= 0, nfsvers
, async
;
1511 int offset
, length
, nmrsize
, nrpcs
, len
;
1514 struct nfsreq_cbinfo cb
;
1518 bp
->nb_error
= error
= ENXIO
;
1519 SET(bp
->nb_flags
, NB_ERROR
);
1523 nfsvers
= nmp
->nm_vers
;
1524 nmrsize
= nmp
->nm_rsize
;
1528 length
= bp
->nb_bufsize
;
1530 if (nfsvers
== NFS_VER2
) {
1531 if (boff
> 0xffffffffLL
) {
1532 bp
->nb_error
= error
= EFBIG
;
1533 SET(bp
->nb_flags
, NB_ERROR
);
1537 if ((boff
+ length
- 1) > 0xffffffffLL
)
1538 length
= 0x100000000LL
- boff
;
1541 /* Note: Can only do async I/O if nfsiods are configured. */
1542 async
= (bp
->nb_flags
& NB_ASYNC
);
1543 cb
.rcb_func
= async
? nfs_buf_read_rpc_finish
: NULL
;
1546 bp
->nb_offio
= bp
->nb_endio
= 0;
1547 bp
->nb_rpcs
= nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1548 if (async
&& (nrpcs
> 1)) {
1549 SET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1551 CLR(bp
->nb_flags
, NB_MULTASYNCRPC
);
1554 while (length
> 0) {
1555 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1556 error
= bp
->nb_error
;
1559 len
= (length
> nmrsize
) ? nmrsize
: length
;
1560 cb
.rcb_args
[0] = offset
;
1561 cb
.rcb_args
[1] = len
;
1563 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, boff
+ offset
, len
, thd
, cred
, &cb
, &req
);
1570 nfs_buf_read_rpc_finish(req
);
1571 if (ISSET(bp
->nb_flags
, NB_ERROR
)) {
1572 error
= bp
->nb_error
;
1579 * Something bad happened while trying to send the RPC(s).
1580 * Wait for any outstanding requests to complete.
1582 bp
->nb_error
= error
;
1583 SET(bp
->nb_flags
, NB_ERROR
);
1584 if (ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
)) {
1585 nrpcs
= (length
+ nmrsize
- 1) / nmrsize
;
1586 lck_mtx_lock(nfs_buf_mutex
);
1587 bp
->nb_rpcs
-= nrpcs
;
1588 if (bp
->nb_rpcs
== 0) {
1589 /* No RPCs left, so the buffer's done */
1590 lck_mtx_unlock(nfs_buf_mutex
);
1593 /* wait for the last RPC to mark it done */
1594 while (bp
->nb_rpcs
> 0)
1595 msleep(&bp
->nb_rpcs
, nfs_buf_mutex
, 0,
1596 "nfs_buf_read_rpc_cancel", NULL
);
1597 lck_mtx_unlock(nfs_buf_mutex
);
1608 * finish up an NFS READ RPC on a buffer
1611 nfs_buf_read_rpc_finish(struct nfsreq
*req
)
1613 struct nfsmount
*nmp
;
1615 struct nfsreq_cbinfo cb
;
1617 int error
= 0, nfsvers
, offset
, length
, eof
= 0, multasyncrpc
, finished
;
1618 void *wakeme
= NULL
;
1619 struct nfsreq
*rreq
= NULL
;
1628 thd
= req
->r_thread
;
1630 if (IS_VALID_CRED(cred
))
1631 kauth_cred_ref(cred
);
1632 cb
= req
->r_callback
;
1637 SET(bp
->nb_flags
, NB_ERROR
);
1638 bp
->nb_error
= error
= ENXIO
;
1640 if (error
|| ISSET(bp
->nb_flags
, NB_ERROR
)) {
1642 nfs_request_async_cancel(req
);
1646 nfsvers
= nmp
->nm_vers
;
1647 offset
= cb
.rcb_args
[0];
1648 rlen
= length
= cb
.rcb_args
[1];
1650 uio
.uio_iovs
.iov32p
= &io
;
1652 uio
.uio_rw
= UIO_READ
;
1653 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
1654 uio
.uio_segflg
= UIO_SYSSPACE
;
1656 uio
.uio_segflg
= UIO_SYSSPACE32
;
1658 io
.iov_len
= length
;
1659 uio_uio_resid_set(&uio
, io
.iov_len
);
1660 uio
.uio_offset
= NBOFF(bp
) + offset
;
1661 io
.iov_base
= (uintptr_t) bp
->nb_data
+ offset
;
1663 /* finish the RPC */
1664 error
= nmp
->nm_funcs
->nf_read_rpc_async_finish(np
, req
, &uio
, &rlen
, &eof
);
1665 if ((error
== EINPROGRESS
) && cb
.rcb_func
) {
1666 /* async request restarted */
1667 if (IS_VALID_CRED(cred
))
1668 kauth_cred_unref(&cred
);
1673 SET(bp
->nb_flags
, NB_ERROR
);
1674 bp
->nb_error
= error
;
1678 if ((rlen
> 0) && (bp
->nb_endio
< (offset
+ (int)rlen
)))
1679 bp
->nb_endio
= offset
+ rlen
;
1681 if ((nfsvers
== NFS_VER2
) || eof
|| (rlen
== 0)) {
1682 /* zero out the remaining data (up to EOF) */
1683 off_t rpcrem
, eofrem
, rem
;
1684 rpcrem
= (length
- rlen
);
1685 eofrem
= np
->n_size
- (NBOFF(bp
) + offset
+ rlen
);
1686 rem
= (rpcrem
< eofrem
) ? rpcrem
: eofrem
;
1688 bzero(bp
->nb_data
+ offset
+ rlen
, rem
);
1689 } else if (((int)rlen
< length
) && !ISSET(bp
->nb_flags
, NB_ERROR
)) {
1693 * We haven't hit EOF and we didn't get all the data
1694 * requested, so we need to issue another read for the rest.
1695 * (Don't bother if the buffer already hit an error.)
1699 cb
.rcb_args
[0] = offset
;
1700 cb
.rcb_args
[1] = length
;
1701 error
= nmp
->nm_funcs
->nf_read_rpc_async(np
, offset
, length
, thd
, cred
, &cb
, &rreq
);
1703 if (IS_VALID_CRED(cred
))
1704 kauth_cred_unref(&cred
);
1706 /* if !async we'll need to wait for this RPC to finish */
1712 * Outstanding RPC count is unchanged.
1713 * Callback will be called when RPC is done.
1717 SET(bp
->nb_flags
, NB_ERROR
);
1718 bp
->nb_error
= error
;
1722 if (IS_VALID_CRED(cred
))
1723 kauth_cred_unref(&cred
);
1726 * Decrement outstanding RPC count on buffer
1727 * and call nfs_buf_read_finish on last RPC.
1729 * (Note: when there are multiple async RPCs issued for a
1730 * buffer we need nfs_buffer_mutex to avoid problems when
1731 * aborting a partially-initiated set of RPCs)
1734 multasyncrpc
= ISSET(bp
->nb_flags
, NB_MULTASYNCRPC
);
1736 lck_mtx_lock(nfs_buf_mutex
);
1739 finished
= (bp
->nb_rpcs
== 0);
1742 lck_mtx_unlock(nfs_buf_mutex
);
1746 wakeme
= &bp
->nb_rpcs
;
1747 nfs_buf_read_finish(bp
);
1754 * Do buffer readahead.
1755 * Initiate async I/O to read buffers not in cache.
1758 nfs_buf_readahead(nfsnode_t np
, int ioflag
, daddr64_t
*rabnp
, daddr64_t lastrabn
, thread_t thd
, kauth_cred_t cred
)
1760 struct nfsmount
*nmp
= NFSTONMP(np
);
1766 if (nmp
->nm_readahead
<= 0)
1768 if (*rabnp
> lastrabn
)
1771 for (nra
= 0; (nra
< nmp
->nm_readahead
) && (*rabnp
<= lastrabn
); nra
++, *rabnp
= *rabnp
+ 1) {
1772 /* check if block exists and is valid. */
1773 error
= nfs_buf_get(np
, *rabnp
, nmp
->nm_biosize
, thd
, NBLK_READ
|NBLK_NOWAIT
, &bp
);
1778 if ((ioflag
& IO_NOCACHE
) && ISSET(bp
->nb_flags
, NB_CACHE
) &&
1779 !bp
->nb_dirty
&& !ISSET(bp
->nb_flags
, (NB_DELWRI
|NB_NCRDAHEAD
))) {
1780 CLR(bp
->nb_flags
, NB_CACHE
);
1782 bp
->nb_validoff
= bp
->nb_validend
= -1;
1784 if ((bp
->nb_dirtyend
<= 0) && !bp
->nb_dirty
&&
1785 !ISSET(bp
->nb_flags
, (NB_CACHE
|NB_DELWRI
))) {
1786 SET(bp
->nb_flags
, (NB_READ
|NB_ASYNC
));
1787 if (ioflag
& IO_NOCACHE
)
1788 SET(bp
->nb_flags
, NB_NCRDAHEAD
);
1789 if (!IS_VALID_CRED(bp
->nb_rcred
) && IS_VALID_CRED(cred
)) {
1790 kauth_cred_ref(cred
);
1791 bp
->nb_rcred
= cred
;
1793 if ((error
= nfs_buf_read(bp
)))
1797 nfs_buf_release(bp
, 1);
1803 * NFS buffer I/O for reading files/directories.
1806 nfs_bioread(nfsnode_t np
, struct uio
*uio
, int ioflag
, int *eofflag
, vfs_context_t ctx
)
1808 vnode_t vp
= NFSTOV(np
);
1809 struct nfsbuf
*bp
= NULL
;
1810 struct nfs_vattr nvattr
;
1811 struct nfsmount
*nmp
= VTONMP(vp
);
1812 daddr64_t lbn
, rabn
= 0, lastrabn
, maxrabn
= -1, tlbn
;
1814 int error
= 0, n
= 0, on
= 0;
1815 int nfsvers
, biosize
;
1817 struct dirent
*direntp
= NULL
;
1822 FSDBG_TOP(514, np
, uio
->uio_offset
, uio_uio_resid(uio
), ioflag
);
1824 if (uio_uio_resid(uio
) == 0) {
1825 FSDBG_BOT(514, np
, 0xd1e0001, 0, 0);
1828 if (uio
->uio_offset
< 0) {
1829 FSDBG_BOT(514, np
, 0xd1e0002, 0, EINVAL
);
1833 nfsvers
= nmp
->nm_vers
;
1834 biosize
= nmp
->nm_biosize
;
1835 thd
= vfs_context_thread(ctx
);
1836 cred
= vfs_context_ucred(ctx
);
1838 vtype
= vnode_vtype(vp
);
1839 if ((vtype
!= VREG
) && (vtype
!= VDIR
)) {
1840 printf("nfs_bioread: type %x unexpected\n", vtype
);
1841 FSDBG_BOT(514, np
, 0xd1e0016, 0, EINVAL
);
1846 * For nfs, cache consistency can only be maintained approximately.
1847 * Although RFC1094 does not specify the criteria, the following is
1848 * believed to be compatible with the reference port.
1850 * If the file's modify time on the server has changed since the
1851 * last read rpc or you have written to the file,
1852 * you may have lost data cache consistency with the
1853 * server, so flush all of the file's data out of the cache.
1854 * Then force a getattr rpc to ensure that you have up to date
1856 * NB: This implies that cache data can be read when up to
1857 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1858 * current attributes this could be forced by calling
1859 * NATTRINVALIDATE() before the nfs_getattr() call.
1862 if (ISSET(np
->n_flag
, NUPDATESIZE
))
1863 nfs_data_update_size(np
, 0);
1865 if ((error
= nfs_lock(np
, NFS_NODE_LOCK_EXCLUSIVE
))) {
1866 FSDBG_BOT(514, np
, 0xd1e0222, 0, error
);
1870 if (np
->n_flag
& NNEEDINVALIDATE
) {
1871 np
->n_flag
&= ~NNEEDINVALIDATE
;
1873 nfs_vinvalbuf(vp
, V_SAVE
|V_IGNORE_WRITEERR
, ctx
, 1);
1874 if ((error
= nfs_lock(np
, NFS_NODE_LOCK_EXCLUSIVE
))) {
1875 FSDBG_BOT(514, np
, 0xd1e0322, 0, error
);
1880 if (np
->n_flag
& NMODIFIED
) {
1881 if (vtype
== VDIR
) {
1884 error
= nfs_vinvalbuf(vp
, V_SAVE
, ctx
, 1);
1886 error
= nfs_lock(np
, NFS_NODE_LOCK_EXCLUSIVE
);
1888 FSDBG_BOT(514, np
, 0xd1e0003, 0, error
);
1892 NATTRINVALIDATE(np
);
1893 error
= nfs_getattr(np
, &nvattr
, ctx
, 1);
1896 FSDBG_BOT(514, np
, 0xd1e0004, 0, error
);
1899 if (vtype
== VDIR
) {
1900 /* if directory changed, purge any name cache entries */
1901 if (NFS_CHANGED_NC(nfsvers
, np
, &nvattr
))
1903 NFS_CHANGED_UPDATE_NC(nfsvers
, np
, &nvattr
);
1905 NFS_CHANGED_UPDATE(nfsvers
, np
, &nvattr
);
1907 error
= nfs_getattr(np
, &nvattr
, ctx
, 1);
1910 FSDBG_BOT(514, np
, 0xd1e0005, 0, error
);
1913 if (NFS_CHANGED(nfsvers
, np
, &nvattr
)) {
1914 if (vtype
== VDIR
) {
1916 /* purge name cache entries */
1917 if (NFS_CHANGED_NC(nfsvers
, np
, &nvattr
))
1921 error
= nfs_vinvalbuf(vp
, V_SAVE
, ctx
, 1);
1923 error
= nfs_lock(np
, NFS_NODE_LOCK_EXCLUSIVE
);
1925 FSDBG_BOT(514, np
, 0xd1e0006, 0, error
);
1929 NFS_CHANGED_UPDATE_NC(nfsvers
, np
, &nvattr
);
1930 NFS_CHANGED_UPDATE(nfsvers
, np
, &nvattr
);
1936 if (vtype
== VREG
) {
1937 if ((ioflag
& IO_NOCACHE
) && (uio_uio_resid(uio
) < (2*biosize
))) {
1938 /* We have only a block or so to read, just do the rpc directly. */
1939 error
= nfs_read_rpc(np
, uio
, ctx
);
1940 FSDBG_BOT(514, np
, uio
->uio_offset
, uio_uio_resid(uio
), error
);
1944 * set up readahead - which may be limited by:
1945 * + current request length (for IO_NOCACHE)
1946 * + readahead setting
1949 if (nmp
->nm_readahead
> 0) {
1950 off_t end
= uio
->uio_offset
+ uio_uio_resid(uio
);
1951 if (end
> (off_t
)np
->n_size
)
1953 rabn
= uio
->uio_offset
/ biosize
;
1954 maxrabn
= (end
- 1) / biosize
;
1955 if (!(ioflag
& IO_NOCACHE
) &&
1956 (!rabn
|| (rabn
== np
->n_lastread
) || (rabn
== (np
->n_lastread
+1)))) {
1957 maxrabn
+= nmp
->nm_readahead
;
1958 if ((maxrabn
* biosize
) >= (off_t
)np
->n_size
)
1959 maxrabn
= ((off_t
)np
->n_size
- 1)/biosize
;
1968 if (vtype
== VREG
) {
1969 nfs_data_lock(np
, NFS_NODE_LOCK_SHARED
);
1970 lbn
= uio
->uio_offset
/ biosize
;
1973 * Copy directly from any cached pages without grabbing the bufs.
1975 * Note: for "nocache" reads, we don't copy directly from UBC
1976 * because any cached pages will be for readahead buffers that
1977 * need to be invalidated anyway before we finish this request.
1979 if (!(ioflag
& IO_NOCACHE
) &&
1980 (uio
->uio_segflg
== UIO_USERSPACE32
||
1981 uio
->uio_segflg
== UIO_USERSPACE64
||
1982 uio
->uio_segflg
== UIO_USERSPACE
)) {
1983 // LP64todo - fix this!
1984 int io_resid
= uio_uio_resid(uio
);
1985 diff
= np
->n_size
- uio
->uio_offset
;
1986 if (diff
< io_resid
)
1989 error
= cluster_copy_ubc_data(vp
, uio
, &io_resid
, 0);
1991 nfs_data_unlock(np
);
1992 FSDBG_BOT(514, np
, uio
->uio_offset
, 0xcacefeed, error
);
1996 /* count any biocache reads that we just copied directly */
1997 if (lbn
!= (uio
->uio_offset
/ biosize
)) {
1998 OSAddAtomic((uio
->uio_offset
/ biosize
) - lbn
, (SInt32
*)&nfsstats
.biocache_reads
);
1999 FSDBG(514, np
, 0xcacefeed, uio
->uio_offset
, error
);
2003 lbn
= uio
->uio_offset
/ biosize
;
2004 on
= uio
->uio_offset
% biosize
;
2005 np
->n_lastread
= (uio
->uio_offset
- 1) / biosize
;
2007 /* adjust readahead block number, if necessary */
2010 lastrabn
= MIN(maxrabn
, lbn
+ nmp
->nm_readahead
);
2011 if (rabn
<= lastrabn
) { /* start readaheads */
2012 error
= nfs_buf_readahead(np
, ioflag
, &rabn
, lastrabn
, thd
, cred
);
2014 nfs_data_unlock(np
);
2015 FSDBG_BOT(514, np
, 0xd1e000b, 1, error
);
2020 if ((uio_uio_resid(uio
) <= 0) || (uio
->uio_offset
>= (off_t
)np
->n_size
)) {
2021 nfs_data_unlock(np
);
2022 FSDBG_BOT(514, np
, uio
->uio_offset
, uio_uio_resid(uio
), 0xaaaaaaaa);
2026 OSAddAtomic(1, (SInt32
*)&nfsstats
.biocache_reads
);
2029 * If the block is in the cache and has the required data
2030 * in a valid region, just copy it out.
2031 * Otherwise, get the block and write back/read in,
2035 // LP64todo - fix this!
2036 n
= min((unsigned)(biosize
- on
), uio_uio_resid(uio
));
2037 diff
= np
->n_size
- uio
->uio_offset
;
2041 error
= nfs_buf_get(np
, lbn
, biosize
, thd
, NBLK_READ
, &bp
);
2043 nfs_data_unlock(np
);
2044 FSDBG_BOT(514, np
, 0xd1e000c, 0, error
);
2048 if ((ioflag
& IO_NOCACHE
) && ISSET(bp
->nb_flags
, NB_CACHE
)) {
2050 * IO_NOCACHE found a cached buffer.
2051 * Flush the buffer if it's dirty.
2052 * Invalidate the data if it wasn't just read
2053 * in as part of a "nocache readahead".
2055 if (bp
->nb_dirty
|| (bp
->nb_dirtyend
> 0)) {
2056 /* so write the buffer out and try again */
2057 SET(bp
->nb_flags
, NB_NOCACHE
);
2060 if (!ISSET(bp
->nb_flags
, NB_NCRDAHEAD
)) {
2061 CLR(bp
->nb_flags
, NB_CACHE
);
2064 CLR(bp
->nb_flags
, NB_NCRDAHEAD
);
2068 /* if any pages are valid... */
2070 /* ...check for any invalid pages in the read range */
2071 int pg
, firstpg
, lastpg
, dirtypg
;
2072 dirtypg
= firstpg
= lastpg
= -1;
2074 while (pg
<= (on
+ n
- 1)/PAGE_SIZE
) {
2075 if (!NBPGVALID(bp
,pg
)) {
2079 } else if (firstpg
>= 0 && dirtypg
< 0 && NBPGDIRTY(bp
,pg
))
2084 /* if there are no invalid pages, we're all set */
2086 if (bp
->nb_validoff
< 0) {
2087 /* valid range isn't set up, so */
2088 /* set it to what we know is valid */
2089 bp
->nb_validoff
= trunc_page(on
);
2090 bp
->nb_validend
= round_page(on
+n
);
2091 nfs_buf_normalize_valid_range(np
, bp
);
2096 /* there are invalid pages in the read range */
2097 if (((dirtypg
> firstpg
) && (dirtypg
< lastpg
)) ||
2098 (((firstpg
*PAGE_SIZE
) < bp
->nb_dirtyend
) && (((lastpg
+1)*PAGE_SIZE
) > bp
->nb_dirtyoff
))) {
2099 /* there are also dirty page(s) (or range) in the read range, */
2100 /* so write the buffer out and try again */
2102 CLR(bp
->nb_flags
, (NB_DONE
| NB_ERROR
| NB_INVAL
));
2103 SET(bp
->nb_flags
, NB_ASYNC
);
2104 if (!IS_VALID_CRED(bp
->nb_wcred
)) {
2105 kauth_cred_ref(cred
);
2106 bp
->nb_wcred
= cred
;
2108 error
= nfs_buf_write(bp
);
2110 nfs_data_unlock(np
);
2111 FSDBG_BOT(514, np
, 0xd1e000d, 0, error
);
2116 if (!bp
->nb_dirty
&& bp
->nb_dirtyend
<= 0 &&
2117 (lastpg
- firstpg
+ 1) > (biosize
/PAGE_SIZE
)/2) {
2118 /* we need to read in more than half the buffer and the */
2119 /* buffer's not dirty, so just fetch the whole buffer */
2122 /* read the page range in */
2124 char uio_buf
[ UIO_SIZEOF(1) ];
2127 auio
= uio_createwithbuffer(1, (NBOFF(bp
) + firstpg
* PAGE_SIZE_64
),
2128 UIO_SYSSPACE
, UIO_READ
, &uio_buf
[0], sizeof(uio_buf
));
2132 uio_addiov(auio
, CAST_USER_ADDR_T((bp
->nb_data
+ firstpg
* PAGE_SIZE
)),
2133 ((lastpg
- firstpg
+ 1) * PAGE_SIZE
));
2134 error
= nfs_read_rpc(np
, auio
, ctx
);
2137 if (ioflag
& IO_NOCACHE
)
2138 SET(bp
->nb_flags
, NB_NOCACHE
);
2139 nfs_buf_release(bp
, 1);
2140 nfs_data_unlock(np
);
2141 FSDBG_BOT(514, np
, 0xd1e000e, 0, error
);
2144 /* Make sure that the valid range is set to cover this read. */
2145 bp
->nb_validoff
= trunc_page_32(on
);
2146 bp
->nb_validend
= round_page_32(on
+n
);
2147 nfs_buf_normalize_valid_range(np
, bp
);
2148 if (uio_resid(auio
) > 0) {
2149 /* if short read, must have hit EOF, */
2150 /* so zero the rest of the range */
2151 bzero(CAST_DOWN(caddr_t
, uio_curriovbase(auio
)), uio_resid(auio
));
2153 /* mark the pages (successfully read) as valid */
2154 for (pg
=firstpg
; pg
<= lastpg
; pg
++)
2155 NBPGVALID_SET(bp
,pg
);
2158 /* if no pages are valid, read the whole block */
2159 if (!bp
->nb_valid
) {
2160 if (!IS_VALID_CRED(bp
->nb_rcred
) && IS_VALID_CRED(cred
)) {
2161 kauth_cred_ref(cred
);
2162 bp
->nb_rcred
= cred
;
2164 SET(bp
->nb_flags
, NB_READ
);
2165 CLR(bp
->nb_flags
, (NB_DONE
| NB_ERROR
| NB_INVAL
));
2166 error
= nfs_buf_read(bp
);
2168 nfs_data_unlock(np
);
2169 nfs_buf_release(bp
, 1);
2170 FSDBG_BOT(514, np
, 0xd1e000f, 0, error
);
2175 /* validate read range against valid range and clip */
2176 if (bp
->nb_validend
> 0) {
2177 diff
= (on
>= bp
->nb_validend
) ? 0 : (bp
->nb_validend
- on
);
    } else if (vtype == VDIR) {
        OSAddAtomic(1, (SInt32 *)&nfsstats.biocache_readdirs);
        error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
        if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) {
            FSDBG_BOT(514, np, 0xde0f0001, 0, 0);
        lbn = uio->uio_offset / NFS_DIRBLKSIZ;
        on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
        error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
            FSDBG_BOT(514, np, 0xd1e0012, 0, error);
        if (!ISSET(bp->nb_flags, NB_CACHE)) {
            SET(bp->nb_flags, NB_READ);
            error = nfs_buf_readdir(bp, ctx);
                nfs_buf_release(bp, 1);
                while (error == NFSERR_BAD_COOKIE) {
                    error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
                    error = nfs_vinvalbuf(vp, 0, ctx, 1);
                    /*
                     * Yuck! The directory has been modified on the
                     * server. The only way to get the block is by
                     * reading from the beginning to get all the
                     */
                    for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
                        if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED)))
                        if (np->n_direofoffset
                            && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
                            FSDBG_BOT(514, np, 0xde0f0002, 0, 0);
                        error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
                            FSDBG_BOT(514, np, 0xd1e0013, 0, error);
                        if (!ISSET(bp->nb_flags, NB_CACHE)) {
                            SET(bp->nb_flags, NB_READ);
                            error = nfs_buf_readdir(bp, ctx);
                            /*
                             * no error + NB_INVAL == directory EOF,
                             */
                            if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) {
                        /*
                         * An error will throw away the block and the
                         * for loop will break out.  If no error and this
                         * is not the block we want, we throw away the
                         * block and go for the next one via the for loop.
                         */
                        if (error || (tlbn < lbn))
                            nfs_buf_release(bp, 1);
                    /*
                     * The above while is repeated if we hit another cookie
                     * error.  If we hit an error and it wasn't a cookie error,
                     */
                    FSDBG_BOT(514, np, 0xd1e0014, 0, error);
        /*
         * Make sure we use a signed variant of min() since
         * the second term may be negative.
         */
        // LP64todo - fix this!
        n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
        /*
         * We keep track of the directory eof in
         * np->n_direofoffset and chop it off as an
         * extra step right here.
         */
        if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) {
            FSDBG_BOT(514, np, 0xd1e0115, 0, error);
        if (np->n_direofoffset &&
            n > np->n_direofoffset - uio->uio_offset)
            n = np->n_direofoffset - uio->uio_offset;
        /*
         * Make sure that we return an integral number of entries so
         * that any subsequent calls will start copying from the start
         * of the next entry.
         *
         * If the current value of n has the last entry cut short,
         * set n to copy everything up to the last entry instead.
         */
        dp = bp->nb_data + on;
        while (dp < (bp->nb_data + on + n)) {
            direntp = (struct dirent *)dp;
            dp += direntp->d_reclen;
        if (dp > (bp->nb_data + on + n))
            n = (dp - direntp->d_reclen) - (bp->nb_data + on);
        error = uiomove(bp->nb_data + on, (int)n, uio);
        if (vtype == VREG) {
            if (ioflag & IO_NOCACHE)
                SET(bp->nb_flags, NB_NOCACHE);
            nfs_buf_release(bp, 1);
            nfs_data_unlock(np);
            np->n_lastread = (uio->uio_offset - 1) / biosize;
        } else {
            nfs_buf_release(bp, 1);
    } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
    FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
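/*
 * Note on the directory branch above: directory data is cached in fixed-size
 * NFS_DIRBLKSIZ blocks, so lbn is the block index (uio_offset / NFS_DIRBLKSIZ)
 * and `on' is the offset within that block (uio_offset & (NFS_DIRBLKSIZ - 1));
 * the mask form works because NFS_DIRBLKSIZ is a power of two. The dirent scan
 * before uiomove() then trims n so that only whole directory entries are
 * copied out, e.g. if the last entry would be cut short, n is shrunk to end
 * just before it.
 */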
/*
 * limit the number of outstanding async I/O writes
 */
nfs_async_write_start(struct nfsmount *nmp)
    int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
    struct timespec ts = {1, 0};

    if (nfs_max_async_writes <= 0)
    lck_mtx_lock(&nmp->nm_lock);
    while (!error && (nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
        if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
        error = msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts);
        if (error == EWOULDBLOCK)
    nmp->nm_asyncwrites++;
    lck_mtx_unlock(&nmp->nm_lock);

nfs_async_write_done(struct nfsmount *nmp)
    if (nmp->nm_asyncwrites <= 0)
    lck_mtx_lock(&nmp->nm_lock);
    if (nmp->nm_asyncwrites-- >= nfs_max_async_writes)
        wakeup(&nmp->nm_asyncwrites);
    lck_mtx_unlock(&nmp->nm_lock);
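/*
 * Note on the throttle above: nfs_async_write_start() sleeps (in one-second
 * intervals) while the mount already has nfs_max_async_writes async writes in
 * flight, and nfs_async_write_done() wakes a waiter once the count drops back
 * under the limit. A zero or negative nfs_max_async_writes effectively
 * disables the gate, and a timed-out sleep appears to simply re-check the
 * limit rather than fail the write.
 */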
/*
 * write (or commit) the given NFS buffer
 *
 * Commit the buffer if we can.
 * Write out any dirty range.
 * If any dirty pages remain, write them out.
 *
 * For async requests, all the work beyond sending the initial
 * write RPC is handled in the RPC callback(s).
 */
nfs_buf_write(struct nfsbuf *bp)
    int error = 0, oldflags, async;
    proc_t p = current_proc();
    int iomode, doff, dend, firstpg, lastpg;

    FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);

    if (!ISSET(bp->nb_lflags, NBL_BUSY))
        panic("nfs_buf_write: buffer is not busy???");

    async = ISSET(bp->nb_flags, NB_ASYNC);
    oldflags = bp->nb_flags;

    CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
    if (ISSET(oldflags, NB_DELWRI)) {
        lck_mtx_lock(nfs_buf_mutex);
        lck_mtx_unlock(nfs_buf_mutex);
        wakeup(&nfs_nbdwrite);

    /* move to clean list */
    if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) {
        lck_mtx_lock(nfs_buf_mutex);
        if (bp->nb_vnbufs.le_next != NFSNOLIST)
            LIST_REMOVE(bp, nb_vnbufs);
        LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
        lck_mtx_unlock(nfs_buf_mutex);
    vnode_startwrite(NFSTOV(np));

    if (p && p->p_stats)
        OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock);

    cred = bp->nb_wcred;
    if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
        cred = bp->nb_rcred;  /* shouldn't really happen, but... */
    if (IS_VALID_CRED(cred))
        kauth_cred_ref(cred);
    thd = async ? NULL : current_thread();

    /* We need to make sure the pages are locked before doing I/O. */
    if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
        if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
            error = nfs_buf_upl_setup(bp);
                printf("nfs_buf_write: upl create failed %d\n", error);
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error = EIO;
            nfs_buf_upl_check(bp);

    /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
    if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
        nfs_buf_check_write_verifier(np, bp);
    if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
        struct nfsmount *nmp = NFSTONMP(np);
            SET(bp->nb_flags, NB_ERROR);
            bp->nb_error = error = EIO;
        SET(bp->nb_flags, NB_WRITEINPROG);
        error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
                bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
        CLR(bp->nb_flags, NB_WRITEINPROG);
            if (error != NFSERR_STALEWRITEVERF) {
                SET(bp->nb_flags, NB_ERROR);
                bp->nb_error = error;
        bp->nb_dirtyoff = bp->nb_dirtyend = 0;
        CLR(bp->nb_flags, NB_NEEDCOMMIT);
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);

    if (!error && (bp->nb_dirtyend > 0)) {
        /* sanity check the dirty range */
        if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
            bp->nb_dirtyend = np->n_size - NBOFF(bp);
            if (bp->nb_dirtyoff >= bp->nb_dirtyend)
                bp->nb_dirtyoff = bp->nb_dirtyend = 0;
    if (!error && (bp->nb_dirtyend > 0)) {
        /* there's a dirty range that needs to be written out */
        doff = bp->nb_dirtyoff;
        dend = bp->nb_dirtyend;

        /* if doff page is dirty, move doff to start of page */
        if (NBPGDIRTY(bp, doff / PAGE_SIZE))
            doff -= doff & PAGE_MASK;
        /* try to expand write range to include preceding dirty pages */
        if (!(doff & PAGE_MASK))
            while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
        /* if dend page is dirty, move dend to start of next page */
        if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
            dend = round_page_32(dend);
        /* try to expand write range to include trailing dirty pages */
        if (!(dend & PAGE_MASK))
            while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
        /* make sure to keep dend clipped to EOF */
        if ((NBOFF(bp) + dend) > (off_t) np->n_size)
            dend = np->n_size - NBOFF(bp);
        /* calculate range of complete pages being written */
        firstpg = round_page_32(doff) / PAGE_SIZE;
        lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
        /* calculate mask for that page range */
        pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
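        /*
         * Worked example of the page-mask computation above (assuming 4KB
         * pages): a dirty range rounding to doff = 0x2000 and dend = 0x5000
         * gives firstpg = 2 and lastpg = 4, so
         * pagemask = ((1 << 5) - 1) & ~((1 << 2) - 1) = 0x1f & ~0x3 = 0x1c,
         * i.e. exactly the bits for the pages fully covered by the write.
         */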
        /*
         * compare page mask to nb_dirty; if there are other dirty pages
         * then write FILESYNC; otherwise, write UNSTABLE if async and
         * not needcommit/stable; otherwise write FILESYNC
         */
        if (bp->nb_dirty & ~pagemask)
            iomode = NFS_WRITE_FILESYNC;
        else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
            iomode = NFS_WRITE_UNSTABLE;
        else
            iomode = NFS_WRITE_FILESYNC;
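        /*
         * Note on the choice above: UNSTABLE lets the server acknowledge the
         * write before the data reaches stable storage, so it is only used
         * when this is an async write covering every dirty page in the buffer
         * and no stable/needcommit constraint applies; durability then comes
         * from the later COMMIT tracked via NB_NEEDCOMMIT. All other cases
         * fall back to FILESYNC.
         */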
        /* write the whole contiguous dirty range */
        bp->nb_offio = doff;
        bp->nb_endio = dend;

        OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);

        SET(bp->nb_flags, NB_WRITEINPROG);
        error = nfs_buf_write_rpc(bp, iomode, thd, cred);
        /*
         * For async I/O, the callbacks will finish up the
         * write and push out any dirty pages.  Otherwise,
         * the write has already been finished and any dirty
         */
        if (!error && bp->nb_dirty) /* write out any dirty pages */
            error = nfs_buf_write_dirty_pages(bp, thd, cred);

    /* note: bp is still valid only for !async case */
        error = nfs_buf_iowait(bp);
        /* move to clean list */
        if (oldflags & NB_DELWRI) {
            lck_mtx_lock(nfs_buf_mutex);
            if (bp->nb_vnbufs.le_next != NFSNOLIST)
                LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
            lck_mtx_unlock(nfs_buf_mutex);
        FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
        nfs_buf_release(bp, 1);
        /* check if we need to invalidate (and we can) */
        if ((np->n_flag & NNEEDINVALIDATE) &&
            !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
            nfs_lock(np, NFS_NODE_LOCK_FORCE);
            if (np->n_flag & NNEEDINVALIDATE) {
                np->n_flag &= ~NNEEDINVALIDATE;
                /*
                 * There was a write error and we need to
                 * invalidate attrs and flush buffers in
                 * order to sync up with the server.
                 * (if this write was extending the file,
                 * we may no longer know the correct size)
                 *
                 * But we couldn't call vinvalbuf while holding
                 * the buffer busy.  So we call vinvalbuf() after
                 * releasing the buffer.
                 */
                nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);

    if (IS_VALID_CRED(cred))
        kauth_cred_unref(&cred);
/*
 * finish the writing of a buffer
 */
nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
    nfsnode_t np = bp->nb_np;
    int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
    int firstpg, lastpg;

    if ((error == EINTR) || (error == ERESTART)) {
        CLR(bp->nb_flags, NB_ERROR);
        SET(bp->nb_flags, NB_EINTR);

        /* calculate range of complete pages being written */
        firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
        lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
        /* calculate mask for that page range written */
        pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
        /* clear dirty bits for pages we've written */
        bp->nb_dirty &= ~pagemask;

    /* manage needcommit state */
    if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
        if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
            nfs_lock(np, NFS_NODE_LOCK_FORCE);
            np->n_needcommitcnt++;
            SET(bp->nb_flags, NB_NEEDCOMMIT);
        /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
        bp->nb_dirtyoff = bp->nb_offio;
        bp->nb_dirtyend = bp->nb_endio;
    } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);
        CLR(bp->nb_flags, NB_NEEDCOMMIT);

    CLR(bp->nb_flags, NB_WRITEINPROG);

    /*
     * For an unstable write, the buffer is still treated as dirty until
     * a commit (or stable (re)write) is performed.  Buffers needing only
     * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
     *
     * If the write was interrupted we set NB_EINTR.  Don't set NB_ERROR
     * because that would cause the buffer to be dropped.  The buffer is
     * still valid and simply needs to be written again.
     */
    if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
        CLR(bp->nb_flags, NB_INVAL);
        if (!ISSET(bp->nb_flags, NB_DELWRI)) {
            SET(bp->nb_flags, NB_DELWRI);
            lck_mtx_lock(nfs_buf_mutex);
            lck_mtx_unlock(nfs_buf_mutex);
        /*
         * Since for the NB_ASYNC case, we've reassigned the buffer to the
         * clean list, we have to reassign it back to the dirty one. Ugh.
         */
        if (ISSET(bp->nb_flags, NB_ASYNC)) {
            /* move to dirty list */
            lck_mtx_lock(nfs_buf_mutex);
            if (bp->nb_vnbufs.le_next != NFSNOLIST)
                LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            lck_mtx_unlock(nfs_buf_mutex);
        /* either there's an error or we don't need to commit */
            /*
             * There was a write error and we need to invalidate
             * attrs and flush buffers in order to sync up with the
             * server.  (if this write was extending the file, we
             * may no longer know the correct size)
             *
             * But we can't call vinvalbuf while holding this
             * buffer busy.  Set a flag to do it after releasing
             */
            nfs_lock(np, NFS_NODE_LOCK_FORCE);
            np->n_error = error;
            np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
            NATTRINVALIDATE(np);
        /* clear the dirty range */
        bp->nb_dirtyoff = bp->nb_dirtyend = 0;

    if (!error && bp->nb_dirty)
        nfs_buf_write_dirty_pages(bp, thd, cred);
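/*
 * Note on the flag handling above: after a successful UNSTABLE write the
 * buffer keeps NB_DELWRI and gains NB_NEEDCOMMIT, with nb_dirtyoff/nb_dirtyend
 * narrowed to the range actually sent, so a later COMMIT (or a FILESYNC
 * rewrite) can make that range durable. An interrupted write is kept dirty
 * via NB_EINTR/NB_DELWRI rather than marked NB_ERROR, so it simply gets
 * written again.
 */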
/*
 * write out any pages marked dirty in a buffer
 *
 * We do use unstable writes and follow up with a commit.
 * If we catch the write verifier changing we'll restart and
 * do the writes filesync.
 */
nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
    nfsnode_t np = bp->nb_np;
    struct nfsmount *nmp = NFSTONMP(np);
    int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
    uint32_t dirty = bp->nb_dirty;

    /* there are pages marked dirty that need to be written out */
    OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
    SET(bp->nb_flags, NB_WRITEINPROG);
    npages = bp->nb_bufsize / PAGE_SIZE;
    iomode = NFS_WRITE_UNSTABLE;

    uio.uio_iovs.iov32p = &io;
    uio.uio_rw = UIO_WRITE;
#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
    uio.uio_segflg = UIO_SYSSPACE;
#else
    uio.uio_segflg = UIO_SYSSPACE32;

    dirty = bp->nb_dirty;
    wverf = bp->nb_verf;
    commit = NFS_WRITE_FILESYNC;
    for (pg = 0; pg < npages; pg++) {
        if (!NBPGDIRTY(bp, pg))
        while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
        /* write count pages starting with page pg */
        off = pg * PAGE_SIZE;
        len = count * PAGE_SIZE;
        /* clip writes to EOF */
        if (NBOFF(bp) + off + len > (off_t) np->n_size)
            len -= (NBOFF(bp) + off + len) - np->n_size;
        uio_uio_resid_set(&uio, io.iov_len);
        uio.uio_offset = NBOFF(bp) + off;
        io.iov_base = (uintptr_t) bp->nb_data + off;
        error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf);
        if (iomode2 < commit) /* Retain the lowest commitment level returned. */
        if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
            /* verifier changed, redo all the writes filesync */
            iomode = NFS_WRITE_FILESYNC;
        /* clear dirty bits */
            dirty &= ~(1 << pg);
        if (count) /* leave pg on last page */
    CLR(bp->nb_flags, NB_WRITEINPROG);

    if (!error && (commit != NFS_WRITE_FILESYNC)) {
        error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
        if (error == NFSERR_STALEWRITEVERF) {
            /* verifier changed, so we need to restart all the writes */
            iomode = NFS_WRITE_FILESYNC;

    bp->nb_dirty = dirty;
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;
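/*
 * Note on the verifier handling above: the write verifier is the server's
 * token showing it has not rebooted since the unstable writes were accepted.
 * If it changes between the page writes and the commit (a mismatch with the
 * wverf snapshot, or NFSERR_STALEWRITEVERF from the COMMIT), the unstable
 * data may have been lost, so the page writes are redone with FILESYNC.
 */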
/*
 * initiate the NFS WRITE RPC(s) for a buffer
 */
nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
    struct nfsmount *nmp;
    nfsnode_t np = bp->nb_np;
    int error = 0, nfsvers, async;
    int offset, length, nmwsize, nrpcs, len;
    struct nfsreq_cbinfo cb;

        bp->nb_error = error = ENXIO;
        SET(bp->nb_flags, NB_ERROR);
    nfsvers = nmp->nm_vers;
    nmwsize = nmp->nm_wsize;

    offset = bp->nb_offio;
    length = bp->nb_endio - bp->nb_offio;

    /* Note: Can only do async I/O if nfsiods are configured. */
    async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
    bp->nb_commitlevel = NFS_WRITE_FILESYNC;
    cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;

    if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
        bp->nb_error = error = EFBIG;
        SET(bp->nb_flags, NB_ERROR);

    uio.uio_iovs.iov32p = &io;
    uio.uio_rw = UIO_WRITE;
#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
    uio.uio_segflg = UIO_SYSSPACE;
#else
    uio.uio_segflg = UIO_SYSSPACE32;
    io.iov_len = length;
    uio_uio_resid_set(&uio, io.iov_len);
    uio.uio_offset = NBOFF(bp) + offset;
    io.iov_base = (uintptr_t) bp->nb_data + offset;

    bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
    if (async && (nrpcs > 1)) {
        SET(bp->nb_flags, NB_MULTASYNCRPC);
    } else {
        CLR(bp->nb_flags, NB_MULTASYNCRPC);

    while (length > 0) {
        if (ISSET(bp->nb_flags, NB_ERROR)) {
            error = bp->nb_error;
        len = (length > nmwsize) ? nmwsize : length;
        cb.rcb_args[0] = offset;
        cb.rcb_args[1] = len;
        if (async && ((error = nfs_async_write_start(nmp))))
        error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred,
            nfs_async_write_done(nmp);
        nfs_buf_write_rpc_finish(req);

        /*
         * Something bad happened while trying to send the RPCs.
         * Wait for any outstanding requests to complete.
         */
        bp->nb_error = error;
        SET(bp->nb_flags, NB_ERROR);
        if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
            nrpcs = (length + nmwsize - 1) / nmwsize;
            lck_mtx_lock(nfs_buf_mutex);
            bp->nb_rpcs -= nrpcs;
            if (bp->nb_rpcs == 0) {
                /* No RPCs left, so the buffer's done */
                lck_mtx_unlock(nfs_buf_mutex);
                nfs_buf_write_finish(bp, thd, cred);
            } else {
                /* wait for the last RPC to mark it done */
                while (bp->nb_rpcs > 0)
                    msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
                        "nfs_buf_write_rpc_cancel", NULL);
                lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_write_finish(bp, thd, cred);
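/*
 * Worked example for the RPC split above (illustrative values): with a dirty
 * range of length 32768 and an nm_wsize of 8192, nrpcs = (32768 + 8191) / 8192
 * = 4, so four WRITE RPCs are issued for the buffer; when async, the
 * NB_MULTASYNCRPC flag tells the completion path that several RPCs share the
 * nb_rpcs accounting.
 */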
/*
 * finish up an NFS WRITE RPC on a buffer
 */
nfs_buf_write_rpc_finish(struct nfsreq *req)
    int error = 0, nfsvers, offset, length, multasyncrpc, finished;
    int committed = NFS_WRITE_FILESYNC;
    void *wakeme = NULL;
    struct nfsreq_cbinfo cb;
    struct nfsreq *wreq = NULL;
    struct nfsmount *nmp;

    thd = req->r_thread;
    if (IS_VALID_CRED(cred))
        kauth_cred_ref(cred);
    cb = req->r_callback;

        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error = ENXIO;
    if (error || ISSET(bp->nb_flags, NB_ERROR)) {
        nfs_request_async_cancel(req);
    nfsvers = nmp->nm_vers;

    offset = cb.rcb_args[0];
    rlen = length = cb.rcb_args[1];

    /* finish the RPC */
    error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
    if ((error == EINPROGRESS) && cb.rcb_func) {
        /* async request restarted */
        if (IS_VALID_CRED(cred))
            kauth_cred_unref(&cred);
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;
    if (error || (nfsvers == NFS_VER2))
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error = EIO;

    /* save lowest commit level returned */
    if (committed < bp->nb_commitlevel)
        bp->nb_commitlevel = committed;

    /* check the write verifier */
        bp->nb_verf = wverf;
    } else if (bp->nb_verf != wverf) {
        /* verifier changed, so buffer will need to be rewritten */
        bp->nb_flags |= NB_STALEWVERF;
        bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
        bp->nb_verf = wverf;

    /*
     * check for a short write
     *
     * If the server didn't write all the data, then we
     * need to issue another write for the rest of it.
     * (Don't bother if the buffer hit an error or stale wverf.)
     */
    if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
        uio.uio_iovs.iov32p = &io;
        uio.uio_rw = UIO_WRITE;
#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
        uio.uio_segflg = UIO_SYSSPACE;
#else
        uio.uio_segflg = UIO_SYSSPACE32;
        io.iov_len = length;
        uio_uio_resid_set(&uio, io.iov_len);
        uio.uio_offset = NBOFF(bp) + offset;
        io.iov_base = (uintptr_t) bp->nb_data + offset;

        cb.rcb_args[0] = offset;
        cb.rcb_args[1] = length;

        error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred,
                NFS_WRITE_FILESYNC, &cb, &wreq);
            if (IS_VALID_CRED(cred))
                kauth_cred_unref(&cred);
            /* if !async we'll need to wait for this RPC to finish */
            /*
             * Outstanding RPC count is unchanged.
             * Callback will be called when RPC is done.
             */
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;

    nfs_async_write_done(nmp);
    /*
     * Decrement outstanding RPC count on buffer
     * and call nfs_buf_write_finish on last RPC.
     *
     * (Note: when there are multiple async RPCs issued for a
     * buffer we need nfs_buf_mutex to avoid problems when
     * aborting a partially-initiated set of RPCs)
     */
    multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
        lck_mtx_lock(nfs_buf_mutex);
    finished = (bp->nb_rpcs == 0);
        lck_mtx_unlock(nfs_buf_mutex);
            wakeme = &bp->nb_rpcs;
        nfs_buf_write_finish(bp, thd, cred);

    if (IS_VALID_CRED(cred))
        kauth_cred_unref(&cred);
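/*
 * Note on the short-write path above: if the server wrote only rlen of the
 * requested length, the remainder is reissued as a FILESYNC write using the
 * same callback info, so the buffer's outstanding-RPC count is unchanged and
 * completion still funnels through nfs_buf_write_finish().
 */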
/*
 * Send commit(s) for the given node's "needcommit" buffers
 */
nfs_flushcommits(nfsnode_t np, int nowait)
    struct nfsmount *nmp;
    struct nfsbuflists blist, commitlist;
    int error = 0, retv, wcred_set, flags, dirty;
    u_quad_t off, endoff, toff;
    kauth_cred_t wcred = NULL;

    FSDBG_TOP(557, np, 0, 0, 0);

    /*
     * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
     * server, but has not been committed to stable storage on the server
     * yet. The byte range is worked out for as many nfsbufs as we can handle
     * and the commit rpc is done.
     */
    if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
        np->n_flag |= NMODIFIED;

    LIST_INIT(&commitlist);

    if (nmp->nm_vers == NFS_VER2) {
        flags |= NBI_NOWAIT;
    lck_mtx_lock(nfs_buf_mutex);
    if (!nfs_buf_iterprepare(np, &blist, flags)) {
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
            if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
                nfs_buf_check_write_verifier(np, bp);
            if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT))
                != (NB_DELWRI | NB_NEEDCOMMIT))) {
            nfs_buf_remfree(bp);
            lck_mtx_unlock(nfs_buf_mutex);
            /*
             * we need a upl to see if the page has been
             * dirtied (think mmap) since the unstable write, and
             * also to prevent vm from paging it during our commit rpc
             */
            if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                retv = nfs_buf_upl_setup(bp);
                    /* unable to create upl */
                    /* vm object must no longer exist */
                    /* this could be fatal if we need */
                    /* to write the data again, we'll see... */
                    printf("nfs_flushcommits: upl create failed %d\n", retv);
                    bp->nb_valid = bp->nb_dirty = 0;
                nfs_buf_upl_check(bp);
            lck_mtx_lock(nfs_buf_mutex);

            FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
            FSDBG(557, bp->nb_validoff, bp->nb_validend,
                bp->nb_dirtyoff, bp->nb_dirtyend);

            /*
             * We used to check for dirty pages here; if there were any
             * we'd abort the commit and force the entire buffer to be
             *
             * Instead of doing that, we now go ahead and commit the dirty
             * range, and then leave the buffer around with dirty pages
             * that will be written out later.
             */

            /*
             * Work out if all buffers are using the same cred
             * so we can deal with them all with one commit.
             *
             * Note: creds in bp's must be obtained by kauth_cred_ref
             * on the same original cred in order for them to be equal.
             */
            if (wcred_set == 0) {
                wcred = bp->nb_wcred;
                if (!IS_VALID_CRED(wcred))
                    panic("nfs: needcommit w/out wcred");
            } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
            SET(bp->nb_flags, NB_WRITEINPROG);

            /*
             * A list of these buffers is kept so that the
             * second loop knows which buffers have actually
             * been committed. This is necessary, since there
             * may be a race between the commit rpc and new
             * uncommitted writes on the file.
             */
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
            toff = NBOFF(bp) + bp->nb_dirtyoff;
            toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
        nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
    lck_mtx_unlock(nfs_buf_mutex);

    if (LIST_EMPTY(&commitlist)) {

    /*
     * Commit data on the server, as required.
     * If all bufs are using the same wcred, then use that with
     * one call for all of them, otherwise commit each one
     */
    if (wcred_set == 1) {
        /*
         * Note, it's possible the commit range could be >2^32-1.
         * If it is, we'll send one commit that covers the whole file.
         */
        if ((endoff - off) > 0xffffffff)
        count = (endoff - off);
        retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred);
        LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
            toff = NBOFF(bp) + bp->nb_dirtyoff;
            count = bp->nb_dirtyend - bp->nb_dirtyoff;
            retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred);

    /*
     * Now, either mark the blocks I/O done or mark the
     * blocks dirty, depending on whether the commit
     */
    while ((bp = LIST_FIRST(&commitlist))) {
        LIST_REMOVE(bp, nb_vnbufs);
        FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
        np->n_needcommitcnt--;
        CHECK_NEEDCOMMITCNT(np);

            /* move back to dirty list */
            lck_mtx_lock(nfs_buf_mutex);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_release(bp, 1);

        vnode_startwrite(NFSTOV(np));
        if (ISSET(bp->nb_flags, NB_DELWRI)) {
            lck_mtx_lock(nfs_buf_mutex);
            lck_mtx_unlock(nfs_buf_mutex);
            wakeup(&nfs_nbdwrite);
        CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
        /* if block still has dirty pages, we don't want it to */
        /* be released in nfs_buf_iodone().  So, don't set NB_ASYNC. */
        if (!(dirty = bp->nb_dirty))
            SET(bp->nb_flags, NB_ASYNC);
        else
            CLR(bp->nb_flags, NB_ASYNC);

        /* move to clean list */
        lck_mtx_lock(nfs_buf_mutex);
        LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
        lck_mtx_unlock(nfs_buf_mutex);

        bp->nb_dirtyoff = bp->nb_dirtyend = 0;

            /* throw it back in as a delayed write buffer */
            CLR(bp->nb_flags, NB_DONE);
            nfs_buf_write_delayed(bp);

    FSDBG_BOT(557, np, 0, 0, error);
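/*
 * Note on the commit strategy above: when every needcommit buffer was written
 * under the same credential (wcred_set == 1), a single COMMIT covering the
 * coalesced [off, endoff) range is sent; otherwise each buffer gets its own
 * COMMIT for its dirty range. As the comment above notes, a range wider than
 * 32 bits is handled by committing the whole file instead.
 */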
/*
 * Flush all the blocks associated with a vnode.
 * Walk through the buffer pool and push any dirty pages
 * associated with the vnode.
 */
nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
    struct nfsbuflists blist;
    struct nfsmount *nmp = NFSTONMP(np);
    int error = 0, error2, slptimeo = 0, slpflag = 0;
    int nfsvers, flags, passone = 1;

    FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);

    nfsvers = nmp->nm_vers;
    if (nmp->nm_flag & NFSMNT_INT)

    if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        np->n_flag |= NMODIFIED;

    lck_mtx_lock(nfs_buf_mutex);
    while (np->n_bflag & NBFLUSHINPROG) {
        np->n_bflag |= NBFLUSHWANT;
        error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
            lck_mtx_unlock(nfs_buf_mutex);
    np->n_bflag |= NBFLUSHINPROG;

    /*
     * On the first pass, start async/unstable writes on all
     * delayed write buffers.  Then wait for all writes to complete
     * and call nfs_flushcommits() to commit any uncommitted buffers.
     * On all subsequent passes, start STABLE writes on any remaining
     * dirty buffers.  Then wait for all writes to complete.
     */

    FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
    if (!NFSTONMP(np)) {
        lck_mtx_unlock(nfs_buf_mutex);

    /* Start/do any write(s) that are required. */
    if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0;
            if (flags != NBAC_NOWAIT)
            while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
                FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
                error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
                    if (flags != NBAC_NOWAIT)
                        nfs_buf_refrele(bp);
                    nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
                    lck_mtx_unlock(nfs_buf_mutex);
                if (slpflag == PCATCH) {
            if (flags != NBAC_NOWAIT)
                nfs_buf_refrele(bp);
                /* buffer is no longer valid */
            if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
                nfs_buf_check_write_verifier(np, bp);
            if (!ISSET(bp->nb_flags, NB_DELWRI)) {
                /* buffer is no longer dirty */
            FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
            if ((passone || (waitfor != MNT_WAIT)) &&
                ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
            nfs_buf_remfree(bp);
            lck_mtx_unlock(nfs_buf_mutex);
            if (ISSET(bp->nb_flags, NB_ERROR)) {
                nfs_lock(np, NFS_NODE_LOCK_FORCE);
                np->n_error = bp->nb_error ? bp->nb_error : EIO;
                np->n_flag |= NWRITEERR;
                nfs_buf_release(bp, 1);
                lck_mtx_lock(nfs_buf_mutex);
            SET(bp->nb_flags, NB_ASYNC);
                /* NB_STABLE forces this to be written FILESYNC */
                SET(bp->nb_flags, NB_STABLE);
            lck_mtx_lock(nfs_buf_mutex);
        nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
    lck_mtx_unlock(nfs_buf_mutex);

    if (waitfor == MNT_WAIT) {
        while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
            error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
            if (slpflag == PCATCH) {

    if (nfsvers != NFS_VER2) {
        /* loop while it looks like there are still buffers to be */
        /* committed and nfs_flushcommits() seems to be handling them. */
        while (np->n_needcommitcnt)
            if (nfs_flushcommits(np, 0))

    if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        np->n_flag |= NMODIFIED;
        lck_mtx_lock(nfs_buf_mutex);

    if (waitfor == MNT_WAIT) {
        if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
            nfs_lock(np, NFS_NODE_LOCK_FORCE);
            np->n_flag |= NMODIFIED;
            lck_mtx_lock(nfs_buf_mutex);
        if (!LIST_EMPTY(&np->n_dirtyblkhd))
        lck_mtx_unlock(nfs_buf_mutex);
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        /* if we have no dirty blocks, we can clear the modified flag */
            np->n_flag &= ~NMODIFIED;
        nfs_lock(np, NFS_NODE_LOCK_FORCE);

    FSDBG(526, np->n_flag, np->n_error, 0, 0);
    if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
        error = np->n_error;
        np->n_flag &= ~NWRITEERR;

    lck_mtx_lock(nfs_buf_mutex);
    flags = np->n_bflag;
    np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT);
    lck_mtx_unlock(nfs_buf_mutex);
    if (flags & NBFLUSHWANT)
        wakeup(&np->n_bflag);

    FSDBG_BOT(517, np, error, ignore_writeerr, 0);
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
nfs_vinvalbuf_internal(
    struct nfsbuflists blist;
    int list, error = 0;

    if (flags & V_SAVE) {
        if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR))))

    lck_mtx_lock(nfs_buf_mutex);
        if (nfs_buf_iterprepare(np, &blist, list)) {
            if (nfs_buf_iterprepare(np, &blist, list))
        while ((bp = LIST_FIRST(&blist))) {
            LIST_REMOVE(bp, nb_vnbufs);
            if (list == NBI_CLEAN)
                LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
            else
                LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
            while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
                FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
                if (error != EAGAIN) {
                    FSDBG(554, np, bp, -1, error);
                    nfs_buf_refrele(bp);
                    nfs_buf_itercomplete(np, &blist, list);
                    lck_mtx_unlock(nfs_buf_mutex);
            nfs_buf_refrele(bp);
            FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
            lck_mtx_unlock(nfs_buf_mutex);
            if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
                (NBOFF(bp) < (off_t)np->n_size)) {
                /* extra paranoia: make sure we're not */
                /* somehow leaving any dirty data around */
                int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
                    ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
                if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
                    error = nfs_buf_upl_setup(bp);
                    if (error == EINVAL) {
                        /* vm object must no longer exist */
                        /* hopefully we don't need to do */
                        /* anything for this buffer */
                        printf("nfs_vinvalbuf: upl setup failed %d\n", error);
                        bp->nb_valid = bp->nb_dirty = 0;
                    nfs_buf_upl_check(bp);
                /* check for any dirty data before the EOF */
                if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
                    /* clip dirty range to EOF */
                    if (bp->nb_dirtyend > end) {
                        bp->nb_dirtyend = end;
                        if (bp->nb_dirtyoff >= bp->nb_dirtyend)
                            bp->nb_dirtyoff = bp->nb_dirtyend = 0;
                    if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end))
                bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
                /* also make sure we'll have a credential to do the write */
                if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
                    printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
                FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
                if (!ISSET(bp->nb_flags, NB_PAGELIST))
                    panic("nfs_vinvalbuf: dirty buffer without upl");
                /* gotta write out dirty data before invalidating */
                /* (NB_STABLE indicates that data writes should be FILESYNC) */
                /* (NB_NOCACHE indicates buffer should be discarded) */
                CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
                SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
                if (!IS_VALID_CRED(bp->nb_wcred)) {
                    kauth_cred_ref(cred);
                    bp->nb_wcred = cred;
                error = nfs_buf_write(bp);
                // Note: bp has been released
                    FSDBG(554, bp, 0xd00dee, 0xbad, error);
                    nfs_lock(np, NFS_NODE_LOCK_FORCE);
                    np->n_error = error;
                    np->n_flag |= NWRITEERR;
                    /*
                     * There was a write error and we need to
                     * invalidate attrs to sync with server.
                     * (if this write was extending the file,
                     * we may no longer know the correct size)
                     */
                    NATTRINVALIDATE(np);
                lck_mtx_lock(nfs_buf_mutex);
            SET(bp->nb_flags, NB_INVAL);
            // hold off on FREEUPs until we're done here
            nfs_buf_release(bp, 0);
            lck_mtx_lock(nfs_buf_mutex);
        nfs_buf_itercomplete(np, &blist, list);
    if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
        panic("nfs_vinvalbuf: flush/inval failed");
    lck_mtx_unlock(nfs_buf_mutex);
    if (!(flags & V_SAVE)) {
        nfs_lock(np, NFS_NODE_LOCK_FORCE);
        np->n_flag &= ~NMODIFIED;
/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
    return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);

nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
    nfsnode_t np = VTONFS(vp);
    struct nfsmount *nmp = VTONMP(vp);
    int error, rv, slpflag, slptimeo, nflags;

    FSDBG_TOP(554, np, flags, intrflg, 0);

    if (nmp && !(nmp->nm_flag & NFSMNT_INT))

    /* First wait for any other process doing a flush to complete. */
    lck_mtx_lock(nfs_buf_mutex);
    while (np->n_bflag & NBINVALINPROG) {
        np->n_bflag |= NBINVALWANT;
        error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
            lck_mtx_unlock(nfs_buf_mutex);
    np->n_bflag |= NBINVALINPROG;
    lck_mtx_unlock(nfs_buf_mutex);

    /* Now, flush as required. */
    error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
        FSDBG(554, np, 0, 0, error);
        if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
        error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);

    /* get the pages out of vm also */
    if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
        if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
            panic("nfs_vinvalbuf(): ubc_sync_range failed!");

    lck_mtx_lock(nfs_buf_mutex);
    nflags = np->n_bflag;
    np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
    lck_mtx_unlock(nfs_buf_mutex);
    if (nflags & NBINVALWANT)
        wakeup(&np->n_bflag);

    FSDBG_BOT(554, np, flags, intrflg, error);
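/*
 * Note on the sequence above: NBINVALINPROG serializes invalidations per
 * node, the internal flush appears to be retried unless nfs_sigintr() reports
 * a pending signal, and the final ubc_sync_range() push/invalidate keeps the
 * VM pager's view of the file consistent with the now-empty buffer cache.
 */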
/*
 * Add an async I/O request to the mount's async I/O queue and make
 * sure that an nfsiod will service it.
 */
nfs_asyncio_finish(struct nfsreq *req)
    struct nfsmount *nmp;
    struct nfsiod *niod;

    FSDBG_TOP(552, nmp, 0, 0, 0);
    if (((nmp = req->r_nmp)) == NULL)
    lck_mtx_lock(nfsiod_mutex);
    niod = nmp->nm_niod;

    /* grab an nfsiod if we don't have one already */
        niod = TAILQ_FIRST(&nfsiodfree);
            TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
            TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
            niod->niod_nmp = nmp;
        } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
            /*
             * Try starting a new thread.
             * We may try a couple times if other callers
             * get the new threads before we do.
             */
            lck_mtx_unlock(nfsiod_mutex);
            if (!nfsiod_start())
            lck_mtx_lock(nfsiod_mutex);

    if (req->r_achain.tqe_next == NFSREQNOLIST)
        TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);

    /* If this mount doesn't already have an nfsiod working on it... */
    if (!nmp->nm_niod) {
        if (niod) { /* give it the nfsiod we just grabbed */
            nmp->nm_niod = niod;
            lck_mtx_unlock(nfsiod_mutex);
        } else if (nfsiod_thread_count > 0) {
            /* just queue it up on nfsiod mounts queue */
            TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
            lck_mtx_unlock(nfsiod_mutex);
            printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
            lck_mtx_unlock(nfsiod_mutex);
            /* we have no other option but to be persistent */
        lck_mtx_unlock(nfsiod_mutex);

    FSDBG_BOT(552, nmp, 0, 0, 0);
/*
 * queue up async I/O request for resend
 */
nfs_asyncio_resend(struct nfsreq *req)
    struct nfsmount *nmp = req->r_nmp;

    nfs_gss_clnt_rpcdone(req);
    lck_mtx_lock(&nmp->nm_lock);
    if (req->r_rchain.tqe_next == NFSREQNOLIST) {
        TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
        req->r_flags |= R_RESENDQ;
    nfs_mount_sock_thread_wake(nmp);
    lck_mtx_unlock(&nmp->nm_lock);
/*
 * Read an NFS buffer for a directory.
 */
nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
    struct nfsmount *nmp;
    int error = 0, nfsvers;

    nfsvers = nmp->nm_vers;
    uio.uio_iovs.iov32p = &io;
#if 1   /* LP64todo - can't use new segment flags until the drivers are ready */
    uio.uio_segflg = UIO_SYSSPACE;
#else
    uio.uio_segflg = UIO_SYSSPACE32;

    if (ISSET(bp->nb_flags, NB_DONE))
        CLR(bp->nb_flags, NB_DONE);

    uio.uio_rw = UIO_READ;
    io.iov_len = bp->nb_bufsize;
    uio_uio_resid_set(&uio, io.iov_len);
    io.iov_base = (uintptr_t) bp->nb_data;
    uio.uio_offset = NBOFF(bp);

    OSAddAtomic(1, (SInt32 *)&nfsstats.readdir_bios);
    if (nfsvers < NFS_VER4) {
        if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
            error = nfs3_readdirplus_rpc(np, &uio, ctx);
            if (error == NFSERR_NOTSUPP) {
                lck_mtx_lock(&nmp->nm_lock);
                nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
                lck_mtx_unlock(&nmp->nm_lock);
        if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
            error = nfs3_readdir_rpc(np, &uio, ctx);
    } else {
        error = nfs4_readdir_rpc(np, &uio, ctx);
        SET(bp->nb_flags, NB_ERROR);
        bp->nb_error = error;
        bp->nb_validoff = 0;
        bp->nb_validend = uio.uio_offset - NBOFF(bp);
        bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
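/*
 * Worked example of the nb_valid computation above (assuming 4KB pages): if
 * the readdir RPCs advanced uio.uio_offset 0x2800 bytes past NBOFF(bp), then
 * nb_validend = 0x2800, round_page_32(0x2800)/PAGE_SIZE = 3, and
 * nb_valid = (1 << 3) - 1 = 0x7, i.e. the first three pages are marked valid.
 */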