/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */

#include <nfs/nfs_conf.h>
#if CONFIG_NFS_CLIENT

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kpi_mbuf.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <os/refcnt.h>

#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)

kern_return_t thread_terminate(thread_t); /* XXX */

#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP	6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP	3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP	(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC	4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC	2

#define NFS_BUF_FREEUP() \
	do { \
	/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		    (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)
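
/*
 * Illustrative arithmetic for the NFS_BUF_FREEUP() trigger (example
 * numbers, not from the original source): with nfsbufcnt = 1024 and
 * LRU_FREEUP_MIN_FRAC = 4, nfs_buf_freeup() is only worth calling once
 * more than 1024/4 = 256 buffers sit on the regular free list (or more
 * than 1024/2 = 512 on the meta free list), and only if freeing
 * TOTAL_TO_FREEUP (6+3 = 9) buffers would still leave more than
 * nfsbufmin (128) allocated.
 */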

/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	    nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmin = 128;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;
	nfsneedbuffer = 0;
	nfs_nbdwrite = 0;

	nfsbufhashtbl = hashinit(nfsbufmax / 4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);
}

/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
	    NFSBUF_FREE_PERIOD * 1000);
}

/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp) {
			break;
		}
		if (os_ref_get_count(&fbp->nb_refs) > 1) {
			break;
		}
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
			break;
		}
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
	NFSBUFCNTCHK();

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred)) {
			kauth_cred_unref(&fbp->nb_rcred);
		}
		if (IS_VALID_CRED(fbp->nb_wcred)) {
			kauth_cred_unref(&fbp->nb_wcred);
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
			kfree(fbp->nb_data, fbp->nb_bufsize);
		}
		FREE(fbp, M_TEMP);
	}
}

/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST) {
		panic("nfsbuf not on free list");
	}
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
	NFSBUFCNTCHK();
}

/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno)) {
		rv = TRUE;
	} else {
		rv = FALSE;
	}
	lck_mtx_unlock(nfs_buf_mutex);
	return rv;
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next) {
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return bp;
			}
		}
	}
	return NULL;
}

/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (nfs_mount_gone(nmp)) {
		return ENXIO;
	}

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp) {
		goto out;
	}
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if ((bp->nb_dirtyend > start) &&
		    (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
			/*
			 * Before returning the bad news, move the
			 * buffer to the start of the delwri list and
			 * give the list a push to try to flush the
			 * buffer out.
			 */
			error = EBUSY;
			nfs_buf_remfree(bp);
			TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_delwri_push(1);
		}
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return error;
}
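
/*
 * Worked example of the intersection test above (illustrative numbers,
 * not from the original source): for a buffer at file offset 0 with
 * nb_dirtyoff = 100 and nb_dirtyend = 5000, a pageout of the 4K page at
 * offset 4096 gives start = 4096; since 5000 > 4096 and 100 < 8192, the
 * page overlaps the dirty range and the pager gets EBUSY.
 */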

/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST)) {
		return 0;
	}

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
	    &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return EINVAL;
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return EIO;
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return 0;
}

/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return;
	}

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize) {
		SET(bp->nb_flags, NB_CACHE);
	} else {
		CLR(bp->nb_flags, NB_CACHE);
	}

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize) {
			break;
		}
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp, i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize) {
			bp->nb_validend = filesize - fileoffset;
		}
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}

/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data) {
		return 0;
	}
	if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
		return EINVAL;
	}

	kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS) {
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	}
	if (bp->nb_data == 0) {
		panic("ubc_upl_map mapped 0");
	}
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return 0;
}

/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	int pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff / PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp, pg)) {
		pg--;
	}
	bp->nb_validoff = (pg + 1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize / PAGE_SIZE;
	pg = bp->nb_validend / PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp, pg)) {
		pg++;
	}
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip to EOF */
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
		bp->nb_validend = np->n_size % bp->nb_bufsize;
	}
}
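
/*
 * Worked example (illustrative, not from the original source): with 4K
 * pages, an 8-page buffer whose pages 1 and 2 are valid, and an unaligned
 * valid range of [5000, 6000): validoff starts in page 1, backs up until
 * page 0 (invalid) stops the scan, and becomes 1*4096 = 4096; validend
 * starts in page 1, advances past valid pages 1 and 2, and becomes
 * 3*4096 = 12288.  The normalized range [4096, 12288) covers exactly the
 * contiguous run of valid pages.
 */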

/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
			;
		}
		nfs_buf_refrele(bp);
		if (error) {
			break;
		}
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			nfs_buf_check_write_verifier(np, bp);
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
}

/*
 * thread to service the delayed write queue when asked
 */
void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	/* nfsbufdelwrithd was just cleared above, so terminate ourselves */
	thread_terminate(current_thread());
}

/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri)) {
		return;
	}
	if (!locked) {
		lck_mtx_lock(nfs_buf_mutex);
	}
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd) {
		wakeup(&nfsbufdelwrithd);
	} else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
		thread_deallocate(nfsbufdelwrithd);
	}
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd) {
		nfs_buf_delwri_service();
	}
	if (!locked) {
		lck_mtx_unlock(nfs_buf_mutex);
	}
}

/*
 * Get an nfs buffer.
 *
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
int
nfs_buf_get(
	nfsnode_t np,
	daddr64_t blkno,
	uint32_t size,
	thread_t thd,
	int flags,
	struct nfsbuf **bpp)
{
	vnode_t vp = NFSTOV(np);
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	uint32_t bufsize;
	int slpflag = PCATCH;
	int operation = (flags & NBLK_OPMASK);
	int error = 0;
	struct timespec ts;

	FSDBG_TOP(541, np, blkno, size, flags);
	*bpp = NULL;

	bufsize = size;
	if (bufsize > NFS_MAXBSIZE) {
		panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
	}

	if (nfs_mount_gone(nmp)) {
		FSDBG_BOT(541, np, blkno, 0, ENXIO);
		return ENXIO;
	}

	if (!UBCINFOEXISTS(vp)) {
		operation = NBLK_META;
	} else if (bufsize < (uint32_t)nmp->nm_biosize) {
		/* reg files should always have biosize blocks */
		bufsize = nmp->nm_biosize;
	}

	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
		FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);

		/* poke the delwri list */
		nfs_buf_delwri_push(0);

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
	}

loop:
	lck_mtx_lock(nfs_buf_mutex);

	/* wait for any buffer invalidation/flushing to complete */
	while (np->n_bflag & NBINVALINPROG) {
		np->n_bflag |= NBINVALWANT;
		ts.tv_sec = 2;
		ts.tv_nsec = 0;
		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
		if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0, error);
			return error;
		}
		if (np->n_bflag & NBINVALINPROG) {
			slpflag = 0;
		}
	}

	/* check for existence of nfsbuf in cache */
	if ((bp = nfs_buf_incore(np, blkno))) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
			if (flags & NBLK_NOWAIT) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
				return 0;
			}
			FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
			SET(bp->nb_lflags, NBL_WANTED);

			ts.tv_sec = 2;
			ts.tv_nsec = 0;
			msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
			    "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
			slpflag = 0;
			FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return error;
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize) {
			panic("nfsbuf size mismatch");
		}
		SET(bp->nb_lflags, NBL_BUSY);
		SET(bp->nb_flags, NB_CACHE);
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST)) {
			panic("pagelist buffer was not busy");
		}
		goto buffer_setup;
	}

	if (flags & NBLK_ONLYVALID) {
		lck_mtx_unlock(nfs_buf_mutex);
		FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
		return 0;
	}

	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */

	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
		/* if we've hit max meta buffers, must reuse a meta buffer */
		bp = TAILQ_FIRST(&nfsbuffreemeta);
	} else if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;
		struct timeval now;
		microuptime(&now);

		/* if the next LRU or META buffer is invalid or stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) {
			bp = lrubp;
		}
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) {
			bp = metabp;
		}

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!metabp) {
				bp = lrubp;
			} else if (!lrubp) {
				bp = metabp;
			} else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time) {
					bp = lrubp;
				} else {
					bp = metabp;
				}
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, np, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			panic("nfs_buf_get: delwri");
		}
		SET(bp->nb_lflags, NBL_BUSY);
		/* disassociate buffer from previous nfsnode */
		if (bp->nb_np) {
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
			}
			bp->nb_np = NULL;
		}
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		if (IS_VALID_CRED(bp->nb_rcred)) {
			kauth_cred_unref(&bp->nb_rcred);
		}
		if (IS_VALID_CRED(bp->nb_wcred)) {
			kauth_cred_unref(&bp->nb_wcred);
		}
		/* if buf will no longer be NB_META, dump old buffer */
		if (operation == NBLK_META) {
			if (!ISSET(bp->nb_flags, NB_META)) {
				nfsbufmetacnt++;
			}
		} else if (ISSET(bp->nb_flags, NB_META)) {
			if (bp->nb_data) {
				kfree(bp->nb_data, bp->nb_bufsize);
				bp->nb_data = NULL;
			}
			nfsbufmetacnt--;
		}
		/* re-init buf fields */
		bp->nb_error = 0;
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		bp->nb_valid = 0;
		bp->nb_dirty = 0;
		bp->nb_verf = 0;
	} else {
		/* no buffer to reuse */
		if ((nfsbufcnt < nfsbufmax) &&
		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
			/* just alloc a new one */
			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
			if (!bp) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0, error);
				return ENOMEM;
			}
			nfsbufcnt++;

			/*
			 * If any excess bufs, make sure the timer
			 * is running to free them up later.
			 */
			if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
				nfs_buf_timer_on = 1;
				nfs_interval_timer_start(nfs_buf_timer_call,
				    NFSBUF_FREE_PERIOD * 1000);
			}

			if (operation == NBLK_META) {
				nfsbufmetacnt++;
			}
			NFSBUFCNTCHK();
			/* init nfsbuf */
			bzero(bp, sizeof(*bp));
			os_ref_init(&bp->nb_refs, NULL);

			bp->nb_free.tqe_next = NFSNOLIST;
			bp->nb_validoff = bp->nb_validend = -1;
			FSDBG(545, np, blkno, bp, 0);
		} else {
			/* too many bufs... wait for buffers to free up */
			FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);

			/* poke the delwri list */
			nfs_buf_delwri_push(1);

			nfsneedbuffer = 1;
			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
			FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return error;
			}
			goto loop;
		}
	}

	/* set up nfsbuf */
	SET(bp->nb_lflags, NBL_BUSY);
	bp->nb_flags = 0;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
	/* associate buffer with new nfsnode */
	bp->nb_np = np;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	/* unlock hash */
	lck_mtx_unlock(nfs_buf_mutex);

	switch (operation) {
	case NBLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			bp->nb_valid = 0;
			bp->nb_dirty = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data) {
			bp->nb_data = kalloc(bufsize);
		}
		if (!bp->nb_data) {
			/* Ack! couldn't allocate the data buffer! */
			/* clean up buffer and return error */
			lck_mtx_lock(nfs_buf_mutex);
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
			bp->nb_np = NULL;
			/* invalidate usage timestamp to allow immediate freeing */
			NBUFSTAMPINVALIDATE(bp);
			if (bp->nb_free.tqe_next != NFSNOLIST) {
				panic("nfsbuf on freelist");
			}
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
			return ENOMEM;
		}
		bp->nb_bufsize = bufsize;
		break;

	case NBLK_READ:
	case NBLK_WRITE:
		/*
		 * Set or clear NB_READ now to let the UPL subsystem know
		 * if we intend to modify the pages or not.
		 */
		if (operation == NBLK_READ) {
			SET(bp->nb_flags, NB_READ);
		} else {
			CLR(bp->nb_flags, NB_READ);
		}
		if (bufsize < PAGE_SIZE) {
			bufsize = PAGE_SIZE;
		}
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCINFOEXISTS(vp)) {
			/* set up upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* clean up buffer and return error */
				lck_mtx_lock(nfs_buf_mutex);
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_np = NULL;
				/* invalidate usage timestamp to allow immediate freeing */
				NBUFSTAMPINVALIDATE(bp);
				if (bp->nb_free.tqe_next != NFSNOLIST) {
					panic("nfsbuf on freelist");
				}
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
				return EIO;
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	*bpp = bp;

	FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);

	return 0;
}
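
/*
 * A minimal usage sketch for nfs_buf_get()/nfs_buf_release() (illustrative
 * only, not part of the original source; compiled out).  The helper name
 * is hypothetical and error handling is abbreviated:
 */
#if 0
static int
example_touch_block(nfsnode_t np, daddr64_t lbn, thread_t thd)
{
	struct nfsbuf *bp = NULL;
	int error;

	/* look up (or create) the block's buffer, waiting if it's busy */
	error = nfs_buf_get(np, lbn, NFSTONMP(np)->nm_biosize, thd, NBLK_READ, &bp);
	if (error) {
		return error;
	}
	/* bp is returned NBL_BUSY; map it before touching bp->nb_data */
	NFS_BUF_MAP(bp);
	/* ... examine or fill bp->nb_data here ... */
	nfs_buf_release(bp, 1);
	return 0;
}
#endif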

void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	nfsnode_t np = bp->nb_np;
	vnode_t vp;
	struct timeval now;
	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	vp = np ? NFSTOV(np) : NULL;
	if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		int upl_flags, rv;
		upl_t upl;
		uint32_t i;

		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv) {
				printf("nfs_buf_release: upl create failed %d\n", rv);
			} else {
				nfs_buf_upl_check(bp);
			}
		}
		upl = bp->nb_pagelist;
		if (!upl) {
			goto pagelist_cleanup_done;
		}
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS) {
				panic("ubc_upl_unmap failed");
			}
			bp->nb_data = NULL;
		}
		/*
		 * Abort the pages on error or: if this is an invalid or
		 * non-needcommit nocache buffer AND no pages are dirty.
		 */
		if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
		    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
			if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) {
				upl_flags = UPL_ABORT_DUMP_PAGES;
			} else {
				upl_flags = 0;
			}
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
		for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) {
			if (!NBPGVALID(bp, i)) {
				ubc_upl_abort_range(upl,
				    i * PAGE_SIZE, PAGE_SIZE,
				    UPL_ABORT_DUMP_PAGES |
				    UPL_ABORT_FREE_ON_EMPTY);
			} else {
				if (NBPGDIRTY(bp, i)) {
					upl_flags = UPL_COMMIT_SET_DIRTY;
				} else {
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				}

				if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) {
					upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
				}

				ubc_upl_commit_range(upl,
				    i * PAGE_SIZE, PAGE_SIZE,
				    upl_flags |
				    UPL_COMMIT_INACTIVATE |
				    UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* invalidate any pages past EOF */
		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
			off_t start, end;
			start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
			end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
			if (start < NBOFF(bp)) {
				start = NBOFF(bp);
			}
			if (end > start) {
				if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) {
					printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
				}
			}
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	lck_mtx_lock(nfs_buf_mutex);

	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup_needbuffer = 1;
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		CLR(bp->nb_lflags, NBL_WANTED);
		wakeup_buffer = 1;
	}

	/* If it's non-needcommit nocache, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, NB_ERROR) ||
	    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) {
		SET(bp->nb_flags, NB_INVAL);
	}

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its nfsnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_np = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			nfs_nbdwrite--;
			NFSBUFCNTCHK();
			wakeup_nbdwrite = 1;
		}
		/* invalidate usage timestamp to allow immediate freeing */
		NBUFSTAMPINVALIDATE(bp);
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST) {
			panic("nfsbuf on freelist");
		}
		SET(bp->nb_flags, NB_INVAL);
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST) {
			panic("nfsbuf on freelist");
		}
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
		freeup = 0;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST) {
			panic("nfsbuf on freelist");
		}
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	}

	NFSBUFCNTCHK();

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
	CLR(bp->nb_lflags, NBL_BUSY);

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);

	lck_mtx_unlock(nfs_buf_mutex);

	if (wakeup_needbuffer) {
		wakeup(&nfsneedbuffer);
	}
	if (wakeup_buffer) {
		wakeup(bp);
	}
	if (wakeup_nbdwrite) {
		wakeup(&nfs_nbdwrite);
	}
	if (freeup) {
		NFS_BUF_FREEUP();
	}
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE)) {
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
	}

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return EINTR;
	} else if (ISSET(bp->nb_flags, NB_ERROR)) {
		return bp->nb_error ? bp->nb_error : EIO;
	}
	return 0;
}
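
/*
 * Typical shape of a synchronous submission around nfs_buf_iowait()
 * (an illustrative sketch, not part of the original source; compiled
 * out).  A caller that submits a buffer without NB_ASYNC sleeps here
 * until the I/O path calls nfs_buf_iodone(), then collects the error:
 */
#if 0
	CLR(bp->nb_flags, NB_ASYNC);    /* we'll wait for completion ourselves */
	error = nfs_buf_read(bp);       /* issue the READ RPC(s) */
	if (!error) {
		error = nfs_buf_iowait(bp);     /* returns once NB_DONE is set */
	}
#endif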

/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{
	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE)) {
		panic("nfs_buf_iodone already");
	}

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(NFSTOV(bp->nb_np));
		nfs_node_lock_force(bp->nb_np);
		bp->nb_np->n_numoutput--;
		nfs_node_unlock(bp->nb_np);
	}
	if (ISSET(bp->nb_flags, NB_ASYNC)) {    /* if async, release it */
		SET(bp->nb_flags, NB_DONE);     /* note that it's done */
		nfs_buf_release(bp, 1);
	} else {                                /* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);     /* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}

void
nfs_buf_write_delayed(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Make sure it's on its node's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		nfs_nbdwrite++;
		NFSBUFCNTCHK();
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
		}
		LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/* the file is in a modified state, so make sure the flag's set */
	nfs_node_lock_force(np);
	np->n_flag |= NMODIFIED;
	nfs_node_unlock(np);

	/*
	 * If we have too many delayed write buffers,
	 * just fall back to doing the async write.
	 */
	if (nfs_nbdwrite < 0) {
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
	}
	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}

/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
		return;
	}

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		return;
	}
	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
		return;
	}

	/* write verifier changed, clear commit/wverf flags */
	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
	bp->nb_verf = 0;
	nfs_node_lock_force(np);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_node_unlock(np);
}

/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	os_ref_retain_locked(&bp->nb_refs);
}
/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	(void) os_ref_release_locked(&bp->nb_refs);
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the lck_mtx_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT) {
			return EBUSY;
		}
		SET(bp->nb_lflags, NBL_WANTED);

		ts.tv_sec = (slptimeo / 100);
		/* hz is 100, so each tick is 10ms: e.g. slptimeo = 250 -> 2s + 500ms */
		ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
		    "nfs_buf_acquire", &ts);
		if (error) {
			return error;
		}
		return EAGAIN;
	}
	if (flags & NBAC_REMOVE) {
		nfs_buf_remfree(bp);
	}
	SET(bp->nb_lflags, NBL_BUSY);

	return 0;
}

/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
		panic("nfs_buf_drop: buffer not busy!");
	}
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/* delay the actual wakeup until after we clear NBL_BUSY */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup) {
		wakeup(bp);
	}
}
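
/*
 * The canonical busy/ref pattern around nfs_buf_acquire()/nfs_buf_drop()
 * (illustrative sketch, not part of the original source; compiled out).
 * The extra reference keeps the nfsbuf from being freed while we sleep
 * waiting to acquire it:
 */
#if 0
	lck_mtx_lock(nfs_buf_mutex);
	nfs_buf_refget(bp);
	while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
		;       /* lost a race after the sleep; retry until busy or error */
	}
	nfs_buf_refrele(bp);
	if (!error) {
		/* ... bp is now NBL_BUSY; use it, then ... */
		nfs_buf_drop(bp);
	}
	lck_mtx_unlock(nfs_buf_mutex);
#endif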

/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY) {
		listheadp = &np->n_dirtyblkhd;
	} else {
		listheadp = &np->n_cleanblkhd;
	}

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return EWOULDBLOCK;
	}

	while (np->n_bufiterflags & NBI_ITER) {
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return EINVAL;
	}
	np->n_bufiterflags |= NBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return 0;
}

/*
 * clean up after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY) {
		listheadp = &np->n_dirtyblkhd;
	} else {
		listheadp = &np->n_cleanblkhd;
	}

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}
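
/*
 * Sketch of the iterate pattern these two functions bracket (illustrative,
 * not part of the original source; compiled out).  The buffers are moved
 * onto a private iteration list so other threads can't re-queue them
 * underneath us mid-walk:
 */
#if 0
	struct nfsbuflists blist;
	struct nfsbuf *bp;

	lck_mtx_lock(nfs_buf_mutex);
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			/* ... examine or flush bp ... */
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(nfs_buf_mutex);
#endif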


/*
 * Read an NFS buffer for a file.
 */
int
nfs_buf_read(struct nfsbuf *bp)
{
	int error = 0;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;

	np = bp->nb_np;
	cred = bp->nb_rcred;
	if (IS_VALID_CRED(cred)) {
		kauth_cred_ref(cred);
	}
	thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();

	/* sanity checks */
	if (!ISSET(bp->nb_flags, NB_READ)) {
		panic("nfs_buf_read: !NB_READ");
	}
	if (ISSET(bp->nb_flags, NB_DONE)) {
		CLR(bp->nb_flags, NB_DONE);
	}

	NFS_BUF_MAP(bp);

	OSAddAtomic64(1, &nfsstats.read_bios);

	error = nfs_buf_read_rpc(bp, thd, cred);
	/*
	 * For async I/O, the callbacks will finish up the
	 * read.  Otherwise, the read has already been finished.
	 */

	if (IS_VALID_CRED(cred)) {
		kauth_cred_unref(&cred);
	}
	return error;
}

/*
 * finish the reading of a buffer
 */
void
nfs_buf_read_finish(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_ERROR)) {
		/* update valid range */
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_endio;
		if (bp->nb_endio < (int)bp->nb_bufsize) {
			/*
			 * The read may be short because we have unflushed writes
			 * that are extending the file size and the reads hit the
			 * (old) EOF on the server.  So, just make sure nb_validend
			 * correctly tracks EOF.
			 * Note that the missing data should have already been zeroed
			 * in nfs_buf_read_rpc_finish().
			 */
			off_t boff = NBOFF(bp);
			if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) {
				bp->nb_validend = bp->nb_bufsize;
			} else if ((off_t)np->n_size >= boff) {
				bp->nb_validend = np->n_size - boff;
			} else {
				bp->nb_validend = 0;
			}
		}
		if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
		    ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
			bp->nb_validend = 0x100000000LL - NBOFF(bp);
		}
		bp->nb_valid = (uint32_t)(1LLU << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
		if (bp->nb_validend & PAGE_MASK) {
			/* zero-fill remainder of last page */
			bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
		}
	}
	nfs_buf_iodone(bp);
}
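
/*
 * Worked example of the nb_valid computation above (illustrative numbers,
 * not from the original source): with 4K pages and nb_validend = 6000,
 * round_page_32(6000) = 8192, so 8192/4096 = 2 pages are valid and
 * nb_valid = (1 << 2) - 1 = 0b11 (pages 0 and 1).  Since 6000 & PAGE_MASK
 * is nonzero, bytes 6000..8191 of the last valid page are zero-filled.
 */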

/*
 * initiate the NFS READ RPC(s) for a buffer
 */
int
nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
{
	struct nfsmount *nmp;
	nfsnode_t np = bp->nb_np;
	int error = 0, nfsvers, async;
	int offset, nrpcs;
	uint32_t nmrsize, length, len;
	off_t boff;
	struct nfsreq *req;
	struct nfsreq_cbinfo cb;

	nmp = NFSTONMP(np);
	if (nfs_mount_gone(nmp)) {
		bp->nb_error = error = ENXIO;
		SET(bp->nb_flags, NB_ERROR);
		nfs_buf_iodone(bp);
		return error;
	}
	nfsvers = nmp->nm_vers;
	nmrsize = nmp->nm_rsize;

	boff = NBOFF(bp);
	offset = 0;
	length = bp->nb_bufsize;

	if (nfsvers == NFS_VER2) {
		if (boff > 0xffffffffLL) {
			bp->nb_error = error = EFBIG;
			SET(bp->nb_flags, NB_ERROR);
			nfs_buf_iodone(bp);
			return error;
		}
		if ((boff + length - 1) > 0xffffffffLL) {
			length = 0x100000000LL - boff;
		}
	}

	/* Note: Can only do async I/O if nfsiods are configured. */
	async = (bp->nb_flags & NB_ASYNC);
	cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
	cb.rcb_bp = bp;

	bp->nb_offio = bp->nb_endio = 0;
	bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
	if (async && (nrpcs > 1)) {
		SET(bp->nb_flags, NB_MULTASYNCRPC);
	} else {
		CLR(bp->nb_flags, NB_MULTASYNCRPC);
	}

	while (length > 0) {
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
		len = (length > nmrsize) ? nmrsize : length;
		cb.rcb_args[0] = offset;
		cb.rcb_args[1] = len;
#if CONFIG_NFS4
		if (nmp->nm_vers >= NFS_VER4) {
			cb.rcb_args[2] = nmp->nm_stategenid;
		}
#endif
		req = NULL;
		error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
		if (error) {
			break;
		}
		offset += len;
		length -= len;
		if (async) {
			continue;
		}
		nfs_buf_read_rpc_finish(req);
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
	}

	if (length > 0) {
		/*
		 * Something bad happened while trying to send the RPC(s).
		 * Wait for any outstanding requests to complete.
		 */
		bp->nb_error = error;
		SET(bp->nb_flags, NB_ERROR);
		if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
			nrpcs = (length + nmrsize - 1) / nmrsize;
			lck_mtx_lock(nfs_buf_mutex);
			bp->nb_rpcs -= nrpcs;
			if (bp->nb_rpcs == 0) {
				/* No RPCs left, so the buffer's done */
				lck_mtx_unlock(nfs_buf_mutex);
				nfs_buf_iodone(bp);
			} else {
				/* wait for the last RPC to mark it done */
				while (bp->nb_rpcs > 0) {
					msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
					    "nfs_buf_read_rpc_cancel", NULL);
				}
				lck_mtx_unlock(nfs_buf_mutex);
			}
		} else {
			nfs_buf_iodone(bp);
		}
	}

	return error;
}
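
/*
 * Illustrative split arithmetic for the RPC loop above (example numbers,
 * not from the original source): a 32K buffer against a mount with
 * nm_rsize = 8K yields nrpcs = (32768 + 8191) / 8192 = 4 READ RPCs at
 * buffer offsets 0, 8192, 16384 and 24576, each for len = 8192 bytes.
 */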
1742
1743 /*
1744 * finish up an NFS READ RPC on a buffer
1745 */
1746 void
1747 nfs_buf_read_rpc_finish(struct nfsreq *req)
1748 {
1749 struct nfsmount *nmp;
1750 size_t rlen;
1751 struct nfsreq_cbinfo cb;
1752 struct nfsbuf *bp;
1753 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1754 void *wakeme = NULL;
1755 struct nfsreq *rreq = NULL;
1756 nfsnode_t np;
1757 thread_t thd;
1758 kauth_cred_t cred;
1759 uio_t auio;
1760 char uio_buf[UIO_SIZEOF(1)];
1761
1762 finish:
1763 np = req->r_np;
1764 thd = req->r_thread;
1765 cred = req->r_cred;
1766 if (IS_VALID_CRED(cred)) {
1767 kauth_cred_ref(cred);
1768 }
1769 cb = req->r_callback;
1770 bp = cb.rcb_bp;
1771 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1772 nfs_request_ref(req, 0);
1773 }
1774
1775 nmp = NFSTONMP(np);
1776 if (nfs_mount_gone(nmp)) {
1777 SET(bp->nb_flags, NB_ERROR);
1778 bp->nb_error = error = ENXIO;
1779 }
1780 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1781 /* just drop it */
1782 nfs_request_async_cancel(req);
1783 goto out;
1784 }
1785
1786 nfsvers = nmp->nm_vers;
1787 offset = cb.rcb_args[0];
1788 rlen = length = cb.rcb_args[1];
1789
1790 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1791 UIO_READ, &uio_buf, sizeof(uio_buf));
1792 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1793
1794 /* finish the RPC */
1795 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1796 if ((error == EINPROGRESS) && cb.rcb_func) {
1797 /* async request restarted */
1798 if (cb.rcb_func) {
1799 nfs_request_rele(req);
1800 }
1801 if (IS_VALID_CRED(cred)) {
1802 kauth_cred_unref(&cred);
1803 }
1804 return;
1805 }
1806 #if CONFIG_NFS4
1807 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1808 lck_mtx_lock(&nmp->nm_lock);
1809 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1810 NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1811 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
1812 nfs_need_recover(nmp, error);
1813 }
1814 lck_mtx_unlock(&nmp->nm_lock);
1815 if (np->n_flag & NREVOKE) {
1816 error = EIO;
1817 } else {
1818 if (error == NFSERR_GRACE) {
1819 if (cb.rcb_func) {
1820 /*
1821 * For an async I/O request, handle a grace delay just like
1822 * jukebox errors. Set the resend time and queue it up.
1823 */
1824 struct timeval now;
1825 if (req->r_nmrep.nmc_mhead) {
1826 mbuf_freem(req->r_nmrep.nmc_mhead);
1827 req->r_nmrep.nmc_mhead = NULL;
1828 }
1829 req->r_error = 0;
1830 microuptime(&now);
1831 lck_mtx_lock(&req->r_mtx);
1832 req->r_resendtime = now.tv_sec + 2;
1833 req->r_xid = 0; // get a new XID
1834 req->r_flags |= R_RESTART;
1835 req->r_start = 0;
1836 nfs_asyncio_resend(req);
1837 lck_mtx_unlock(&req->r_mtx);
1838 if (IS_VALID_CRED(cred)) {
1839 kauth_cred_unref(&cred);
1840 }
1841 /* Note: nfsreq reference taken will be dropped later when finished */
1842 return;
1843 }
1844 /* otherwise, just pause a couple seconds and retry */
1845 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
1846 }
1847 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1848 rlen = 0;
1849 goto readagain;
1850 }
1851 }
1852 }
1853 #endif
1854 if (error) {
1855 SET(bp->nb_flags, NB_ERROR);
1856 bp->nb_error = error;
1857 goto out;
1858 }
1859
1860 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) {
1861 bp->nb_endio = offset + rlen;
1862 }
1863
1864 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1865 /* zero out the remaining data (up to EOF) */
1866 off_t rpcrem, eofrem, rem;
1867 rpcrem = (length - rlen);
1868 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1869 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1870 if (rem > 0) {
1871 bzero(bp->nb_data + offset + rlen, rem);
1872 }
1873 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1874 /*
1875 * short read
1876 *
1877 * We haven't hit EOF and we didn't get all the data
1878 * requested, so we need to issue another read for the rest.
1879 * (Don't bother if the buffer already hit an error.)
1880 */
1881 #if CONFIG_NFS4
1882 readagain:
1883 #endif
1884 offset += rlen;
1885 length -= rlen;
1886 cb.rcb_args[0] = offset;
1887 cb.rcb_args[1] = length;
1888 #if CONFIG_NFS4
1889 if (nmp->nm_vers >= NFS_VER4) {
1890 cb.rcb_args[2] = nmp->nm_stategenid;
1891 }
1892 #endif
1893 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1894 if (!error) {
1895 if (IS_VALID_CRED(cred)) {
1896 kauth_cred_unref(&cred);
1897 }
1898 if (!cb.rcb_func) {
1899 /* if !async we'll need to wait for this RPC to finish */
1900 req = rreq;
1901 rreq = NULL;
1902 goto finish;
1903 }
1904 nfs_request_rele(req);
1905 /*
1906 * We're done here.
1907 * Outstanding RPC count is unchanged.
1908 * Callback will be called when RPC is done.
1909 */
1910 return;
1911 }
1912 SET(bp->nb_flags, NB_ERROR);
1913 bp->nb_error = error;
1914 }
1915
1916 out:
1917 if (cb.rcb_func) {
1918 nfs_request_rele(req);
1919 }
1920 if (IS_VALID_CRED(cred)) {
1921 kauth_cred_unref(&cred);
1922 }
1923
1924 /*
1925 * Decrement outstanding RPC count on buffer
1926 * and call nfs_buf_read_finish on last RPC.
1927 *
1928 * (Note: when there are multiple async RPCs issued for a
1929 * buffer we need nfs_buffer_mutex to avoid problems when
1930 * aborting a partially-initiated set of RPCs)
1931 */
1932
1933 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1934 if (multasyncrpc) {
1935 lck_mtx_lock(nfs_buf_mutex);
1936 }
1937
1938 bp->nb_rpcs--;
1939 finished = (bp->nb_rpcs == 0);
1940
1941 if (multasyncrpc) {
1942 lck_mtx_unlock(nfs_buf_mutex);
1943 }
1944
1945 if (finished) {
1946 if (multasyncrpc) {
1947 wakeme = &bp->nb_rpcs;
1948 }
1949 nfs_buf_read_finish(bp);
1950 if (wakeme) {
1951 wakeup(wakeme);
1952 }
1953 }
1954 }
1955
1956 /*
1957 * Do buffer readahead.
1958 * Initiate async I/O to read buffers not in cache.
1959 */
1960 int
1961 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1962 {
1963 struct nfsmount *nmp = NFSTONMP(np);
1964 struct nfsbuf *bp;
1965 int error = 0;
1966 uint32_t nra;
1967
1968 if (nfs_mount_gone(nmp)) {
1969 return ENXIO;
1970 }
1971 if (nmp->nm_readahead <= 0) {
1972 return 0;
1973 }
1974 if (*rabnp > lastrabn) {
1975 return 0;
1976 }
1977
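/*
 * Illustrative sizing (assumed values, not necessarily the defaults):
 * with nm_biosize = 32KB and nm_readahead = 4, a sequential reader at
 * block N kicks off async reads for blocks N..N+3 in the loop below,
 * stopping early once a block would start at or beyond EOF.
 */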
1978 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1979 /* check if block exists and is valid. */
1980 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1981 /* stop reading ahead if we're beyond EOF */
1982 *rabnp = lastrabn;
1983 break;
1984 }
1985 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp);
1986 if (error) {
1987 break;
1988 }
1989 nfs_node_lock_force(np);
1990 np->n_lastrahead = *rabnp;
1991 nfs_node_unlock(np);
1992 if (!bp) {
1993 continue;
1994 }
1995 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1996 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) {
1997 CLR(bp->nb_flags, NB_CACHE);
1998 bp->nb_valid = 0;
1999 bp->nb_validoff = bp->nb_validend = -1;
2000 }
2001 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
2002 !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) {
2003 SET(bp->nb_flags, (NB_READ | NB_ASYNC));
2004 if (ioflag & IO_NOCACHE) {
2005 SET(bp->nb_flags, NB_NCRDAHEAD);
2006 }
2007 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2008 kauth_cred_ref(cred);
2009 bp->nb_rcred = cred;
2010 }
2011 if ((error = nfs_buf_read(bp))) {
2012 break;
2013 }
2014 continue;
2015 }
2016 nfs_buf_release(bp, 1);
2017 }
2018 return error;
2019 }
2020
2021 /*
2022 * NFS buffer I/O for reading files.
2023 */
2024 int
2025 nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
2026 {
2027 vnode_t vp = NFSTOV(np);
2028 struct nfsbuf *bp = NULL;
2029 struct nfsmount *nmp = VTONMP(vp);
2030 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
2031 off_t diff;
2032 int error = 0, n = 0, on = 0;
2033 int nfsvers, biosize, modified, readaheads = 0;
2034 thread_t thd;
2035 kauth_cred_t cred;
2036 int64_t io_resid;
2037
2038 FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
2039
2040 nfsvers = nmp->nm_vers;
2041 biosize = nmp->nm_biosize;
2042 thd = vfs_context_thread(ctx);
2043 cred = vfs_context_ucred(ctx);
2044
2045 if (vnode_vtype(vp) != VREG) {
2046 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
2047 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
2048 return EINVAL;
2049 }
2050
2051 /*
2052 * For NFS, cache consistency can only be maintained approximately.
2053 * Although RFC1094 does not specify the criteria, the following is
2054 * believed to be compatible with the reference port.
2055 *
2056 * If the file has changed since the last read RPC or you have
2057 * written to the file, you may have lost data cache consistency
2058 * with the server. So, check for a change, and flush all of the
2059 * file's data out of the cache.
2060 * NB: This implies that cache data can be read when up to
2061 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2062 * need current attributes, nfs_getattr() can be forced to fetch
2063 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2064 */
2065
2066 if (ISSET(np->n_flag, NUPDATESIZE)) {
2067 nfs_data_update_size(np, 0);
2068 }
2069
2070 if ((error = nfs_node_lock(np))) {
2071 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
2072 return error;
2073 }
2074
2075 if (np->n_flag & NNEEDINVALIDATE) {
2076 np->n_flag &= ~NNEEDINVALIDATE;
2077 nfs_node_unlock(np);
2078 error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1);
2079 if (!error) {
2080 error = nfs_node_lock(np);
2081 }
2082 if (error) {
2083 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
2084 return error;
2085 }
2086 }
2087
2088 modified = (np->n_flag & NMODIFIED);
2089 nfs_node_unlock(np);
2090 /* nfs_getattr() will check changed and purge caches */
2091 error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
2092 if (error) {
2093 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
2094 return error;
2095 }
2096
2097 if (uio_resid(uio) == 0) {
2098 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
2099 return 0;
2100 }
2101 if (uio_offset(uio) < 0) {
2102 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
2103 return EINVAL;
2104 }
2105
2106 /*
2107 * set up readahead - which may be limited by:
2108 * + current request length (for IO_NOCACHE)
2109 * + readahead setting
2110 * + file size
2111 */
2112 if (nmp->nm_readahead > 0) {
2113 off_t end = uio_offset(uio) + uio_resid(uio);
2114 if (end > (off_t)np->n_size) {
2115 end = np->n_size;
2116 }
2117 rabn = uio_offset(uio) / biosize;
2118 maxrabn = (end - 1) / biosize;
2119 nfs_node_lock_force(np);
2120 if (!(ioflag & IO_NOCACHE) &&
2121 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
2122 maxrabn += nmp->nm_readahead;
2123 if ((maxrabn * biosize) >= (off_t)np->n_size) {
2124 maxrabn = ((off_t)np->n_size - 1) / biosize;
2125 }
2126 }
2127 if (maxrabn < np->n_lastrahead) {
2128 np->n_lastrahead = -1;
2129 }
2130 if (rabn < np->n_lastrahead) {
2131 rabn = np->n_lastrahead + 1;
2132 }
2133 nfs_node_unlock(np);
2134 } else {
2135 rabn = maxrabn = 0;
2136 }
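/*
 * Worked example (illustrative values): with biosize = 32KB, a read of
 * 64KB at offset 0 gives rabn = 0 and maxrabn = 1; if the access looks
 * sequential (and isn't IO_NOCACHE), maxrabn is extended by
 * nm_readahead and then clipped so it never runs past the file's last
 * block.
 */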
2137
2138 do {
2139 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2140 lbn = uio_offset(uio) / biosize;
2141
2142 /*
2143 * Copy directly from any cached pages without grabbing the bufs.
2144 * (If we are NOCACHE and we've issued readahead requests, we need
2145 * to grab the NB_NCRDAHEAD bufs to drop them.)
2146 */
2147 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2148 ((uio->uio_segflg == UIO_USERSPACE32 ||
2149 uio->uio_segflg == UIO_USERSPACE64 ||
2150 uio->uio_segflg == UIO_USERSPACE))) {
2151 io_resid = uio_resid(uio);
2152 diff = np->n_size - uio_offset(uio);
2153 if (diff < io_resid) {
2154 io_resid = diff;
2155 }
2156 if (io_resid > 0) {
2157 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2158 error = cluster_copy_ubc_data(vp, uio, &count, 0);
2159 if (error) {
2160 nfs_data_unlock(np);
2161 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
2162 return error;
2163 }
2164 }
2165 /* count any biocache reads that we just copied directly */
2166 if (lbn != (uio_offset(uio) / biosize)) {
2167 OSAddAtomic64((uio_offset(uio) / biosize) - lbn, &nfsstats.biocache_reads);
2168 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2169 }
2170 }
2171
2172 lbn = uio_offset(uio) / biosize;
2173 on = uio_offset(uio) % biosize;
2174 nfs_node_lock_force(np);
2175 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2176 nfs_node_unlock(np);
2177
2178 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2179 nfs_data_unlock(np);
2180 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2181 return 0;
2182 }
2183
2184 /* adjust readahead block number, if necessary */
2185 if (rabn < lbn) {
2186 rabn = lbn;
2187 }
2188 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2189 if (rabn <= lastrabn) { /* start readaheads */
2190 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2191 if (error) {
2192 nfs_data_unlock(np);
2193 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2194 return error;
2195 }
2196 readaheads = 1;
2197 }
2198
2199 OSAddAtomic64(1, &nfsstats.biocache_reads);
2200
2201 /*
2202 * If the block is in the cache and has the required data
2203 * in a valid region, just copy it out.
2204 * Otherwise, get the block and write back/read in,
2205 * as required.
2206 */
2207 again:
2208 io_resid = uio_resid(uio);
2209 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2210 diff = np->n_size - uio_offset(uio);
2211 if (diff < n) {
2212 n = diff;
2213 }
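/*
 * Example of the clipping above (illustrative values): with
 * biosize = 32KB and uio_offset = 40KB, on = 8KB, so at most 24KB
 * (the rest of block 1) is copied this pass, further limited to EOF
 * by the diff check.
 */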
2214
2215 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2216 if (error) {
2217 nfs_data_unlock(np);
2218 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2219 return error;
2220 }
2221
2222 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2223 /*
2224 * IO_NOCACHE found a cached buffer.
2225 * Flush the buffer if it's dirty.
2226 * Invalidate the data if it wasn't just read
2227 * in as part of a "nocache readahead".
2228 */
2229 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2230 /* so write the buffer out and try again */
2231 SET(bp->nb_flags, NB_NOCACHE);
2232 goto flushbuffer;
2233 }
2234 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2235 CLR(bp->nb_flags, NB_NCRDAHEAD);
2236 SET(bp->nb_flags, NB_NOCACHE);
2237 }
2238 }
2239
2240 /* if any pages are valid... */
2241 if (bp->nb_valid) {
2242 /* ...check for any invalid pages in the read range */
2243 int pg, firstpg, lastpg, dirtypg;
2244 dirtypg = firstpg = lastpg = -1;
2245 pg = on / PAGE_SIZE;
2246 while (pg <= (on + n - 1) / PAGE_SIZE) {
2247 if (!NBPGVALID(bp, pg)) {
2248 if (firstpg < 0) {
2249 firstpg = pg;
2250 }
2251 lastpg = pg;
2252 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
2253 dirtypg = pg;
2254 }
2255 pg++;
2256 }
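/*
 * Illustrative scan result: for a read covering pages 0-3 of a buffer
 * whose pages are (valid, invalid, invalid, valid), the loop above
 * leaves firstpg = 1 and lastpg = 2, the span of invalid pages a
 * follow-up read RPC would have to fill.
 */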
2257
2258 /* if there are no invalid pages, we're all set */
2259 if (firstpg < 0) {
2260 if (bp->nb_validoff < 0) {
2261 /* valid range isn't set up, so */
2262 /* set it to what we know is valid */
2263 bp->nb_validoff = trunc_page(on);
2264 bp->nb_validend = round_page(on + n);
2265 nfs_buf_normalize_valid_range(np, bp);
2266 }
2267 goto buffer_ready;
2268 }
2269
2270 /* there are invalid pages in the read range */
2271 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2272 (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
2273 /* there are also dirty page(s) (or range) in the read range, */
2274 /* so write the buffer out and try again */
2275 flushbuffer:
2276 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2277 SET(bp->nb_flags, NB_ASYNC);
2278 if (!IS_VALID_CRED(bp->nb_wcred)) {
2279 kauth_cred_ref(cred);
2280 bp->nb_wcred = cred;
2281 }
2282 error = nfs_buf_write(bp);
2283 if (error) {
2284 nfs_data_unlock(np);
2285 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2286 return error;
2287 }
2288 goto again;
2289 }
2290 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2291 (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
2292 /* we need to read in more than half the buffer and the */
2293 /* buffer's not dirty, so just fetch the whole buffer */
2294 bp->nb_valid = 0;
2295 } else {
2296 /* read the page range in */
2297 uio_t auio;
2298 char uio_buf[UIO_SIZEOF(1)];
2299
2300 NFS_BUF_MAP(bp);
2301 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2302 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2303 if (!auio) {
2304 error = ENOMEM;
2305 } else {
2306 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2307 ((lastpg - firstpg + 1) * PAGE_SIZE));
2308 error = nfs_read_rpc(np, auio, ctx);
2309 }
2310 if (error) {
2311 if (ioflag & IO_NOCACHE) {
2312 SET(bp->nb_flags, NB_NOCACHE);
2313 }
2314 nfs_buf_release(bp, 1);
2315 nfs_data_unlock(np);
2316 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2317 return error;
2318 }
2319 /* Make sure that the valid range is set to cover this read. */
2320 bp->nb_validoff = trunc_page_32(on);
2321 bp->nb_validend = round_page_32(on + n);
2322 nfs_buf_normalize_valid_range(np, bp);
2323 if (uio_resid(auio) > 0) {
2324 /* if short read, must have hit EOF, */
2325 /* so zero the rest of the range */
2326 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2327 }
2328 /* mark the pages (successfully read) as valid */
2329 for (pg = firstpg; pg <= lastpg; pg++) {
2330 NBPGVALID_SET(bp, pg);
2331 }
2332 }
2333 }
2334 /* if no pages are valid, read the whole block */
2335 if (!bp->nb_valid) {
2336 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2337 kauth_cred_ref(cred);
2338 bp->nb_rcred = cred;
2339 }
2340 SET(bp->nb_flags, NB_READ);
2341 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2342 error = nfs_buf_read(bp);
2343 if (ioflag & IO_NOCACHE) {
2344 SET(bp->nb_flags, NB_NOCACHE);
2345 }
2346 if (error) {
2347 nfs_data_unlock(np);
2348 nfs_buf_release(bp, 1);
2349 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2350 return error;
2351 }
2352 }
2353 buffer_ready:
2354 /* validate read range against valid range and clip */
2355 if (bp->nb_validend > 0) {
2356 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2357 if (diff < n) {
2358 n = diff;
2359 }
2360 }
2361 if (n > 0) {
2362 NFS_BUF_MAP(bp);
2363 error = uiomove(bp->nb_data + on, n, uio);
2364 }
2365
2366
2367 nfs_buf_release(bp, 1);
2368 nfs_data_unlock(np);
2369 nfs_node_lock_force(np);
2370 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2371 nfs_node_unlock(np);
2372 } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2373 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2374 return error;
2375 }
2376
2377 /*
2378 * limit the number of outstanding async I/O writes
2379 */
2380 int
2381 nfs_async_write_start(struct nfsmount *nmp)
2382 {
2383 int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
2384 struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
2385
2386 if (nfs_max_async_writes <= 0) {
2387 return 0;
2388 }
2389 lck_mtx_lock(&nmp->nm_lock);
2390 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2391 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
2392 break;
2393 }
2394 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
2395 slpflag = 0;
2396 }
2397 if (!error) {
2398 nmp->nm_asyncwrites++;
2399 }
2400 lck_mtx_unlock(&nmp->nm_lock);
2401 return error;
2402 }
2403 void
2404 nfs_async_write_done(struct nfsmount *nmp)
2405 {
2406 if (nmp->nm_asyncwrites <= 0) {
2407 return;
2408 }
2409 lck_mtx_lock(&nmp->nm_lock);
2410 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
2411 wakeup(&nmp->nm_asyncwrites);
2412 }
2413 lck_mtx_unlock(&nmp->nm_lock);
2414 }
2415
2416 /*
2417 * write (or commit) the given NFS buffer
2418 *
2419 * Commit the buffer if we can.
2420 * Write out any dirty range.
2421 * If any dirty pages remain, write them out.
2422 * Mark buffer done.
2423 *
2424 * For async requests, all the work beyond sending the initial
2425 * write RPC is handled in the RPC callback(s).
2426 */
2427 int
2428 nfs_buf_write(struct nfsbuf *bp)
2429 {
2430 int error = 0, oldflags, async;
2431 nfsnode_t np;
2432 thread_t thd;
2433 kauth_cred_t cred;
2434 proc_t p = current_proc();
2435 int iomode, doff, dend, firstpg, lastpg;
2436 uint32_t pagemask;
2437
2438 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2439
2440 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
2441 panic("nfs_buf_write: buffer is not busy???");
2442 }
2443
2444 np = bp->nb_np;
2445 async = ISSET(bp->nb_flags, NB_ASYNC);
2446 oldflags = bp->nb_flags;
2447
2448 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
2449 if (ISSET(oldflags, NB_DELWRI)) {
2450 lck_mtx_lock(nfs_buf_mutex);
2451 nfs_nbdwrite--;
2452 NFSBUFCNTCHK();
2453 lck_mtx_unlock(nfs_buf_mutex);
2454 wakeup(&nfs_nbdwrite);
2455 }
2456
2457 /* move to clean list */
2458 if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
2459 lck_mtx_lock(nfs_buf_mutex);
2460 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2461 LIST_REMOVE(bp, nb_vnbufs);
2462 }
2463 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2464 lck_mtx_unlock(nfs_buf_mutex);
2465 }
2466 nfs_node_lock_force(np);
2467 np->n_numoutput++;
2468 nfs_node_unlock(np);
2469 vnode_startwrite(NFSTOV(np));
2470
2471 if (p && p->p_stats) {
2472 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2473 }
2474
2475 cred = bp->nb_wcred;
2476 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
2477 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2478 }
2479 if (IS_VALID_CRED(cred)) {
2480 kauth_cred_ref(cred);
2481 }
2482 thd = async ? NULL : current_thread();
2483
2484 /* We need to make sure the pages are locked before doing I/O. */
2485 if (!ISSET(bp->nb_flags, NB_META)) {
2486 if (UBCINFOEXISTS(NFSTOV(np))) {
2487 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2488 error = nfs_buf_upl_setup(bp);
2489 if (error) {
2490 printf("nfs_buf_write: upl create failed %d\n", error);
2491 SET(bp->nb_flags, NB_ERROR);
2492 bp->nb_error = error = EIO;
2493 nfs_buf_iodone(bp);
2494 goto out;
2495 }
2496 nfs_buf_upl_check(bp);
2497 }
2498 } else {
2499 /* We should never be in nfs_buf_write() with no UBCINFO. */
2500 printf("nfs_buf_write: ubcinfo already gone\n");
2501 SET(bp->nb_flags, NB_ERROR);
2502 bp->nb_error = error = EIO;
2503 nfs_buf_iodone(bp);
2504 goto out;
2505 }
2506 }
2507
2508 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2509 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2510 nfs_buf_check_write_verifier(np, bp);
2511 }
2512 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2513 struct nfsmount *nmp = NFSTONMP(np);
2514 if (nfs_mount_gone(nmp)) {
2515 SET(bp->nb_flags, NB_ERROR);
2516 bp->nb_error = error = EIO;
2517 nfs_buf_iodone(bp);
2518 goto out;
2519 }
2520 SET(bp->nb_flags, NB_WRITEINPROG);
2521 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2522 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2523 CLR(bp->nb_flags, NB_WRITEINPROG);
2524 if (error) {
2525 if (error != NFSERR_STALEWRITEVERF) {
2526 SET(bp->nb_flags, NB_ERROR);
2527 bp->nb_error = error;
2528 }
2529 nfs_buf_iodone(bp);
2530 goto out;
2531 }
2532 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2533 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2534 nfs_node_lock_force(np);
2535 np->n_needcommitcnt--;
2536 CHECK_NEEDCOMMITCNT(np);
2537 nfs_node_unlock(np);
2538 }
2539 if (!error && (bp->nb_dirtyend > 0)) {
2540 /* sanity check the dirty range */
2541 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2542 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2543 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2544 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2545 }
2546 }
2547 }
2548 if (!error && (bp->nb_dirtyend > 0)) {
2549 /* there's a dirty range that needs to be written out */
2550 NFS_BUF_MAP(bp);
2551
2552 doff = bp->nb_dirtyoff;
2553 dend = bp->nb_dirtyend;
2554
2555 /* if doff page is dirty, move doff to start of page */
2556 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
2557 doff -= doff & PAGE_MASK;
2558 }
2559 /* try to expand write range to include preceding dirty pages */
2560 if (!(doff & PAGE_MASK)) {
2561 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
2562 doff -= PAGE_SIZE;
2563 }
2564 }
2565 /* if dend page is dirty, move dend to start of next page */
2566 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2567 dend = round_page_32(dend);
2568 }
2569 /* try to expand write range to include trailing dirty pages */
2570 if (!(dend & PAGE_MASK)) {
2571 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2572 dend += PAGE_SIZE;
2573 }
2574 }
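/*
 * Illustrative effect of the adjustments above (assuming 4KB pages):
 * a dirty range of [5000, 9000) on a buffer whose first three pages
 * are all marked dirty becomes [0, 12288): doff is pulled back to the
 * start of its dirty page and across preceding dirty pages, and dend
 * is pushed out to the next page boundary and across trailing dirty
 * pages.
 */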
2575 /* make sure to keep dend clipped to EOF */
2576 if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
2577 dend = np->n_size - NBOFF(bp);
2578 }
2579 /* calculate range of complete pages being written */
2580 firstpg = round_page_32(doff) / PAGE_SIZE;
2581 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2582 /* calculate mask for that page range */
2583 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
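/*
 * Worked example: firstpg = 1, lastpg = 2 gives
 * ((1 << 3) - 1) & ~((1 << 1) - 1) = 0x7 & ~0x1 = 0x6,
 * i.e. bits set for exactly pages 1 and 2.
 */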
2584
2585 /*
2586 * compare page mask to nb_dirty; if there are other dirty pages
2587 * then write FILESYNC; otherwise, write UNSTABLE if async and
2588 * not needcommit/stable; otherwise write FILESYNC
2589 */
2590 if (bp->nb_dirty & ~pagemask) {
2591 iomode = NFS_WRITE_FILESYNC;
2592 } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
2593 iomode = NFS_WRITE_UNSTABLE;
2594 } else {
2595 iomode = NFS_WRITE_FILESYNC;
2596 }
2597
2598 /* write the whole contiguous dirty range */
2599 bp->nb_offio = doff;
2600 bp->nb_endio = dend;
2601
2602 OSAddAtomic64(1, &nfsstats.write_bios);
2603
2604 SET(bp->nb_flags, NB_WRITEINPROG);
2605 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2606 /*
2607 * For async I/O, the callbacks will finish up the
2608 * write and push out any dirty pages. Otherwise,
2609 * the write has already been finished and any dirty
2610 * pages pushed out.
2611 */
2612 } else {
2613 if (!error && bp->nb_dirty) { /* write out any dirty pages */
2614 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2615 }
2616 nfs_buf_iodone(bp);
2617 }
2618 /* note: bp is still valid only for !async case */
2619 out:
2620 if (!async) {
2621 error = nfs_buf_iowait(bp);
2622 /* move to clean list */
2623 if (oldflags & NB_DELWRI) {
2624 lck_mtx_lock(nfs_buf_mutex);
2625 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2626 LIST_REMOVE(bp, nb_vnbufs);
2627 }
2628 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2629 lck_mtx_unlock(nfs_buf_mutex);
2630 }
2631 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2632 nfs_buf_release(bp, 1);
2633 /* check if we need to invalidate (and we can) */
2634 if ((np->n_flag & NNEEDINVALIDATE) &&
2635 !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
2636 int invalidate = 0;
2637 nfs_node_lock_force(np);
2638 if (np->n_flag & NNEEDINVALIDATE) {
2639 invalidate = 1;
2640 np->n_flag &= ~NNEEDINVALIDATE;
2641 }
2642 nfs_node_unlock(np);
2643 if (invalidate) {
2644 /*
2645 * There was a write error and we need to
2646 * invalidate attrs and flush buffers in
2647 * order to sync up with the server.
2648 * (if this write was extending the file,
2649 * we may no longer know the correct size)
2650 *
2651 * But we couldn't call vinvalbuf while holding
2652 * the buffer busy. So we call vinvalbuf() after
2653 * releasing the buffer.
2654 */
2655 nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
2656 }
2657 }
2658 }
2659
2660 if (IS_VALID_CRED(cred)) {
2661 kauth_cred_unref(&cred);
2662 }
2663 return error;
2664 }
2665
2666 /*
2667 * finish the writing of a buffer
2668 */
2669 void
2670 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2671 {
2672 nfsnode_t np = bp->nb_np;
2673 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2674 int firstpg, lastpg;
2675 uint32_t pagemask;
2676
2677 if ((error == EINTR) || (error == ERESTART)) {
2678 CLR(bp->nb_flags, NB_ERROR);
2679 SET(bp->nb_flags, NB_EINTR);
2680 }
2681
2682 if (!error) {
2683 /* calculate range of complete pages being written */
2684 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2685 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2686 /* calculate mask for that page range written */
2687 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
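/*
 * Same mask construction as in nfs_buf_write(): e.g. nb_offio = 4096
 * and nb_endio = 12288 with 4KB pages give firstpg = 1, lastpg = 2,
 * pagemask = 0x6, so only the fully written pages have their dirty
 * bits cleared here.
 */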
2688 /* clear dirty bits for pages we've written */
2689 bp->nb_dirty &= ~pagemask;
2690 }
2691
2692 /* manage needcommit state */
2693 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2694 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2695 nfs_node_lock_force(np);
2696 np->n_needcommitcnt++;
2697 nfs_node_unlock(np);
2698 SET(bp->nb_flags, NB_NEEDCOMMIT);
2699 }
2700 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2701 bp->nb_dirtyoff = bp->nb_offio;
2702 bp->nb_dirtyend = bp->nb_endio;
2703 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2704 nfs_node_lock_force(np);
2705 np->n_needcommitcnt--;
2706 CHECK_NEEDCOMMITCNT(np);
2707 nfs_node_unlock(np);
2708 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2709 }
2710
2711 CLR(bp->nb_flags, NB_WRITEINPROG);
2712
2713 /*
2714 * For an unstable write, the buffer is still treated as dirty until
2715 * a commit (or stable (re)write) is performed. Buffers needing only
2716 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2717 *
2718 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2719 * because that would cause the buffer to be dropped. The buffer is
2720 * still valid and simply needs to be written again.
2721 */
2722 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2723 CLR(bp->nb_flags, NB_INVAL);
2724 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2725 SET(bp->nb_flags, NB_DELWRI);
2726 lck_mtx_lock(nfs_buf_mutex);
2727 nfs_nbdwrite++;
2728 NFSBUFCNTCHK();
2729 lck_mtx_unlock(nfs_buf_mutex);
2730 }
2731 /*
2732 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2733 * clean list, we have to reassign it back to the dirty one. Ugh.
2734 */
2735 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2736 /* move to dirty list */
2737 lck_mtx_lock(nfs_buf_mutex);
2738 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2739 LIST_REMOVE(bp, nb_vnbufs);
2740 }
2741 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2742 lck_mtx_unlock(nfs_buf_mutex);
2743 }
2744 } else {
2745 /* either there's an error or we don't need to commit */
2746 if (error) {
2747 /*
2748 * There was a write error and we need to invalidate
2749 * attrs and flush buffers in order to sync up with the
2750 * server. (if this write was extending the file, we
2751 * may no longer know the correct size)
2752 *
2753 * But we can't call vinvalbuf while holding this
2754 * buffer busy. Set a flag to do it after releasing
2755 * the buffer.
2756 */
2757 nfs_node_lock_force(np);
2758 np->n_error = error;
2759 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2760 NATTRINVALIDATE(np);
2761 nfs_node_unlock(np);
2762 }
2763 /* clear the dirty range */
2764 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2765 }
2766
2767 if (!error && bp->nb_dirty) {
2768 nfs_buf_write_dirty_pages(bp, thd, cred);
2769 }
2770 nfs_buf_iodone(bp);
2771 }
2772
2773 /*
2774 * write out any pages marked dirty in a buffer
2775 *
2776 * We use unstable writes and follow up with a commit.
2777 * If we catch the write verifier changing, we restart and
2778 * redo the writes FILESYNC.
2779 */
2780 int
2781 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2782 {
2783 nfsnode_t np = bp->nb_np;
2784 struct nfsmount *nmp = NFSTONMP(np);
2785 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2786 uint32_t dirty = bp->nb_dirty;
2787 uint64_t wverf;
2788 uio_t auio;
2789 char uio_buf[UIO_SIZEOF(1)];
2790
2791 if (!bp->nb_dirty) {
2792 return 0;
2793 }
2794
2795 /* there are pages marked dirty that need to be written out */
2796 OSAddAtomic64(1, &nfsstats.write_bios);
2797 NFS_BUF_MAP(bp);
2798 SET(bp->nb_flags, NB_WRITEINPROG);
2799 npages = bp->nb_bufsize / PAGE_SIZE;
2800 iomode = NFS_WRITE_UNSTABLE;
2801
2802 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2803 &uio_buf, sizeof(uio_buf));
2804
2805 again:
2806 dirty = bp->nb_dirty;
2807 wverf = bp->nb_verf;
2808 commit = NFS_WRITE_FILESYNC;
2809 for (pg = 0; pg < npages; pg++) {
2810 if (!NBPGDIRTY(bp, pg)) {
2811 continue;
2812 }
2813 count = 1;
2814 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
2815 count++;
2816 }
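/*
 * Example: with nb_dirty = 0b0110 this scan finds pg = 1 and
 * count = 2, so pages 1 and 2 go out in a single write RPC rather
 * than one RPC per page.
 */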
2817 /* write count pages starting with page pg */
2818 off = pg * PAGE_SIZE;
2819 len = count * PAGE_SIZE;
2820 /* clip writes to EOF */
2821 if (NBOFF(bp) + off + len > (off_t) np->n_size) {
2822 len -= (NBOFF(bp) + off + len) - np->n_size;
2823 }
2824 if (len > 0) {
2825 iomode2 = iomode;
2826 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2827 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2828 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2829 if (error) {
2830 break;
2831 }
2832 if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
2833 commit = iomode2;
2834 }
2835 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2836 /* verifier changed, redo all the writes filesync */
2837 iomode = NFS_WRITE_FILESYNC;
2838 goto again;
2839 }
2840 }
2841 /* clear dirty bits */
2842 while (count--) {
2843 dirty &= ~(1 << pg);
2844 if (count) { /* leave pg on last page */
2845 pg++;
2846 }
2847 }
2848 }
2849 CLR(bp->nb_flags, NB_WRITEINPROG);
2850
2851 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2852 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2853 if (error == NFSERR_STALEWRITEVERF) {
2854 /* verifier changed, so we need to restart all the writes */
2855 iomode = NFS_WRITE_FILESYNC;
2856 goto again;
2857 }
2858 }
2859 if (!error) {
2860 bp->nb_dirty = dirty;
2861 } else {
2862 SET(bp->nb_flags, NB_ERROR);
2863 bp->nb_error = error;
2864 }
2865 return error;
2866 }
2867
2868 /*
2869 * initiate the NFS WRITE RPC(s) for a buffer
2870 */
2871 int
2872 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2873 {
2874 struct nfsmount *nmp;
2875 nfsnode_t np = bp->nb_np;
2876 int error = 0, nfsvers, async;
2877 int offset, nrpcs;
2878 uint32_t nmwsize, length, len;
2879 struct nfsreq *req;
2880 struct nfsreq_cbinfo cb;
2881 uio_t auio;
2882 char uio_buf[UIO_SIZEOF(1)];
2883
2884 nmp = NFSTONMP(np);
2885 if (nfs_mount_gone(nmp)) {
2886 bp->nb_error = error = ENXIO;
2887 SET(bp->nb_flags, NB_ERROR);
2888 nfs_buf_iodone(bp);
2889 return error;
2890 }
2891 nfsvers = nmp->nm_vers;
2892 nmwsize = nmp->nm_wsize;
2893
2894 offset = bp->nb_offio;
2895 length = bp->nb_endio - bp->nb_offio;
2896
2897 /* Note: Can only do async I/O if nfsiods are configured. */
2898 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2899 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2900 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2901 cb.rcb_bp = bp;
2902
2903 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2904 bp->nb_error = error = EFBIG;
2905 SET(bp->nb_flags, NB_ERROR);
2906 nfs_buf_iodone(bp);
2907 return error;
2908 }
2909
2910 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2911 UIO_WRITE, &uio_buf, sizeof(uio_buf));
2912 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2913
2914 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
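/*
 * nrpcs is a ceiling division: e.g. a 64KB dirty range with a 32KB
 * write size yields 2 RPCs, as would a 33KB range.
 */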
2915 if (async && (nrpcs > 1)) {
2916 SET(bp->nb_flags, NB_MULTASYNCRPC);
2917 } else {
2918 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2919 }
2920
2921 while (length > 0) {
2922 if (ISSET(bp->nb_flags, NB_ERROR)) {
2923 error = bp->nb_error;
2924 break;
2925 }
2926 len = (length > nmwsize) ? nmwsize : length;
2927 cb.rcb_args[0] = offset;
2928 cb.rcb_args[1] = len;
2929 #if CONFIG_NFS4
2930 if (nmp->nm_vers >= NFS_VER4) {
2931 cb.rcb_args[2] = nmp->nm_stategenid;
2932 }
2933 #endif
2934 if (async && ((error = nfs_async_write_start(nmp)))) {
2935 break;
2936 }
2937 req = NULL;
2938 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2939 iomode, &cb, &req);
2940 if (error) {
2941 if (async) {
2942 nfs_async_write_done(nmp);
2943 }
2944 break;
2945 }
2946 offset += len;
2947 length -= len;
2948 if (async) {
2949 continue;
2950 }
2951 nfs_buf_write_rpc_finish(req);
2952 }
2953
2954 if (length > 0) {
2955 /*
2956 * Something bad happened while trying to send the RPCs.
2957 * Wait for any outstanding requests to complete.
2958 */
2959 bp->nb_error = error;
2960 SET(bp->nb_flags, NB_ERROR);
2961 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2962 nrpcs = (length + nmwsize - 1) / nmwsize;
2963 lck_mtx_lock(nfs_buf_mutex);
2964 bp->nb_rpcs -= nrpcs;
2965 if (bp->nb_rpcs == 0) {
2966 /* No RPCs left, so the buffer's done */
2967 lck_mtx_unlock(nfs_buf_mutex);
2968 nfs_buf_write_finish(bp, thd, cred);
2969 } else {
2970 /* wait for the last RPC to mark it done */
2971 while (bp->nb_rpcs > 0) {
2972 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2973 "nfs_buf_write_rpc_cancel", NULL);
2974 }
2975 lck_mtx_unlock(nfs_buf_mutex);
2976 }
2977 } else {
2978 nfs_buf_write_finish(bp, thd, cred);
2979 }
2980 /* It may have just been an interrupt... that's OK */
2981 if (!ISSET(bp->nb_flags, NB_ERROR)) {
2982 error = 0;
2983 }
2984 }
2985
2986 return error;
2987 }
2988
2989 /*
2990 * finish up an NFS WRITE RPC on a buffer
2991 */
2992 void
2993 nfs_buf_write_rpc_finish(struct nfsreq *req)
2994 {
2995 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2996 int committed = NFS_WRITE_FILESYNC;
2997 uint64_t wverf = 0;
2998 size_t rlen;
2999 void *wakeme = NULL;
3000 struct nfsreq_cbinfo cb;
3001 struct nfsreq *wreq = NULL;
3002 struct nfsbuf *bp;
3003 struct nfsmount *nmp;
3004 nfsnode_t np;
3005 thread_t thd;
3006 kauth_cred_t cred;
3007 uio_t auio;
3008 char uio_buf[UIO_SIZEOF(1)];
3009
3010 finish:
3011 np = req->r_np;
3012 thd = req->r_thread;
3013 cred = req->r_cred;
3014 if (IS_VALID_CRED(cred)) {
3015 kauth_cred_ref(cred);
3016 }
3017 cb = req->r_callback;
3018 bp = cb.rcb_bp;
3019 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
3020 nfs_request_ref(req, 0);
3021 }
3022
3023 nmp = NFSTONMP(np);
3024 if (nfs_mount_gone(nmp)) {
3025 SET(bp->nb_flags, NB_ERROR);
3026 bp->nb_error = error = ENXIO;
3027 }
3028 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
3029 /* just drop it */
3030 nfs_request_async_cancel(req);
3031 goto out;
3032 }
3033 nfsvers = nmp->nm_vers;
3034
3035 offset = cb.rcb_args[0];
3036 rlen = length = cb.rcb_args[1];
3037
3038 /* finish the RPC */
3039 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
3040 if ((error == EINPROGRESS) && cb.rcb_func) {
3041 /* async request restarted */
3042 if (cb.rcb_func) {
3043 nfs_request_rele(req);
3044 }
3045 if (IS_VALID_CRED(cred)) {
3046 kauth_cred_unref(&cred);
3047 }
3048 return;
3049 }
3050 #if CONFIG_NFS4
3051 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
3052 lck_mtx_lock(&nmp->nm_lock);
3053 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
3054 NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
3055 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
3056 nfs_need_recover(nmp, error);
3057 }
3058 lck_mtx_unlock(&nmp->nm_lock);
3059 if (np->n_flag & NREVOKE) {
3060 error = EIO;
3061 } else {
3062 if (error == NFSERR_GRACE) {
3063 if (cb.rcb_func) {
3064 /*
3065 * For an async I/O request, handle a grace delay just like
3066 * jukebox errors. Set the resend time and queue it up.
3067 */
3068 struct timeval now;
3069 if (req->r_nmrep.nmc_mhead) {
3070 mbuf_freem(req->r_nmrep.nmc_mhead);
3071 req->r_nmrep.nmc_mhead = NULL;
3072 }
3073 req->r_error = 0;
3074 microuptime(&now);
3075 lck_mtx_lock(&req->r_mtx);
3076 req->r_resendtime = now.tv_sec + 2;
3077 req->r_xid = 0; // get a new XID
3078 req->r_flags |= R_RESTART;
3079 req->r_start = 0;
3080 nfs_asyncio_resend(req);
3081 lck_mtx_unlock(&req->r_mtx);
3082 if (IS_VALID_CRED(cred)) {
3083 kauth_cred_unref(&cred);
3084 }
3085 /* Note: the nfsreq reference taken above will be dropped when the resend finishes */
3086 return;
3087 }
3088 /* otherwise, just pause a couple seconds and retry */
3089 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
3090 }
3091 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
3092 rlen = 0;
3093 goto writeagain;
3094 }
3095 }
3096 }
3097 #endif
3098 if (error) {
3099 SET(bp->nb_flags, NB_ERROR);
3100 bp->nb_error = error;
3101 }
3102 if (error || (nfsvers == NFS_VER2)) {
3103 goto out;
3104 }
3105 if (rlen <= 0) {
3106 SET(bp->nb_flags, NB_ERROR);
3107 bp->nb_error = error = EIO;
3108 goto out;
3109 }
3110
3111 /* save lowest commit level returned */
3112 if (committed < bp->nb_commitlevel) {
3113 bp->nb_commitlevel = committed;
3114 }
3115
3116 /* check the write verifier */
3117 if (!bp->nb_verf) {
3118 bp->nb_verf = wverf;
3119 } else if (bp->nb_verf != wverf) {
3120 /* verifier changed, so buffer will need to be rewritten */
3121 bp->nb_flags |= NB_STALEWVERF;
3122 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
3123 bp->nb_verf = wverf;
3124 }
3125
3126 /*
3127 * check for a short write
3128 *
3129 * If the server didn't write all the data, then we
3130 * need to issue another write for the rest of it.
3131 * (Don't bother if the buffer hit an error or stale wverf.)
3132 */
3133 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
3134 #if CONFIG_NFS4
3135 writeagain:
3136 #endif
3137 offset += rlen;
3138 length -= rlen;
3139
3140 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
3141 UIO_WRITE, &uio_buf, sizeof(uio_buf));
3142 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
3143
3144 cb.rcb_args[0] = offset;
3145 cb.rcb_args[1] = length;
3146 #if CONFIG_NFS4
3147 if (nmp->nm_vers >= NFS_VER4) {
3148 cb.rcb_args[2] = nmp->nm_stategenid;
3149 }
3150 #endif
3151 // XXX iomode should really match the original request
3152 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
3153 NFS_WRITE_FILESYNC, &cb, &wreq);
3154 if (!error) {
3155 if (IS_VALID_CRED(cred)) {
3156 kauth_cred_unref(&cred);
3157 }
3158 if (!cb.rcb_func) {
3159 /* if !async we'll need to wait for this RPC to finish */
3160 req = wreq;
3161 wreq = NULL;
3162 goto finish;
3163 }
3164 nfs_request_rele(req);
3165 /*
3166 * We're done here.
3167 * Outstanding RPC count is unchanged.
3168 * Callback will be called when RPC is done.
3169 */
3170 return;
3171 }
3172 SET(bp->nb_flags, NB_ERROR);
3173 bp->nb_error = error;
3174 }
3175
3176 out:
3177 if (cb.rcb_func) {
3178 nfs_async_write_done(nmp);
3179 nfs_request_rele(req);
3180 }
3181 /*
3182 * Decrement outstanding RPC count on buffer
3183 * and call nfs_buf_write_finish on last RPC.
3184 *
3185 * (Note: when there are multiple async RPCs issued for a
3186 * buffer we need nfs_buf_mutex to avoid problems when
3187 * aborting a partially-initiated set of RPCs)
3188 */
3189 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3190 if (multasyncrpc) {
3191 lck_mtx_lock(nfs_buf_mutex);
3192 }
3193
3194 bp->nb_rpcs--;
3195 finished = (bp->nb_rpcs == 0);
3196
3197 if (multasyncrpc) {
3198 lck_mtx_unlock(nfs_buf_mutex);
3199 }
3200
3201 if (finished) {
3202 if (multasyncrpc) {
3203 wakeme = &bp->nb_rpcs;
3204 }
3205 nfs_buf_write_finish(bp, thd, cred);
3206 if (wakeme) {
3207 wakeup(wakeme);
3208 }
3209 }
3210
3211 if (IS_VALID_CRED(cred)) {
3212 kauth_cred_unref(&cred);
3213 }
3214 }
3215
3216 /*
3217 * Send commit(s) for the given node's "needcommit" buffers
3218 */
3219 int
3220 nfs_flushcommits(nfsnode_t np, int nowait)
3221 {
3222 struct nfsmount *nmp;
3223 struct nfsbuf *bp, *prevlbp, *lbp;
3224 struct nfsbuflists blist, commitlist;
3225 int error = 0, retv, wcred_set, flags, dirty;
3226 u_quad_t off, endoff, toff;
3227 uint64_t wverf;
3228 u_int32_t count;
3229 kauth_cred_t wcred = NULL;
3230
3231 FSDBG_TOP(557, np, 0, 0, 0);
3232
3233 /*
3234 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3235 * server, but has not been committed to stable storage on the server
3236 * yet. The byte range is worked out for as many nfsbufs as we can handle
3237 * and the commit RPC is done.
3238 */
3239 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3240 error = nfs_node_lock(np);
3241 if (error) {
3242 goto done;
3243 }
3244 np->n_flag |= NMODIFIED;
3245 nfs_node_unlock(np);
3246 }
3247
3248 off = (u_quad_t)-1;
3249 endoff = 0;
3250 wcred_set = 0;
3251 LIST_INIT(&commitlist);
3252
3253 nmp = NFSTONMP(np);
3254 if (nfs_mount_gone(nmp)) {
3255 error = ENXIO;
3256 goto done;
3257 }
3258 if (nmp->nm_vers == NFS_VER2) {
3259 error = EINVAL;
3260 goto done;
3261 }
3262
3263 flags = NBI_DIRTY;
3264 if (nowait) {
3265 flags |= NBI_NOWAIT;
3266 }
3267 lck_mtx_lock(nfs_buf_mutex);
3268 wverf = nmp->nm_verf;
3269 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3270 while ((bp = LIST_FIRST(&blist))) {
3271 LIST_REMOVE(bp, nb_vnbufs);
3272 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3273 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3274 if (error) {
3275 continue;
3276 }
3277 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3278 nfs_buf_check_write_verifier(np, bp);
3279 }
3280 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3281 (bp->nb_verf != wverf)) {
3282 nfs_buf_drop(bp);
3283 continue;
3284 }
3285 nfs_buf_remfree(bp);
3286
3287 /* buffer UPLs will be grabbed *in order* below */
3288
3289 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3290 FSDBG(557, bp->nb_validoff, bp->nb_validend,
3291 bp->nb_dirtyoff, bp->nb_dirtyend);
3292
3293 /*
3294 * Work out if all buffers are using the same cred
3295 * so we can deal with them all with one commit.
3296 *
3297 * Note: creds in bp's must be obtained by kauth_cred_ref
3298 * on the same original cred in order for them to be equal.
3299 */
3300 if (wcred_set == 0) {
3301 wcred = bp->nb_wcred;
3302 if (!IS_VALID_CRED(wcred)) {
3303 panic("nfs: needcommit w/out wcred");
3304 }
3305 wcred_set = 1;
3306 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3307 wcred_set = -1;
3308 }
3309 SET(bp->nb_flags, NB_WRITEINPROG);
3310
3311 /*
3312 * Add this buffer to the list of buffers we are committing.
3313 * Buffers are inserted into the list in ascending order so that
3314 * we can take the UPLs in order after the list is complete.
3315 */
3316 prevlbp = NULL;
3317 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3318 if (bp->nb_lblkno < lbp->nb_lblkno) {
3319 break;
3320 }
3321 prevlbp = lbp;
3322 }
3323 LIST_REMOVE(bp, nb_vnbufs);
3324 if (prevlbp) {
3325 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3326 } else {
3327 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3328 }
3329
3330 /* update commit range start, end */
3331 toff = NBOFF(bp) + bp->nb_dirtyoff;
3332 if (toff < off) {
3333 off = toff;
3334 }
3335 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3336 if (toff > endoff) {
3337 endoff = toff;
3338 }
3339 }
3340 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3341 }
3342 lck_mtx_unlock(nfs_buf_mutex);
3343
3344 if (LIST_EMPTY(&commitlist)) {
3345 error = ENOBUFS;
3346 goto done;
3347 }
3348
3349 /*
3350 * We need a UPL to prevent others from accessing the buffers during
3351 * our commit RPC(s).
3352 *
3353 * We used to also check for dirty pages here; if there were any we'd
3354 * abort the commit and force the entire buffer to be written again.
3355 * Instead of doing that, we just go ahead and commit the dirty range,
3356 * and then leave the buffer around with dirty pages that will be
3357 * written out later.
3358 */
3359 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3360 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3361 retv = nfs_buf_upl_setup(bp);
3362 if (retv) {
3363 /* Unable to create the UPL, the VM object probably no longer exists. */
3364 printf("nfs_flushcommits: upl create failed %d\n", retv);
3365 bp->nb_valid = bp->nb_dirty = 0;
3366 }
3367 }
3368 nfs_buf_upl_check(bp);
3369 }
3370
3371 /*
3372 * Commit data on the server, as required.
3373 * If all bufs are using the same wcred, then use that with
3374 * one call for all of them, otherwise commit each one
3375 * separately.
3376 */
3377 if (wcred_set == 1) {
3378 /*
3379 * Note, it's possible the commit range could be >2^32-1.
3380 * If it is, we'll send one commit that covers the whole file.
3381 */
3382 if ((endoff - off) > 0xffffffff) {
3383 count = 0;
3384 } else {
3385 count = (endoff - off);
3386 }
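/*
 * In the COMMIT RPC, a count of zero means "from offset to end of
 * file" (RFC 1813), so a range wider than 32 bits is covered by a
 * single commit from 'off' onward.
 */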
3387 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
3388 } else {
3389 retv = 0;
3390 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3391 toff = NBOFF(bp) + bp->nb_dirtyoff;
3392 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3393 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
3394 if (retv) {
3395 break;
3396 }
3397 }
3398 }
3399
3400 /*
3401 * Now, either mark the blocks I/O done or mark the
3402 * blocks dirty, depending on whether the commit
3403 * succeeded.
3404 */
3405 while ((bp = LIST_FIRST(&commitlist))) {
3406 LIST_REMOVE(bp, nb_vnbufs);
3407 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3408 nfs_node_lock_force(np);
3409 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3410 np->n_needcommitcnt--;
3411 CHECK_NEEDCOMMITCNT(np);
3412 nfs_node_unlock(np);
3413
3414 if (retv) {
3415 /* move back to dirty list */
3416 lck_mtx_lock(nfs_buf_mutex);
3417 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3418 lck_mtx_unlock(nfs_buf_mutex);
3419 nfs_buf_release(bp, 1);
3420 continue;
3421 }
3422
3423 nfs_node_lock_force(np);
3424 np->n_numoutput++;
3425 nfs_node_unlock(np);
3426 vnode_startwrite(NFSTOV(np));
3427 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3428 lck_mtx_lock(nfs_buf_mutex);
3429 nfs_nbdwrite--;
3430 NFSBUFCNTCHK();
3431 lck_mtx_unlock(nfs_buf_mutex);
3432 wakeup(&nfs_nbdwrite);
3433 }
3434 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
3435 /* if block still has dirty pages, we don't want it to */
3436 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3437 if (!(dirty = bp->nb_dirty)) {
3438 SET(bp->nb_flags, NB_ASYNC);
3439 } else {
3440 CLR(bp->nb_flags, NB_ASYNC);
3441 }
3442
3443 /* move to clean list */
3444 lck_mtx_lock(nfs_buf_mutex);
3445 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3446 lck_mtx_unlock(nfs_buf_mutex);
3447
3448 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3449
3450 nfs_buf_iodone(bp);
3451 if (dirty) {
3452 /* throw it back in as a delayed write buffer */
3453 CLR(bp->nb_flags, NB_DONE);
3454 nfs_buf_write_delayed(bp);
3455 }
3456 }
3457
3458 done:
3459 FSDBG_BOT(557, np, 0, 0, error);
3460 return error;
3461 }
3462
3463 /*
3464 * Flush all the blocks associated with a vnode.
3465 * Walk through the buffer pool and push any dirty pages
3466 * associated with the vnode.
3467 */
3468 int
3469 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3470 {
3471 struct nfsbuf *bp;
3472 struct nfsbuflists blist;
3473 struct nfsmount *nmp = NFSTONMP(np);
3474 int error = 0, error2, slptimeo = 0, slpflag = 0;
3475 int nfsvers, flags, passone = 1;
3476
3477 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3478
3479 if (nfs_mount_gone(nmp)) {
3480 error = ENXIO;
3481 goto out;
3482 }
3483 nfsvers = nmp->nm_vers;
3484 if (NMFLAG(nmp, INTR)) {
3485 slpflag = PCATCH;
3486 }
3487
3488 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3489 nfs_node_lock_force(np);
3490 np->n_flag |= NMODIFIED;
3491 nfs_node_unlock(np);
3492 }
3493
3494 lck_mtx_lock(nfs_buf_mutex);
3495 while (np->n_bflag & NBFLUSHINPROG) {
3496 np->n_bflag |= NBFLUSHWANT;
3497 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3498 if ((error && (error != EWOULDBLOCK)) ||
3499 ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
3500 lck_mtx_unlock(nfs_buf_mutex);
3501 goto out;
3502 }
3503 }
3504 np->n_bflag |= NBFLUSHINPROG;
3505
3506 /*
3507 * On the first pass, start async/unstable writes on all
3508 * delayed write buffers. Then wait for all writes to complete
3509 * and call nfs_flushcommits() to commit any uncommitted buffers.
3510 * On all subsequent passes, start STABLE writes on any remaining
3511 * dirty buffers. Then wait for all writes to complete.
3512 */
3513 again:
3514 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3515 if (!NFSTONMP(np)) {
3516 lck_mtx_unlock(nfs_buf_mutex);
3517 error = ENXIO;
3518 goto done;
3519 }
3520
3521 /* Start/do any write(s) that are required. */
3522 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3523 while ((bp = LIST_FIRST(&blist))) {
3524 LIST_REMOVE(bp, nb_vnbufs);
3525 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3526 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
3527 if (flags != NBAC_NOWAIT) {
3528 nfs_buf_refget(bp);
3529 }
3530 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3531 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3532 if (error == EBUSY) {
3533 break;
3534 }
3535 if (error) {
3536 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3537 if (error2) {
3538 if (flags != NBAC_NOWAIT) {
3539 nfs_buf_refrele(bp);
3540 }
3541 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3542 lck_mtx_unlock(nfs_buf_mutex);
3543 error = error2;
3544 goto done;
3545 }
3546 if (slpflag == PCATCH) {
3547 slpflag = 0;
3548 slptimeo = 2 * hz;
3549 }
3550 }
3551 }
3552 if (flags != NBAC_NOWAIT) {
3553 nfs_buf_refrele(bp);
3554 }
3555 if (error == EBUSY) {
3556 continue;
3557 }
3558 if (!bp->nb_np) {
3559 /* buffer is no longer valid */
3560 nfs_buf_drop(bp);
3561 continue;
3562 }
3563 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3564 nfs_buf_check_write_verifier(np, bp);
3565 }
3566 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3567 /* buffer is no longer dirty */
3568 nfs_buf_drop(bp);
3569 continue;
3570 }
3571 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3572 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3573 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3574 nfs_buf_drop(bp);
3575 continue;
3576 }
3577 nfs_buf_remfree(bp);
3578 lck_mtx_unlock(nfs_buf_mutex);
3579 if (ISSET(bp->nb_flags, NB_ERROR)) {
3580 nfs_node_lock_force(np);
3581 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3582 np->n_flag |= NWRITEERR;
3583 nfs_node_unlock(np);
3584 nfs_buf_release(bp, 1);
3585 lck_mtx_lock(nfs_buf_mutex);
3586 continue;
3587 }
3588 SET(bp->nb_flags, NB_ASYNC);
3589 if (!passone) {
3590 /* NB_STABLE forces this to be written FILESYNC */
3591 SET(bp->nb_flags, NB_STABLE);
3592 }
3593 nfs_buf_write(bp);
3594 lck_mtx_lock(nfs_buf_mutex);
3595 }
3596 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3597 }
3598 lck_mtx_unlock(nfs_buf_mutex);
3599
3600 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3601 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3602 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3603 if (error2) {
3604 error = error2;
3605 goto done;
3606 }
3607 if (slpflag == PCATCH) {
3608 slpflag = 0;
3609 slptimeo = 2 * hz;
3610 }
3611 }
3612 }
3613
3614 if (nfsvers != NFS_VER2) {
3615 /* loop while it looks like there are still buffers to be */
3616 /* committed and nfs_flushcommits() seems to be handling them. */
3617 while (np->n_needcommitcnt) {
3618 if (nfs_flushcommits(np, 0)) {
3619 break;
3620 }
3621 }
3622 }
3623
3624 if (passone) {
3625 passone = 0;
3626 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3627 nfs_node_lock_force(np);
3628 np->n_flag |= NMODIFIED;
3629 nfs_node_unlock(np);
3630 }
3631 lck_mtx_lock(nfs_buf_mutex);
3632 goto again;
3633 }
3634
3635 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3636 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3637 nfs_node_lock_force(np);
3638 np->n_flag |= NMODIFIED;
3639 nfs_node_unlock(np);
3640 }
3641 lck_mtx_lock(nfs_buf_mutex);
3642 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3643 goto again;
3644 }
3645 lck_mtx_unlock(nfs_buf_mutex);
3646 nfs_node_lock_force(np);
3647 /*
3648 * OK, it looks like there are no dirty blocks. If we have no
3649 * writes in flight and no one in the write code, we can clear
3650 * the modified flag. In order to make sure we see the latest
3651 * attributes and size, we also invalidate the attributes and
3652 * advance the attribute cache XID to guarantee that attributes
3653 * newer than our clearing of NMODIFIED will get loaded next.
3654 * (If we don't do this, it's possible for the flush's final
3655 * write/commit (xid1) to be executed in parallel with a subsequent
3656 * getattr request (xid2). The getattr could return attributes
3657 * from *before* the write/commit completed but the stale attributes
3658 * would be preferred because of the xid ordering.)
3659 */
3660 if (!np->n_wrbusy && !np->n_numoutput) {
3661 np->n_flag &= ~NMODIFIED;
3662 NATTRINVALIDATE(np);
3663 nfs_get_xid(&np->n_xid);
3664 }
3665 } else {
3666 nfs_node_lock_force(np);
3667 }
3668
3669 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3670 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3671 error = np->n_error;
3672 np->n_flag &= ~NWRITEERR;
3673 }
3674 nfs_node_unlock(np);
3675 done:
3676 lck_mtx_lock(nfs_buf_mutex);
3677 flags = np->n_bflag;
3678 np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
3679 lck_mtx_unlock(nfs_buf_mutex);
3680 if (flags & NBFLUSHWANT) {
3681 wakeup(&np->n_bflag);
3682 }
3683 out:
3684 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3685 return error;
3686 }
3687
3688 /*
3689 * Flush out and invalidate all buffers associated with a vnode.
3690 * Called with the underlying object locked.
3691 */
3692 int
3693 nfs_vinvalbuf_internal(
3694 nfsnode_t np,
3695 int flags,
3696 thread_t thd,
3697 kauth_cred_t cred,
3698 int slpflag,
3699 int slptimeo)
3700 {
3701 struct nfsbuf *bp;
3702 struct nfsbuflists blist;
3703 int list, error = 0;
3704
3705 if (flags & V_SAVE) {
3706 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
3707 return error;
3708 }
3709 }
3710
3711 lck_mtx_lock(nfs_buf_mutex);
3712 for (;;) {
3713 list = NBI_CLEAN;
3714 if (nfs_buf_iterprepare(np, &blist, list)) {
3715 list = NBI_DIRTY;
3716 if (nfs_buf_iterprepare(np, &blist, list)) {
3717 break;
3718 }
3719 }
3720 while ((bp = LIST_FIRST(&blist))) {
3721 LIST_REMOVE(bp, nb_vnbufs);
3722 if (list == NBI_CLEAN) {
3723 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3724 } else {
3725 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3726 }
3727 nfs_buf_refget(bp);
3728 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3729 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3730 if (error != EAGAIN) {
3731 FSDBG(554, np, bp, -1, error);
3732 nfs_buf_refrele(bp);
3733 nfs_buf_itercomplete(np, &blist, list);
3734 lck_mtx_unlock(nfs_buf_mutex);
3735 return error;
3736 }
3737 }
3738 nfs_buf_refrele(bp);
3739 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3740 lck_mtx_unlock(nfs_buf_mutex);
3741 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3742 (NBOFF(bp) < (off_t)np->n_size)) {
3743 /* extra paranoia: make sure we're not */
3744 /* somehow leaving any dirty data around */
3745 int mustwrite = 0;
3746 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3747 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3748 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3749 error = nfs_buf_upl_setup(bp);
3750 if (error == EINVAL) {
3751 /* vm object must no longer exist */
3752 /* hopefully we don't need to do */
3753 /* anything for this buffer */
3754 } else if (error) {
3755 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3756 }
3757 bp->nb_valid = bp->nb_dirty = 0;
3758 }
3759 nfs_buf_upl_check(bp);
3760 /* check for any dirty data before the EOF */
3761 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3762 /* clip dirty range to EOF */
3763 if (bp->nb_dirtyend > end) {
3764 bp->nb_dirtyend = end;
3765 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
3766 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3767 }
3768 }
3769 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3770 mustwrite++;
3771 }
3772 }
3773 bp->nb_dirty &= (1 << (round_page_32(end) / PAGE_SIZE)) - 1;
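/*
 * Example of the mask above (assuming 4KB pages): end = 6KB gives
 * round_page_32(end) / PAGE_SIZE = 2, so (1 << 2) - 1 = 0x3 keeps the
 * dirty bits only for pages 0 and 1, the pages overlapping [0, end).
 */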
3774 if (bp->nb_dirty) {
3775 mustwrite++;
3776 }
3777 /* also make sure we'll have a credential to do the write */
3778 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3779 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3780 mustwrite = 0;
3781 }
3782 if (mustwrite) {
3783 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3784 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3785 panic("nfs_vinvalbuf: dirty buffer without upl");
3786 }
3787 /* gotta write out dirty data before invalidating */
3788 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3789 /* (NB_NOCACHE indicates buffer should be discarded) */
3790 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3791 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3792 if (!IS_VALID_CRED(bp->nb_wcred)) {
3793 kauth_cred_ref(cred);
3794 bp->nb_wcred = cred;
3795 }
3796 error = nfs_buf_write(bp);
3797 // Note: bp has been released
3798 if (error) {
3799 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3800 nfs_node_lock_force(np);
3801 if ((error != EINTR) && (error != ERESTART)) {
3802 np->n_error = error;
3803 np->n_flag |= NWRITEERR;
3804 }
3805 /*
3806 * There was a write error and we need to
3807 * invalidate attrs to sync with server.
3808 * (if this write was extending the file,
3809 * we may no longer know the correct size)
3810 */
3811 NATTRINVALIDATE(np);
3812 nfs_node_unlock(np);
3813 if ((error == EINTR) || (error == ERESTART)) {
3814 /*
3815 * Abort on EINTR. If we don't, we could
3816 * be stuck in this loop forever because
3817 * the buffer will continue to stay dirty.
3818 */
3819 lck_mtx_lock(nfs_buf_mutex);
3820 nfs_buf_itercomplete(np, &blist, list);
3821 lck_mtx_unlock(nfs_buf_mutex);
3822 return error;
3823 }
3824 error = 0;
3825 }
3826 lck_mtx_lock(nfs_buf_mutex);
3827 continue;
3828 }
3829 }
3830 SET(bp->nb_flags, NB_INVAL);
3831 // hold off on FREEUPs until we're done here
3832 nfs_buf_release(bp, 0);
3833 lck_mtx_lock(nfs_buf_mutex);
3834 }
3835 nfs_buf_itercomplete(np, &blist, list);
3836 }
3837 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
3838 panic("nfs_vinvalbuf: flush/inval failed");
3839 }
3840 lck_mtx_unlock(nfs_buf_mutex);
3841 nfs_node_lock_force(np);
3842 if (!(flags & V_SAVE)) {
3843 np->n_flag &= ~NMODIFIED;
3844 }
3845 if (vnode_vtype(NFSTOV(np)) == VREG) {
3846 np->n_lastrahead = -1;
3847 }
3848 nfs_node_unlock(np);
3849 NFS_BUF_FREEUP();
3850 return 0;
3851 }
3852
3853
3854 /*
3855 * Flush and invalidate all dirty buffers. If another process is already
3856 * doing the flush, just wait for completion.
3857 */
3858 int
3859 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3860 {
3861 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3862 }
3863
3864 int
3865 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3866 {
3867 nfsnode_t np = VTONFS(vp);
3868 struct nfsmount *nmp = VTONMP(vp);
3869 int error, slpflag, slptimeo, nflags, retry = 0;
3870 int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
3871 struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
3872 off_t size;
3873
3874 FSDBG_TOP(554, np, flags, intrflg, 0);
3875
3876 /*
3877 * If the mount is gone, there's no sense trying to write anything
3878 * and hanging while trying to do I/O.
3879 */
3880 if (nfs_mount_gone(nmp)) {
3881 flags &= ~V_SAVE;
3882 ubcflags &= ~UBC_PUSHALL;
3883 }
3884
3885 if (nmp && !NMFLAG(nmp, INTR)) {
3886 intrflg = 0;
3887 }
3888 if (intrflg) {
3889 slpflag = PCATCH;
3890 slptimeo = 2 * hz;
3891 } else {
3892 slpflag = 0;
3893 slptimeo = 0;
3894 }
3895
3896 /* First wait for any other process doing a flush to complete. */
3897 lck_mtx_lock(nfs_buf_mutex);
3898 while (np->n_bflag & NBINVALINPROG) {
3899 np->n_bflag |= NBINVALWANT;
3900 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
3901 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3902 lck_mtx_unlock(nfs_buf_mutex);
3903 return error;
3904 }
3905 if (np->n_bflag & NBINVALINPROG) {
3906 slpflag = 0;
3907 }
3908 }
3909 np->n_bflag |= NBINVALINPROG;
3910 lck_mtx_unlock(nfs_buf_mutex);
3911
3912 /* Now, flush as required. */
3913 again:
3914 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3915 while (error) {
3916 FSDBG(554, np, 0, 0, error);
3917 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3918 goto done;
3919 }
3920 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3921 }
3922
3923 /* get the pages out of vm also */
3924 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
3925 if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
3926 if (error == EINVAL) {
3927 panic("nfs_vinvalbuf(): ubc_msync failed! error %d", error);
3928 }
3929 if (retry++ < 10) { /* retry invalidating a few times */
3930 if (retry > 1 || error == ENXIO) {
3931 ubcflags &= ~UBC_PUSHALL;
3932 }
3933 goto again;
3934 }
3935 /* give up */
3936 printf("nfs_vinvalbuf(): ubc_msync failed! error %d\n", error);
3937 }
3938 }
3939 done:
3940 lck_mtx_lock(nfs_buf_mutex);
3941 nflags = np->n_bflag;
3942 np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
3943 lck_mtx_unlock(nfs_buf_mutex);
3944 if (nflags & NBINVALWANT) {
3945 wakeup(&np->n_bflag);
3946 }
3947
3948 FSDBG_BOT(554, np, flags, intrflg, error);
3949 return error;
3950 }
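/*
 * The NBINVALINPROG / NBINVALWANT pair above forms a single-owner gate.
 * A minimal sketch of the same pattern in isolation (fragment only; the
 * signal and timeout handling of the real code is omitted):
 */
#if 0 /* illustrative only */
	lck_mtx_lock(nfs_buf_mutex);
	while (np->n_bflag & NBINVALINPROG) {   /* another thread owns the flush */
		np->n_bflag |= NBINVALWANT;     /* ask to be woken on release */
		msleep(&np->n_bflag, nfs_buf_mutex, PZERO, "nfsgate", NULL);
	}
	np->n_bflag |= NBINVALINPROG;           /* this thread owns the flush now */
	lck_mtx_unlock(nfs_buf_mutex);
	/* ... flush ... then clear NBINVALINPROG and wakeup(&np->n_bflag) */
#endif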
3951
3952 /*
3953 * Wait for any busy buffers to complete.
3954 */
3955 void
3956 nfs_wait_bufs(nfsnode_t np)
3957 {
3958 struct nfsbuf *bp;
3959 struct nfsbuflists blist;
3960 int error = 0;
3961
3962 lck_mtx_lock(nfs_buf_mutex);
3963 if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
3964 while ((bp = LIST_FIRST(&blist))) {
3965 LIST_REMOVE(bp, nb_vnbufs);
3966 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3967 nfs_buf_refget(bp);
3968 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3969 if (error != EAGAIN) {
3970 nfs_buf_refrele(bp);
3971 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3972 lck_mtx_unlock(nfs_buf_mutex);
3973 return;
3974 }
3975 }
3976 nfs_buf_refrele(bp);
3977 nfs_buf_drop(bp);
3978 }
3979 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3980 }
3981 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3982 while ((bp = LIST_FIRST(&blist))) {
3983 LIST_REMOVE(bp, nb_vnbufs);
3984 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3985 nfs_buf_refget(bp);
3986 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3987 if (error != EAGAIN) {
3988 nfs_buf_refrele(bp);
3989 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3990 lck_mtx_unlock(nfs_buf_mutex);
3991 return;
3992 }
3993 }
3994 nfs_buf_refrele(bp);
3995 nfs_buf_drop(bp);
3996 }
3997 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3998 }
3999 lck_mtx_unlock(nfs_buf_mutex);
4000 }
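/*
 * A note on the wait pattern above (inferred from these loops alone):
 * nfs_buf_acquire() returns EAGAIN while a buffer is busy, so acquiring
 * and then immediately nfs_buf_drop()ing each buffer is what waits for
 * it to go idle; any other error aborts the scan.
 */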
4001
4002
4003 /*
4004 * Add an async I/O request to the mount's async I/O queue and make
4005 * sure that an nfsiod will service it.
4006 */
4007 void
4008 nfs_asyncio_finish(struct nfsreq *req)
4009 {
4010 	struct nfsmount *nmp = req->r_nmp; /* initialize here: FSDBG_TOP() below reads nmp before the again: re-read */
4011 struct nfsiod *niod;
4012 int started = 0;
4013
4014 FSDBG_TOP(552, nmp, 0, 0, 0);
4015 again:
4016 nmp = req->r_nmp;
4017
4018 if (nmp == NULL) {
4019 return;
4020 }
4021
4022 lck_mtx_lock(nfsiod_mutex);
4023 niod = nmp->nm_niod;
4024
4025 /* grab an nfsiod if we don't have one already */
4026 if (!niod) {
4027 niod = TAILQ_FIRST(&nfsiodfree);
4028 if (niod) {
4029 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
4030 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
4031 niod->niod_nmp = nmp;
4032 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
4033 /*
4034 * Try starting a new thread.
4035 * We may try a couple times if other callers
4036 * get the new threads before we do.
4037 */
4038 lck_mtx_unlock(nfsiod_mutex);
4039 started++;
4040 if (!nfsiod_start()) {
4041 goto again;
4042 }
4043 lck_mtx_lock(nfsiod_mutex);
4044 }
4045 }
4046
4047 /*
4048 * If we got here while on the resendq, we need to get off it. This
4049 * happens when the timer fires and nfs_sigintr errors out queued
4050 * requests, or when we receive a reply (UDP case) while still on the
4051 * resend queue; either way we're finishing up and will not be resent.
4052 */
4053 lck_mtx_lock(&req->r_mtx);
4054 if (req->r_flags & R_RESENDQ) {
4055 lck_mtx_lock(&nmp->nm_lock);
4056 if (req->r_rchain.tqe_next != NFSREQNOLIST) {
4057 NFS_BIO_DBG("Processing async request on resendq. Removing");
4058 TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
4059 req->r_rchain.tqe_next = NFSREQNOLIST;
4060 assert(req->r_refs > 1);
4061 /* Remove resendq reference */
4062 req->r_refs--;
4063 }
4064 lck_mtx_unlock(&nmp->nm_lock);
4065 req->r_flags &= ~R_RESENDQ;
4066 }
4067 lck_mtx_unlock(&req->r_mtx);
4068
4069 if (req->r_achain.tqe_next == NFSREQNOLIST) {
4070 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
4071 }
4072
4073 /* If this mount doesn't already have an nfsiod working on it... */
4074 if (!nmp->nm_niod) {
4075 if (niod) { /* give it the nfsiod we just grabbed */
4076 nmp->nm_niod = niod;
4077 lck_mtx_unlock(nfsiod_mutex);
4078 wakeup(niod);
4079 } else if (nfsiod_thread_count > 0) {
4080 /* just queue it up on nfsiod mounts queue if needed */
4081 if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
4082 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
4083 }
4084 lck_mtx_unlock(nfsiod_mutex);
4085 } else {
4086 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
4087 lck_mtx_unlock(nfsiod_mutex);
4088 /* we have no other option but to be persistent */
4089 started = 0;
4090 goto again;
4091 }
4092 } else {
4093 lck_mtx_unlock(nfsiod_mutex);
4094 }
4095
4096 FSDBG_BOT(552, nmp, 0, 0, 0);
4097 }
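/*
 * The handoff above, in outline (drawn only from the code in this
 * function):
 *   1. grab a free nfsiod, or try to start one (up to NFSIOD_MAX
 *      threads, with at most four start attempts per call);
 *   2. if the request is still on nm_resendq, remove it and drop the
 *      resendq reference;
 *   3. append the request to the mount's nm_iodq;
 *   4. wake the mount's nfsiod, park the mount on nfsiodmounts for the
 *      next idle nfsiod, or retry from the top if no nfsiod exists.
 */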
4098
4099 /*
4100 * queue up async I/O request for resend
4101 */
4102 void
4103 nfs_asyncio_resend(struct nfsreq *req)
4104 {
4105 struct nfsmount *nmp = req->r_nmp;
4106
4107 if (nfs_mount_gone(nmp)) {
4108 return;
4109 }
4110
4111 #if CONFIG_NFS_GSS
4112 nfs_gss_clnt_rpcdone(req);
4113 #endif
4114 lck_mtx_lock(&nmp->nm_lock);
4115 if (!(req->r_flags & R_RESENDQ)) {
4116 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
4117 req->r_flags |= R_RESENDQ;
4118 /*
4119 * We take a reference on this request so that it can't be
4120 * destroyed while a resend is queued or in progress.
4121 */
4122 nfs_request_ref(req, 1);
4123 }
4124 nfs_mount_sock_thread_wake(nmp);
4125 lck_mtx_unlock(&nmp->nm_lock);
4126 }
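/*
 * A minimal sketch of the matching release (fragment only; this mirrors
 * the removal nfs_asyncio_finish() performs above when a queued request
 * is being finished rather than resent):
 */
#if 0 /* illustrative only */
	lck_mtx_lock(&nmp->nm_lock);
	if (req->r_rchain.tqe_next != NFSREQNOLIST) {
		TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
		req->r_rchain.tqe_next = NFSREQNOLIST;
		req->r_refs--;                  /* drop the resendq reference */
	}
	lck_mtx_unlock(&nmp->nm_lock);
	req->r_flags &= ~R_RESENDQ;
#endif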
4127
4128 /*
4129 * Read directory data into a buffer.
4130 *
4131 * Buffer will be filled (unless EOF is hit).
4132 * Buffers after this one may also be completely/partially filled.
4133 */
4134 int
4135 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
4136 {
4137 nfsnode_t np = bp->nb_np;
4138 struct nfsmount *nmp = NFSTONMP(np);
4139 int error = 0;
4140
4141 if (nfs_mount_gone(nmp)) {
4142 return ENXIO;
4143 }
4144
4145 if (nmp->nm_vers < NFS_VER4) {
4146 error = nfs3_readdir_rpc(np, bp, ctx);
4147 }
4148 #if CONFIG_NFS4
4149 else {
4150 error = nfs4_readdir_rpc(np, bp, ctx);
4151 }
4152 #endif
4153 if (error && (error != NFSERR_DIRBUFDROPPED)) {
4154 SET(bp->nb_flags, NB_ERROR);
4155 bp->nb_error = error;
4156 }
4157 return error;
4158 }
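/*
 * A minimal caller sketch (fragment; variable setup omitted). The
 * nfs_buf_get() call follows the conventions used elsewhere in this
 * file, but this exact sequence is an assumption, not a quoted caller.
 */
#if 0 /* illustrative only */
	struct nfsbuf *bp = NULL;
	if (!(error = nfs_buf_get(np, lbn, nmp->nm_biosize, thd, NBLK_READ, &bp))) {
		error = nfs_buf_readdir(bp, ctx);       /* fill bp with directory data */
		nfs_buf_release(bp, 1);                 /* NB_ERROR is already set on failure */
	}
#endif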
4159
4160 #endif /* CONFIG_NFS_CLIENT */