apple/xnu.git: bsd/nfs/nfs_bio.c (blob cb1f92939b45cba33d036d49f2611754618bce90)
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
66 */
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/resourcevar.h>
70 #include <sys/signalvar.h>
71 #include <sys/proc_internal.h>
72 #include <sys/kauth.h>
73 #include <sys/malloc.h>
74 #include <sys/vnode.h>
75 #include <sys/dirent.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/ubc_internal.h>
79 #include <sys/uio_internal.h>
80 #include <sys/kpi_mbuf.h>
81
82 #include <sys/vm.h>
83 #include <sys/vmparam.h>
84
85 #include <sys/time.h>
86 #include <kern/clock.h>
87 #include <libkern/OSAtomic.h>
88 #include <kern/kalloc.h>
89 #include <kern/thread_call.h>
90
91 #include <nfs/rpcv2.h>
92 #include <nfs/nfsproto.h>
93 #include <nfs/nfs.h>
94 #include <nfs/nfs_gss.h>
95 #include <nfs/nfsmount.h>
96 #include <nfs/nfsnode.h>
97 #include <sys/buf_internal.h>
98 #include <libkern/OSAtomic.h>
99
100 #define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)
101
102 kern_return_t thread_terminate(thread_t); /* XXX */
103
104 #define NFSBUFHASH(np, lbn) \
105 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
106 LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
107 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
108 u_long nfsbufhash;
109 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
110 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
111 int nfs_nbdwrite;
112 int nfs_buf_timer_on = 0;
113 thread_t nfsbufdelwrithd = NULL;
114
115 lck_grp_t *nfs_buf_lck_grp;
116 lck_mtx_t *nfs_buf_mutex;
117
118 #define NFSBUF_FREE_PERIOD 30 /* seconds */
119 #define NFSBUF_LRU_STALE 120
120 #define NFSBUF_META_STALE 240
121
122 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
123 #define LRU_TO_FREEUP 6
124 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
125 #define META_TO_FREEUP 3
126 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
127 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
128 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
129 #define LRU_FREEUP_FRAC_ON_TIMER 8
130 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
131 #define META_FREEUP_FRAC_ON_TIMER 16
132 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
133 #define LRU_FREEUP_MIN_FRAC 4
134 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
135 #define META_FREEUP_MIN_FRAC 2
136
137 #define NFS_BUF_FREEUP() \
138 do { \
139 /* only call nfs_buf_freeup() if it has work to do: */ \
140 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
141 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
142 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
143 nfs_buf_freeup(0); \
144 } while (0)
145
146 /*
147 * Initialize nfsbuf lists
148 */
149 void
150 nfs_nbinit(void)
151 {
152 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
153 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
154
155 nfsbufcnt = nfsbufmetacnt =
156 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
157 nfsbufmin = 128;
158 /* size nfsbufmax to cover at most half sane_size (w/default buf size) */
159 nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
160 nfsbufmetamax = nfsbufmax / 4;
161 nfsneedbuffer = 0;
162 nfs_nbdwrite = 0;
163
164 nfsbufhashtbl = hashinit(nfsbufmax / 4, M_TEMP, &nfsbufhash);
165 TAILQ_INIT(&nfsbuffree);
166 TAILQ_INIT(&nfsbuffreemeta);
167 TAILQ_INIT(&nfsbufdelwri);
168 }
169
170 /*
171 * Check periodically for stale/unused nfs bufs
172 */
173 void
174 nfs_buf_timer(__unused void *param0, __unused void *param1)
175 {
176 nfs_buf_freeup(1);
177
178 lck_mtx_lock(nfs_buf_mutex);
179 if (nfsbufcnt <= nfsbufmin) {
180 nfs_buf_timer_on = 0;
181 lck_mtx_unlock(nfs_buf_mutex);
182 return;
183 }
184 lck_mtx_unlock(nfs_buf_mutex);
185
186 nfs_interval_timer_start(nfs_buf_timer_call,
187 NFSBUF_FREE_PERIOD * 1000);
188 }
189
190 /*
191 * try to free up some excess, unused nfsbufs
192 */
193 void
194 nfs_buf_freeup(int timer)
195 {
196 struct nfsbuf *fbp;
197 struct timeval now;
198 int count;
199 struct nfsbuffreehead nfsbuffreeup;
200
201 TAILQ_INIT(&nfsbuffreeup);
202
203 lck_mtx_lock(nfs_buf_mutex);
204
205 microuptime(&now);
206
207 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
208
209 count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
210 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
211 fbp = TAILQ_FIRST(&nfsbuffree);
212 if (!fbp) {
213 break;
214 }
215 if (fbp->nb_refs) {
216 break;
217 }
218 if (NBUFSTAMPVALID(fbp) &&
219 (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
220 break;
221 }
222 nfs_buf_remfree(fbp);
223 /* disassociate buffer from any nfsnode */
224 if (fbp->nb_np) {
225 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
226 LIST_REMOVE(fbp, nb_vnbufs);
227 fbp->nb_vnbufs.le_next = NFSNOLIST;
228 }
229 fbp->nb_np = NULL;
230 }
231 LIST_REMOVE(fbp, nb_hash);
232 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
233 nfsbufcnt--;
234 }
235
236 count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
237 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
238 fbp = TAILQ_FIRST(&nfsbuffreemeta);
239 if (!fbp) {
240 break;
241 }
242 if (fbp->nb_refs) {
243 break;
244 }
245 if (NBUFSTAMPVALID(fbp) &&
246 (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
247 break;
248 }
249 nfs_buf_remfree(fbp);
250 /* disassociate buffer from any nfsnode */
251 if (fbp->nb_np) {
252 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
253 LIST_REMOVE(fbp, nb_vnbufs);
254 fbp->nb_vnbufs.le_next = NFSNOLIST;
255 }
256 fbp->nb_np = NULL;
257 }
258 LIST_REMOVE(fbp, nb_hash);
259 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
260 nfsbufcnt--;
261 nfsbufmetacnt--;
262 }
263
264 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
265 NFSBUFCNTCHK();
266
267 lck_mtx_unlock(nfs_buf_mutex);
268
269 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
270 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
271 /* nuke any creds */
272 if (IS_VALID_CRED(fbp->nb_rcred)) {
273 kauth_cred_unref(&fbp->nb_rcred);
274 }
275 if (IS_VALID_CRED(fbp->nb_wcred)) {
276 kauth_cred_unref(&fbp->nb_wcred);
277 }
278 /* if buf was NB_META, dump buffer */
279 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
280 kfree(fbp->nb_data, fbp->nb_bufsize);
281 }
282 FREE(fbp, M_TEMP);
283 }
284 }
285
286 /*
287 * remove a buffer from the freelist
288 * (must be called with nfs_buf_mutex held)
289 */
290 void
291 nfs_buf_remfree(struct nfsbuf *bp)
292 {
293 if (bp->nb_free.tqe_next == NFSNOLIST) {
294 panic("nfsbuf not on free list");
295 }
296 if (ISSET(bp->nb_flags, NB_DELWRI)) {
297 nfsbufdelwricnt--;
298 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
299 } else if (ISSET(bp->nb_flags, NB_META)) {
300 nfsbuffreemetacnt--;
301 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
302 } else {
303 nfsbuffreecnt--;
304 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
305 }
306 bp->nb_free.tqe_next = NFSNOLIST;
307 NFSBUFCNTCHK();
308 }
309
310 /*
311 * check for existence of nfsbuf in cache
312 */
313 boolean_t
314 nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
315 {
316 boolean_t rv;
317 lck_mtx_lock(nfs_buf_mutex);
318 if (nfs_buf_incore(np, blkno)) {
319 rv = TRUE;
320 } else {
321 rv = FALSE;
322 }
323 lck_mtx_unlock(nfs_buf_mutex);
324 return rv;
325 }
326
327 /*
328 * return incore buffer (must be called with nfs_buf_mutex held)
329 */
330 struct nfsbuf *
331 nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
332 {
333 /* Search hash chain */
334 struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
335 for (; bp != NULL; bp = bp->nb_hash.le_next) {
336 if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
337 if (!ISSET(bp->nb_flags, NB_INVAL)) {
338 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
339 return bp;
340 }
341 }
342 }
343 return NULL;
344 }
345
346 /*
347 * Check if it's OK to drop a page.
348 *
349 * Called by vnode_pager() on pageout request of non-dirty page.
350 * We need to make sure that it's not part of a delayed write.
351 * If it is, we can't let the VM drop it because we may need it
352 * later when/if we need to write the data (again).
353 */
354 int
355 nfs_buf_page_inval(vnode_t vp, off_t offset)
356 {
357 struct nfsmount *nmp = VTONMP(vp);
358 struct nfsbuf *bp;
359 int error = 0;
360
361 if (nfs_mount_gone(nmp)) {
362 return ENXIO;
363 }
364
365 lck_mtx_lock(nfs_buf_mutex);
366 bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
367 if (!bp) {
368 goto out;
369 }
370 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
371 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
372 error = EBUSY;
373 goto out;
374 }
375 /*
376 * If there's a dirty range in the buffer, check to
377 * see if this page intersects with the dirty range.
378 * If it does, we can't let the pager drop the page.
379 */
380 if (bp->nb_dirtyend > 0) {
381 int start = offset - NBOFF(bp);
382 if ((bp->nb_dirtyend > start) &&
383 (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
384 /*
385 * Before returning the bad news, move the
386 * buffer to the start of the delwri list and
387 * give the list a push to try to flush the
388 * buffer out.
389 */
390 error = EBUSY;
391 nfs_buf_remfree(bp);
392 TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
393 nfsbufdelwricnt++;
394 nfs_buf_delwri_push(1);
395 }
396 }
397 out:
398 lck_mtx_unlock(nfs_buf_mutex);
399 return error;
400 }
401
402 /*
403 * set up the UPL for a buffer
404 * (must NOT be called with nfs_buf_mutex held)
405 */
406 int
407 nfs_buf_upl_setup(struct nfsbuf *bp)
408 {
409 kern_return_t kret;
410 upl_t upl;
411 int upl_flags;
412
413 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
414 return 0;
415 }
416
417 upl_flags = UPL_PRECIOUS;
418 if (!ISSET(bp->nb_flags, NB_READ)) {
419 /*
420 * We're doing a "write", so we intend to modify
421 * the pages we're gathering.
422 */
423 upl_flags |= UPL_WILL_MODIFY;
424 }
425 kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
426 &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
427 if (kret == KERN_INVALID_ARGUMENT) {
428 /* vm object probably doesn't exist any more */
429 bp->nb_pagelist = NULL;
430 return EINVAL;
431 }
432 if (kret != KERN_SUCCESS) {
433 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
434 bp->nb_pagelist = NULL;
435 return EIO;
436 }
437
438 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
439
440 bp->nb_pagelist = upl;
441 SET(bp->nb_flags, NB_PAGELIST);
442 return 0;
443 }
444
445 /*
446 * update buffer's valid/dirty info from UBC
447 * (must NOT be called with nfs_buf_mutex held)
448 */
449 void
450 nfs_buf_upl_check(struct nfsbuf *bp)
451 {
452 upl_page_info_t *pl;
453 off_t filesize, fileoffset;
454 int i, npages;
455
456 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
457 return;
458 }
459
460 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
461 filesize = ubc_getsize(NFSTOV(bp->nb_np));
462 fileoffset = NBOFF(bp);
463 if (fileoffset < filesize) {
464 SET(bp->nb_flags, NB_CACHE);
465 } else {
466 CLR(bp->nb_flags, NB_CACHE);
467 }
468
469 pl = ubc_upl_pageinfo(bp->nb_pagelist);
470 bp->nb_valid = bp->nb_dirty = 0;
471
472 for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
473 /* anything beyond the end of the file is not valid or dirty */
474 if (fileoffset >= filesize) {
475 break;
476 }
477 if (!upl_valid_page(pl, i)) {
478 CLR(bp->nb_flags, NB_CACHE);
479 continue;
480 }
481 NBPGVALID_SET(bp, i);
482 if (upl_dirty_page(pl, i)) {
483 NBPGDIRTY_SET(bp, i);
484 }
485 }
486 fileoffset = NBOFF(bp);
487 if (ISSET(bp->nb_flags, NB_CACHE)) {
488 bp->nb_validoff = 0;
489 bp->nb_validend = bp->nb_bufsize;
490 if (fileoffset + bp->nb_validend > filesize) {
491 bp->nb_validend = filesize - fileoffset;
492 }
493 } else {
494 bp->nb_validoff = bp->nb_validend = -1;
495 }
496 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
497 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
498 }
499
500 /*
501 * make sure that a buffer is mapped
502 * (must NOT be called with nfs_buf_mutex held)
503 */
504 int
505 nfs_buf_map(struct nfsbuf *bp)
506 {
507 kern_return_t kret;
508
509 if (bp->nb_data) {
510 return 0;
511 }
512 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
513 return EINVAL;
514 }
515
516 kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
517 if (kret != KERN_SUCCESS) {
518 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
519 }
520 if (bp->nb_data == 0) {
521 panic("ubc_upl_map mapped 0");
522 }
523 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
524 return 0;
525 }
526
527 /*
528 * normalize an nfsbuf's valid range
529 *
530 * the read/write code guarantees that we'll always have a valid
531 * region that is an integral number of pages. If either end
532 * of the valid range isn't page-aligned, it gets corrected
533 * here as we extend the valid range through all of the
534 * contiguous valid pages.
535 */
536 void
537 nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
538 {
539 int pg, npg;
540 /* pull validoff back to start of contiguous valid page range */
541 pg = bp->nb_validoff / PAGE_SIZE;
542 while (pg >= 0 && NBPGVALID(bp, pg)) {
543 pg--;
544 }
545 bp->nb_validoff = (pg + 1) * PAGE_SIZE;
546 /* push validend forward to end of contiguous valid page range */
547 npg = bp->nb_bufsize / PAGE_SIZE;
548 pg = bp->nb_validend / PAGE_SIZE;
549 while (pg < npg && NBPGVALID(bp, pg)) {
550 pg++;
551 }
552 bp->nb_validend = pg * PAGE_SIZE;
553 /* clip to EOF */
554 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
555 bp->nb_validend = np->n_size % bp->nb_bufsize;
556 }
557 }
558
559 /*
560 * process some entries on the delayed write queue
561 * (must be called with nfs_buf_mutex held)
562 */
563 void
564 nfs_buf_delwri_service(void)
565 {
566 struct nfsbuf *bp;
567 nfsnode_t np;
568 int error, i = 0;
569
570 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
571 np = bp->nb_np;
572 nfs_buf_remfree(bp);
573 nfs_buf_refget(bp);
574 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
575 ;
576 }
577 nfs_buf_refrele(bp);
578 if (error) {
579 break;
580 }
581 if (!bp->nb_np) {
582 /* buffer is no longer valid */
583 nfs_buf_drop(bp);
584 continue;
585 }
586 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
587 nfs_buf_check_write_verifier(np, bp);
588 }
589 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
590 /* put buffer at end of delwri list */
591 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
592 nfsbufdelwricnt++;
593 nfs_buf_drop(bp);
594 lck_mtx_unlock(nfs_buf_mutex);
595 nfs_flushcommits(np, 1);
596 } else {
597 SET(bp->nb_flags, NB_ASYNC);
598 lck_mtx_unlock(nfs_buf_mutex);
599 nfs_buf_write(bp);
600 }
601 i++;
602 lck_mtx_lock(nfs_buf_mutex);
603 }
604 }
605
606 /*
607 * thread to service the delayed write queue when asked
608 */
609 void
610 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
611 {
612 struct timespec ts = { 30, 0 };
613 int error = 0;
614
615 lck_mtx_lock(nfs_buf_mutex);
616 while (!error) {
617 nfs_buf_delwri_service();
618 error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
619 }
620 nfsbufdelwrithd = NULL;
621 lck_mtx_unlock(nfs_buf_mutex);
622 thread_terminate(nfsbufdelwrithd);
623 }
624
625 /*
626 * try to push out some delayed/uncommitted writes
627 * ("locked" indicates whether nfs_buf_mutex is already held)
628 */
629 void
630 nfs_buf_delwri_push(int locked)
631 {
632 if (TAILQ_EMPTY(&nfsbufdelwri)) {
633 return;
634 }
635 if (!locked) {
636 lck_mtx_lock(nfs_buf_mutex);
637 }
638 /* wake up the delayed write service thread */
639 if (nfsbufdelwrithd) {
640 wakeup(&nfsbufdelwrithd);
641 } else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
642 thread_deallocate(nfsbufdelwrithd);
643 }
644 /* otherwise, try to do some of the work ourselves */
645 if (!nfsbufdelwrithd) {
646 nfs_buf_delwri_service();
647 }
648 if (!locked) {
649 lck_mtx_unlock(nfs_buf_mutex);
650 }
651 }
652
653 /*
654 * Get an nfs buffer.
655 *
656 * Returns errno on error, 0 otherwise.
657 * Any buffer is returned in *bpp.
658 *
659 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
660 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
661 *
662 * Check for existence of buffer in cache.
663 * Or attempt to reuse a buffer from one of the free lists.
664 * Or allocate a new buffer if we haven't already hit max allocation.
665 * Or wait for a free buffer.
666 *
667 * If available buffer found, prepare it, and return it.
668 *
669 * If the calling process is interrupted by a signal for
670 * an interruptible mount point, return EINTR.
671 */
672 int
673 nfs_buf_get(
674 nfsnode_t np,
675 daddr64_t blkno,
676 uint32_t size,
677 thread_t thd,
678 int flags,
679 struct nfsbuf **bpp)
680 {
681 vnode_t vp = NFSTOV(np);
682 struct nfsmount *nmp = VTONMP(vp);
683 struct nfsbuf *bp;
684 uint32_t bufsize;
685 int slpflag = PCATCH;
686 int operation = (flags & NBLK_OPMASK);
687 int error = 0;
688 struct timespec ts;
689
690 FSDBG_TOP(541, np, blkno, size, flags);
691 *bpp = NULL;
692
693 bufsize = size;
694 if (bufsize > NFS_MAXBSIZE) {
695 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
696 }
697
698 if (nfs_mount_gone(nmp)) {
699 FSDBG_BOT(541, np, blkno, 0, ENXIO);
700 return ENXIO;
701 }
702
703 if (!UBCINFOEXISTS(vp)) {
704 operation = NBLK_META;
705 } else if (bufsize < (uint32_t)nmp->nm_biosize) {
706 /* reg files should always have biosize blocks */
707 bufsize = nmp->nm_biosize;
708 }
709
710 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
711 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
712 FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
713
714 /* poke the delwri list */
715 nfs_buf_delwri_push(0);
716
717 /* sleep to let other threads run... */
718 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
719 FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
720 }
721
722 loop:
723 lck_mtx_lock(nfs_buf_mutex);
724
725 /* wait for any buffer invalidation/flushing to complete */
726 while (np->n_bflag & NBINVALINPROG) {
727 np->n_bflag |= NBINVALWANT;
728 ts.tv_sec = 2;
729 ts.tv_nsec = 0;
730 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
731 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
732 lck_mtx_unlock(nfs_buf_mutex);
733 FSDBG_BOT(541, np, blkno, 0, error);
734 return error;
735 }
736 if (np->n_bflag & NBINVALINPROG) {
737 slpflag = 0;
738 }
739 }
740
741 /* check for existence of nfsbuf in cache */
742 if ((bp = nfs_buf_incore(np, blkno))) {
743 /* if busy, set wanted and wait */
744 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
745 if (flags & NBLK_NOWAIT) {
746 lck_mtx_unlock(nfs_buf_mutex);
747 FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
748 return 0;
749 }
750 FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
751 SET(bp->nb_lflags, NBL_WANTED);
752
753 ts.tv_sec = 2;
754 ts.tv_nsec = 0;
755 msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
756 "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
757 slpflag = 0;
758 FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
759 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
760 FSDBG_BOT(541, np, blkno, 0, error);
761 return error;
762 }
763 goto loop;
764 }
765 if (bp->nb_bufsize != bufsize) {
766 panic("nfsbuf size mismatch");
767 }
768 SET(bp->nb_lflags, NBL_BUSY);
769 SET(bp->nb_flags, NB_CACHE);
770 nfs_buf_remfree(bp);
771 /* additional paranoia: */
772 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
773 panic("pagelist buffer was not busy");
774 }
775 goto buffer_setup;
776 }
777
778 if (flags & NBLK_ONLYVALID) {
779 lck_mtx_unlock(nfs_buf_mutex);
780 FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
781 return 0;
782 }
783
784 /*
785 * where to get a free buffer:
786 * - if meta and maxmeta reached, must reuse meta
787 * - alloc new if we haven't reached min bufs
788 * - if free lists are NOT empty
789 * - if free list is stale, use it
790 * - else if freemeta list is stale, use it
791 * - else if max bufs allocated, use least-time-to-stale
792 * - alloc new if we haven't reached max allowed
793 * - start clearing out delwri list and try again
794 */
795
796 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
797 /* if we've hit max meta buffers, must reuse a meta buffer */
798 bp = TAILQ_FIRST(&nfsbuffreemeta);
799 } else if ((nfsbufcnt > nfsbufmin) &&
800 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
801 /* try to pull an nfsbuf off a free list */
802 struct nfsbuf *lrubp, *metabp;
803 struct timeval now;
804 microuptime(&now);
805
806 /* if the next LRU or META buffer is invalid or stale, use it */
807 lrubp = TAILQ_FIRST(&nfsbuffree);
808 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
809 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) {
810 bp = lrubp;
811 }
812 metabp = TAILQ_FIRST(&nfsbuffreemeta);
813 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
814 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) {
815 bp = metabp;
816 }
817
818 if (!bp && (nfsbufcnt >= nfsbufmax)) {
819 /* we've already allocated all bufs, so */
820 /* choose the buffer that'll go stale first */
821 if (!metabp) {
822 bp = lrubp;
823 } else if (!lrubp) {
824 bp = metabp;
825 } else {
826 int32_t lru_stale_time, meta_stale_time;
827 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
828 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
829 if (lru_stale_time <= meta_stale_time) {
830 bp = lrubp;
831 } else {
832 bp = metabp;
833 }
834 }
835 }
836 }
837
838 if (bp) {
839 /* we have a buffer to reuse */
840 FSDBG(544, np, blkno, bp, bp->nb_flags);
841 nfs_buf_remfree(bp);
842 if (ISSET(bp->nb_flags, NB_DELWRI)) {
843 panic("nfs_buf_get: delwri");
844 }
845 SET(bp->nb_lflags, NBL_BUSY);
846 /* disassociate buffer from previous nfsnode */
847 if (bp->nb_np) {
848 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
849 LIST_REMOVE(bp, nb_vnbufs);
850 bp->nb_vnbufs.le_next = NFSNOLIST;
851 }
852 bp->nb_np = NULL;
853 }
854 LIST_REMOVE(bp, nb_hash);
855 /* nuke any creds we're holding */
856 if (IS_VALID_CRED(bp->nb_rcred)) {
857 kauth_cred_unref(&bp->nb_rcred);
858 }
859 if (IS_VALID_CRED(bp->nb_wcred)) {
860 kauth_cred_unref(&bp->nb_wcred);
861 }
862 /* if buf will no longer be NB_META, dump old buffer */
863 if (operation == NBLK_META) {
864 if (!ISSET(bp->nb_flags, NB_META)) {
865 nfsbufmetacnt++;
866 }
867 } else if (ISSET(bp->nb_flags, NB_META)) {
868 if (bp->nb_data) {
869 kfree(bp->nb_data, bp->nb_bufsize);
870 bp->nb_data = NULL;
871 }
872 nfsbufmetacnt--;
873 }
874 /* re-init buf fields */
875 bp->nb_error = 0;
876 bp->nb_validoff = bp->nb_validend = -1;
877 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
878 bp->nb_valid = 0;
879 bp->nb_dirty = 0;
880 bp->nb_verf = 0;
881 } else {
882 /* no buffer to reuse */
883 if ((nfsbufcnt < nfsbufmax) &&
884 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
885 /* just alloc a new one */
886 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
887 if (!bp) {
888 lck_mtx_unlock(nfs_buf_mutex);
889 FSDBG_BOT(541, np, blkno, 0, error);
890 return ENOMEM;
891 }
892 nfsbufcnt++;
893
894 /*
895 * If any excess bufs, make sure the timer
896 * is running to free them up later.
897 */
898 if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
899 nfs_buf_timer_on = 1;
900 nfs_interval_timer_start(nfs_buf_timer_call,
901 NFSBUF_FREE_PERIOD * 1000);
902 }
903
904 if (operation == NBLK_META) {
905 nfsbufmetacnt++;
906 }
907 NFSBUFCNTCHK();
908 /* init nfsbuf */
909 bzero(bp, sizeof(*bp));
910 bp->nb_free.tqe_next = NFSNOLIST;
911 bp->nb_validoff = bp->nb_validend = -1;
912 FSDBG(545, np, blkno, bp, 0);
913 } else {
914 /* too many bufs... wait for buffers to free up */
915 FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
916
917 /* poke the delwri list */
918 nfs_buf_delwri_push(1);
919
920 nfsneedbuffer = 1;
921 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
922 FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
923 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
924 FSDBG_BOT(541, np, blkno, 0, error);
925 return error;
926 }
927 goto loop;
928 }
929 }
930
931 /* set up nfsbuf */
932 SET(bp->nb_lflags, NBL_BUSY);
933 bp->nb_flags = 0;
934 bp->nb_lblkno = blkno;
935 /* insert buf in hash */
936 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
937 /* associate buffer with new nfsnode */
938 bp->nb_np = np;
939 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
940
941 buffer_setup:
942
943 /* unlock hash */
944 lck_mtx_unlock(nfs_buf_mutex);
945
946 switch (operation) {
947 case NBLK_META:
948 SET(bp->nb_flags, NB_META);
949 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
950 kfree(bp->nb_data, bp->nb_bufsize);
951 bp->nb_data = NULL;
952 bp->nb_validoff = bp->nb_validend = -1;
953 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
954 bp->nb_valid = 0;
955 bp->nb_dirty = 0;
956 CLR(bp->nb_flags, NB_CACHE);
957 }
958 if (!bp->nb_data) {
959 bp->nb_data = kalloc(bufsize);
960 }
961 if (!bp->nb_data) {
962 /* Ack! couldn't allocate the data buffer! */
963 /* clean up buffer and return error */
964 lck_mtx_lock(nfs_buf_mutex);
965 LIST_REMOVE(bp, nb_vnbufs);
966 bp->nb_vnbufs.le_next = NFSNOLIST;
967 bp->nb_np = NULL;
968 /* invalidate usage timestamp to allow immediate freeing */
969 NBUFSTAMPINVALIDATE(bp);
970 if (bp->nb_free.tqe_next != NFSNOLIST) {
971 panic("nfsbuf on freelist");
972 }
973 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
974 nfsbuffreecnt++;
975 lck_mtx_unlock(nfs_buf_mutex);
976 FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
977 return ENOMEM;
978 }
979 bp->nb_bufsize = bufsize;
980 break;
981
982 case NBLK_READ:
983 case NBLK_WRITE:
984 /*
985 * Set or clear NB_READ now to let the UPL subsystem know
986 * if we intend to modify the pages or not.
987 */
988 if (operation == NBLK_READ) {
989 SET(bp->nb_flags, NB_READ);
990 } else {
991 CLR(bp->nb_flags, NB_READ);
992 }
993 if (bufsize < PAGE_SIZE) {
994 bufsize = PAGE_SIZE;
995 }
996 bp->nb_bufsize = bufsize;
997 bp->nb_validoff = bp->nb_validend = -1;
998
999 if (UBCINFOEXISTS(vp)) {
1000 /* set up upl */
1001 if (nfs_buf_upl_setup(bp)) {
1002 /* unable to create upl */
1003 /* vm object must no longer exist */
1004 /* clean up buffer and return error */
1005 lck_mtx_lock(nfs_buf_mutex);
1006 LIST_REMOVE(bp, nb_vnbufs);
1007 bp->nb_vnbufs.le_next = NFSNOLIST;
1008 bp->nb_np = NULL;
1009 /* invalidate usage timestamp to allow immediate freeing */
1010 NBUFSTAMPINVALIDATE(bp);
1011 if (bp->nb_free.tqe_next != NFSNOLIST) {
1012 panic("nfsbuf on freelist");
1013 }
1014 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1015 nfsbuffreecnt++;
1016 lck_mtx_unlock(nfs_buf_mutex);
1017 FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
1018 return EIO;
1019 }
1020 nfs_buf_upl_check(bp);
1021 }
1022 break;
1023
1024 default:
1025 panic("nfs_buf_get: %d unknown operation", operation);
1026 }
1027
1028 *bpp = bp;
1029
1030 FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
1031
1032 return 0;
1033 }
1034
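/*
 * Release a buffer: commit or abort its UPL pages as appropriate,
 * return it to the free, freemeta, or delwri list, and wake up any
 * waiters.  If "freeup" is set, excess buffers may then be freed
 * via NFS_BUF_FREEUP().
 * (must NOT be called with nfs_buf_mutex held)
 */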
1035 void
1036 nfs_buf_release(struct nfsbuf *bp, int freeup)
1037 {
1038 nfsnode_t np = bp->nb_np;
1039 vnode_t vp;
1040 struct timeval now;
1041 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
1042
1043 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1044 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
1045 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
1046
1047 vp = np ? NFSTOV(np) : NULL;
1048 if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
1049 int upl_flags, rv;
1050 upl_t upl;
1051 uint32_t i;
1052
1053 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
1054 rv = nfs_buf_upl_setup(bp);
1055 if (rv) {
1056 printf("nfs_buf_release: upl create failed %d\n", rv);
1057 } else {
1058 nfs_buf_upl_check(bp);
1059 }
1060 }
1061 upl = bp->nb_pagelist;
1062 if (!upl) {
1063 goto pagelist_cleanup_done;
1064 }
1065 if (bp->nb_data) {
1066 if (ubc_upl_unmap(upl) != KERN_SUCCESS) {
1067 panic("ubc_upl_unmap failed");
1068 }
1069 bp->nb_data = NULL;
1070 }
1071 /*
1072 * Abort the pages on error or: if this is an invalid or
1073 * non-needcommit nocache buffer AND no pages are dirty.
1074 */
1075 if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
1076 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
1077 if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) {
1078 upl_flags = UPL_ABORT_DUMP_PAGES;
1079 } else {
1080 upl_flags = 0;
1081 }
1082 ubc_upl_abort(upl, upl_flags);
1083 goto pagelist_cleanup_done;
1084 }
1085 for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) {
1086 if (!NBPGVALID(bp, i)) {
1087 ubc_upl_abort_range(upl,
1088 i * PAGE_SIZE, PAGE_SIZE,
1089 UPL_ABORT_DUMP_PAGES |
1090 UPL_ABORT_FREE_ON_EMPTY);
1091 } else {
1092 if (NBPGDIRTY(bp, i)) {
1093 upl_flags = UPL_COMMIT_SET_DIRTY;
1094 } else {
1095 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1096 }
1097
1098 if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) {
1099 upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
1100 }
1101
1102 ubc_upl_commit_range(upl,
1103 i * PAGE_SIZE, PAGE_SIZE,
1104 upl_flags |
1105 UPL_COMMIT_INACTIVATE |
1106 UPL_COMMIT_FREE_ON_EMPTY);
1107 }
1108 }
1109 pagelist_cleanup_done:
1110 /* invalidate any pages past EOF */
1111 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
1112 off_t start, end;
1113 start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
1114 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1115 if (start < NBOFF(bp)) {
1116 start = NBOFF(bp);
1117 }
1118 if (end > start) {
1119 if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) {
1120 printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
1121 }
1122 }
1123 }
1124 CLR(bp->nb_flags, NB_PAGELIST);
1125 bp->nb_pagelist = NULL;
1126 }
1127
1128 lck_mtx_lock(nfs_buf_mutex);
1129
1130 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1131
1132 /* Wake up any processes waiting for any buffer to become free. */
1133 if (nfsneedbuffer) {
1134 nfsneedbuffer = 0;
1135 wakeup_needbuffer = 1;
1136 }
1137 /* Wake up any processes waiting for _this_ buffer to become free. */
1138 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1139 CLR(bp->nb_lflags, NBL_WANTED);
1140 wakeup_buffer = 1;
1141 }
1142
1143 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1144 if (ISSET(bp->nb_flags, NB_ERROR) ||
1145 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) {
1146 SET(bp->nb_flags, NB_INVAL);
1147 }
1148
1149 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1150 /* If it's invalid or empty, dissociate it from its nfsnode */
1151 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1152 LIST_REMOVE(bp, nb_vnbufs);
1153 bp->nb_vnbufs.le_next = NFSNOLIST;
1154 }
1155 bp->nb_np = NULL;
1156 /* if this was a delayed write, wakeup anyone */
1157 /* waiting for delayed writes to complete */
1158 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1159 CLR(bp->nb_flags, NB_DELWRI);
1160 nfs_nbdwrite--;
1161 NFSBUFCNTCHK();
1162 wakeup_nbdwrite = 1;
1163 }
1164 /* invalidate usage timestamp to allow immediate freeing */
1165 NBUFSTAMPINVALIDATE(bp);
1166 /* put buffer at head of free list */
1167 if (bp->nb_free.tqe_next != NFSNOLIST) {
1168 panic("nfsbuf on freelist");
1169 }
1170 SET(bp->nb_flags, NB_INVAL);
1171 if (ISSET(bp->nb_flags, NB_META)) {
1172 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1173 nfsbuffreemetacnt++;
1174 } else {
1175 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1176 nfsbuffreecnt++;
1177 }
1178 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1179 /* put buffer at end of delwri list */
1180 if (bp->nb_free.tqe_next != NFSNOLIST) {
1181 panic("nfsbuf on freelist");
1182 }
1183 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1184 nfsbufdelwricnt++;
1185 freeup = 0;
1186 } else {
1187 /* update usage timestamp */
1188 microuptime(&now);
1189 bp->nb_timestamp = now.tv_sec;
1190 /* put buffer at end of free list */
1191 if (bp->nb_free.tqe_next != NFSNOLIST) {
1192 panic("nfsbuf on freelist");
1193 }
1194 if (ISSET(bp->nb_flags, NB_META)) {
1195 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1196 nfsbuffreemetacnt++;
1197 } else {
1198 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1199 nfsbuffreecnt++;
1200 }
1201 }
1202
1203 NFSBUFCNTCHK();
1204
1205 /* Unlock the buffer. */
1206 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1207 CLR(bp->nb_lflags, NBL_BUSY);
1208
1209 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1210
1211 lck_mtx_unlock(nfs_buf_mutex);
1212
1213 if (wakeup_needbuffer) {
1214 wakeup(&nfsneedbuffer);
1215 }
1216 if (wakeup_buffer) {
1217 wakeup(bp);
1218 }
1219 if (wakeup_nbdwrite) {
1220 wakeup(&nfs_nbdwrite);
1221 }
1222 if (freeup) {
1223 NFS_BUF_FREEUP();
1224 }
1225 }
1226
1227 /*
1228 * Wait for operations on the buffer to complete.
1229 * When they do, extract and return the I/O's error value.
1230 */
1231 int
1232 nfs_buf_iowait(struct nfsbuf *bp)
1233 {
1234 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1235
1236 lck_mtx_lock(nfs_buf_mutex);
1237
1238 while (!ISSET(bp->nb_flags, NB_DONE)) {
1239 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
1240 }
1241
1242 lck_mtx_unlock(nfs_buf_mutex);
1243
1244 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1245
1246 /* check for interruption of I/O, then errors. */
1247 if (ISSET(bp->nb_flags, NB_EINTR)) {
1248 CLR(bp->nb_flags, NB_EINTR);
1249 return EINTR;
1250 } else if (ISSET(bp->nb_flags, NB_ERROR)) {
1251 return bp->nb_error ? bp->nb_error : EIO;
1252 }
1253 return 0;
1254 }
1255
1256 /*
1257 * Mark I/O complete on a buffer.
1258 */
1259 void
1260 nfs_buf_iodone(struct nfsbuf *bp)
1261 {
1262 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1263
1264 if (ISSET(bp->nb_flags, NB_DONE)) {
1265 panic("nfs_buf_iodone already");
1266 }
1267
1268 if (!ISSET(bp->nb_flags, NB_READ)) {
1269 CLR(bp->nb_flags, NB_WRITEINPROG);
1270 /*
1271 * vnode_writedone() takes care of waking up
1272 * any throttled write operations
1273 */
1274 vnode_writedone(NFSTOV(bp->nb_np));
1275 nfs_node_lock_force(bp->nb_np);
1276 bp->nb_np->n_numoutput--;
1277 nfs_node_unlock(bp->nb_np);
1278 }
1279 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1280 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1281 nfs_buf_release(bp, 1);
1282 } else { /* or just wakeup the buffer */
1283 lck_mtx_lock(nfs_buf_mutex);
1284 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1285 CLR(bp->nb_lflags, NBL_WANTED);
1286 lck_mtx_unlock(nfs_buf_mutex);
1287 wakeup(bp);
1288 }
1289
1290 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1291 }
1292
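/*
 * Mark a buffer for delayed writing: move it to its node's dirty list
 * and let the write happen later.  If there are already too many
 * delayed writes pending, fall back to issuing an async write now.
 * (must NOT be called with nfs_buf_mutex held)
 */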
1293 void
1294 nfs_buf_write_delayed(struct nfsbuf *bp)
1295 {
1296 nfsnode_t np = bp->nb_np;
1297
1298 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1299 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1300
1301 /*
1302 * If the block hasn't been seen before:
1303 * (1) Mark it as having been seen,
1304 * (2) Make sure it's on its node's correct block list,
1305 */
1306 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1307 SET(bp->nb_flags, NB_DELWRI);
1308 /* move to dirty list */
1309 lck_mtx_lock(nfs_buf_mutex);
1310 nfs_nbdwrite++;
1311 NFSBUFCNTCHK();
1312 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1313 LIST_REMOVE(bp, nb_vnbufs);
1314 }
1315 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
1316 lck_mtx_unlock(nfs_buf_mutex);
1317 }
1318
1319 /*
1320  * If the vnode has "too many" write operations in progress,
1321  * wait for them to finish their I/O
1322 */
1323 vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1324
1325 /* the file is in a modified state, so make sure the flag's set */
1326 nfs_node_lock_force(np);
1327 np->n_flag |= NMODIFIED;
1328 nfs_node_unlock(np);
1329
1330 /*
1331 * If we have too many delayed write buffers,
1332 * just fall back to doing the async write.
1333 */
1334 if (nfs_nbdwrite < 0) {
1335 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1336 }
1337 if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
1338 /* issue async write */
1339 SET(bp->nb_flags, NB_ASYNC);
1340 nfs_buf_write(bp);
1341 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1342 return;
1343 }
1344
1345 /* Otherwise, the "write" is done, so mark and release the buffer. */
1346 SET(bp->nb_flags, NB_DONE);
1347 nfs_buf_release(bp, 1);
1348 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1349 return;
1350 }
1351
1352 /*
1353 * Check that a "needcommit" buffer can still be committed.
1354  * If the write verifier has changed, we need to clear
1355  * the needcommit flag.
1356 */
1357 void
1358 nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
1359 {
1360 struct nfsmount *nmp;
1361
1362 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
1363 return;
1364 }
1365
1366 nmp = NFSTONMP(np);
1367 if (nfs_mount_gone(nmp)) {
1368 return;
1369 }
1370 if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
1371 return;
1372 }
1373
1374 /* write verifier changed, clear commit/wverf flags */
1375 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1376 bp->nb_verf = 0;
1377 nfs_node_lock_force(np);
1378 np->n_needcommitcnt--;
1379 CHECK_NEEDCOMMITCNT(np);
1380 nfs_node_unlock(np);
1381 }
1382
1383 /*
1384 * add a reference to a buffer so it doesn't disappear while being used
1385 * (must be called with nfs_buf_mutex held)
1386 */
1387 void
1388 nfs_buf_refget(struct nfsbuf *bp)
1389 {
1390 bp->nb_refs++;
1391 }
1392 /*
1393 * release a reference on a buffer
1394 * (must be called with nfs_buf_mutex held)
1395 */
1396 void
1397 nfs_buf_refrele(struct nfsbuf *bp)
1398 {
1399 bp->nb_refs--;
1400 }
1401
1402 /*
1403 * mark a particular buffer as BUSY
1404 * (must be called with nfs_buf_mutex held)
1405 */
1406 errno_t
1407 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1408 {
1409 errno_t error;
1410 struct timespec ts;
1411
1412 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1413 /*
1414 * since the lck_mtx_lock may block, the buffer
1415 * may become BUSY, so we need to recheck for
1416 * a NOWAIT request
1417 */
1418 if (flags & NBAC_NOWAIT) {
1419 return EBUSY;
1420 }
1421 SET(bp->nb_lflags, NBL_WANTED);
1422
1423 ts.tv_sec = (slptimeo / 100);
1424                  /* the hz value is 100, so each tick is 10ms; convert remaining ticks to nsec */
1425 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1426
1427 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1428 "nfs_buf_acquire", &ts);
1429 if (error) {
1430 return error;
1431 }
1432 return EAGAIN;
1433 }
1434 if (flags & NBAC_REMOVE) {
1435 nfs_buf_remfree(bp);
1436 }
1437 SET(bp->nb_lflags, NBL_BUSY);
1438
1439 return 0;
1440 }
1441
1442 /*
1443 * simply drop the BUSY status of a buffer
1444 * (must be called with nfs_buf_mutex held)
1445 */
1446 void
1447 nfs_buf_drop(struct nfsbuf *bp)
1448 {
1449 int need_wakeup = 0;
1450
1451 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
1452 panic("nfs_buf_drop: buffer not busy!");
1453 }
1454 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1455 /* delay the actual wakeup until after we clear NBL_BUSY */
1456 need_wakeup = 1;
1457 }
1458 /* Unlock the buffer. */
1459 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1460
1461 if (need_wakeup) {
1462 wakeup(bp);
1463 }
1464 }
1465
1466 /*
1467 * prepare for iterating over an nfsnode's buffer list
1468 * this lock protects the queue manipulation
1469 * (must be called with nfs_buf_mutex held)
1470 */
1471 int
1472 nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1473 {
1474 struct nfsbuflists *listheadp;
1475
1476 if (flags & NBI_DIRTY) {
1477 listheadp = &np->n_dirtyblkhd;
1478 } else {
1479 listheadp = &np->n_cleanblkhd;
1480 }
1481
1482 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1483 LIST_INIT(iterheadp);
1484 return EWOULDBLOCK;
1485 }
1486
1487 while (np->n_bufiterflags & NBI_ITER) {
1488 np->n_bufiterflags |= NBI_ITERWANT;
1489 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
1490 }
1491 if (LIST_EMPTY(listheadp)) {
1492 LIST_INIT(iterheadp);
1493 return EINVAL;
1494 }
1495 np->n_bufiterflags |= NBI_ITER;
1496
1497 iterheadp->lh_first = listheadp->lh_first;
1498 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1499 LIST_INIT(listheadp);
1500
1501 return 0;
1502 }
1503
1504 /*
1505 * clean up after iterating over an nfsnode's buffer list
1506 * this lock protects the queue manipulation
1507 * (must be called with nfs_buf_mutex held)
1508 */
1509 void
1510 nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1511 {
1512 struct nfsbuflists * listheadp;
1513 struct nfsbuf *bp;
1514
1515 if (flags & NBI_DIRTY) {
1516 listheadp = &np->n_dirtyblkhd;
1517 } else {
1518 listheadp = &np->n_cleanblkhd;
1519 }
1520
1521 while (!LIST_EMPTY(iterheadp)) {
1522 bp = LIST_FIRST(iterheadp);
1523 LIST_REMOVE(bp, nb_vnbufs);
1524 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1525 }
1526
1527 np->n_bufiterflags &= ~NBI_ITER;
1528 if (np->n_bufiterflags & NBI_ITERWANT) {
1529 np->n_bufiterflags &= ~NBI_ITERWANT;
1530 wakeup(&np->n_bufiterflags);
1531 }
1532 }
1533
1534
1535 /*
1536 * Read an NFS buffer for a file.
1537 */
1538 int
1539 nfs_buf_read(struct nfsbuf *bp)
1540 {
1541 int error = 0;
1542 nfsnode_t np;
1543 thread_t thd;
1544 kauth_cred_t cred;
1545
1546 np = bp->nb_np;
1547 cred = bp->nb_rcred;
1548 if (IS_VALID_CRED(cred)) {
1549 kauth_cred_ref(cred);
1550 }
1551 thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1552
1553 /* sanity checks */
1554 if (!ISSET(bp->nb_flags, NB_READ)) {
1555 panic("nfs_buf_read: !NB_READ");
1556 }
1557 if (ISSET(bp->nb_flags, NB_DONE)) {
1558 CLR(bp->nb_flags, NB_DONE);
1559 }
1560
1561 NFS_BUF_MAP(bp);
1562
1563 OSAddAtomic64(1, &nfsstats.read_bios);
1564
1565 error = nfs_buf_read_rpc(bp, thd, cred);
1566 /*
1567 * For async I/O, the callbacks will finish up the
1568 * read. Otherwise, the read has already been finished.
1569 */
1570
1571 if (IS_VALID_CRED(cred)) {
1572 kauth_cred_unref(&cred);
1573 }
1574 return error;
1575 }
1576
1577 /*
1578 * finish the reading of a buffer
1579 */
1580 void
1581 nfs_buf_read_finish(struct nfsbuf *bp)
1582 {
1583 nfsnode_t np = bp->nb_np;
1584 struct nfsmount *nmp;
1585
1586 if (!ISSET(bp->nb_flags, NB_ERROR)) {
1587 /* update valid range */
1588 bp->nb_validoff = 0;
1589 bp->nb_validend = bp->nb_endio;
1590 if (bp->nb_endio < (int)bp->nb_bufsize) {
1591 /*
1592 * The read may be short because we have unflushed writes
1593 * that are extending the file size and the reads hit the
1594 * (old) EOF on the server. So, just make sure nb_validend
1595 * correctly tracks EOF.
1596 * Note that the missing data should have already been zeroed
1597 * in nfs_buf_read_rpc_finish().
1598 */
1599 off_t boff = NBOFF(bp);
1600 if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) {
1601 bp->nb_validend = bp->nb_bufsize;
1602 } else if ((off_t)np->n_size >= boff) {
1603 bp->nb_validend = np->n_size - boff;
1604 } else {
1605 bp->nb_validend = 0;
1606 }
1607 }
1608 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
1609 ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
1610 bp->nb_validend = 0x100000000LL - NBOFF(bp);
1611 }
1612 bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
1613 if (bp->nb_validend & PAGE_MASK) {
1614 /* zero-fill remainder of last page */
1615 bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
1616 }
1617 }
1618 nfs_buf_iodone(bp);
1619 }
1620
1621 /*
1622 * initiate the NFS READ RPC(s) for a buffer
1623 */
1624 int
1625 nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1626 {
1627 struct nfsmount *nmp;
1628 nfsnode_t np = bp->nb_np;
1629 int error = 0, nfsvers, async;
1630 int offset, nrpcs;
1631 uint32_t nmrsize, length, len;
1632 off_t boff;
1633 struct nfsreq *req;
1634 struct nfsreq_cbinfo cb;
1635
1636 nmp = NFSTONMP(np);
1637 if (nfs_mount_gone(nmp)) {
1638 bp->nb_error = error = ENXIO;
1639 SET(bp->nb_flags, NB_ERROR);
1640 nfs_buf_iodone(bp);
1641 return error;
1642 }
1643 nfsvers = nmp->nm_vers;
1644 nmrsize = nmp->nm_rsize;
1645
1646 boff = NBOFF(bp);
1647 offset = 0;
1648 length = bp->nb_bufsize;
1649
1650 if (nfsvers == NFS_VER2) {
1651 if (boff > 0xffffffffLL) {
1652 bp->nb_error = error = EFBIG;
1653 SET(bp->nb_flags, NB_ERROR);
1654 nfs_buf_iodone(bp);
1655 return error;
1656 }
1657 if ((boff + length - 1) > 0xffffffffLL) {
1658 length = 0x100000000LL - boff;
1659 }
1660 }
1661
1662 /* Note: Can only do async I/O if nfsiods are configured. */
1663 async = (bp->nb_flags & NB_ASYNC);
1664 cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1665 cb.rcb_bp = bp;
1666
1667 bp->nb_offio = bp->nb_endio = 0;
1668 bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1669 if (async && (nrpcs > 1)) {
1670 SET(bp->nb_flags, NB_MULTASYNCRPC);
1671 } else {
1672 CLR(bp->nb_flags, NB_MULTASYNCRPC);
1673 }
1674
1675 while (length > 0) {
1676 if (ISSET(bp->nb_flags, NB_ERROR)) {
1677 error = bp->nb_error;
1678 break;
1679 }
1680 len = (length > nmrsize) ? nmrsize : length;
1681 cb.rcb_args[0] = offset;
1682 cb.rcb_args[1] = len;
1683 if (nmp->nm_vers >= NFS_VER4) {
1684 cb.rcb_args[2] = nmp->nm_stategenid;
1685 }
1686 req = NULL;
1687 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
1688 if (error) {
1689 break;
1690 }
1691 offset += len;
1692 length -= len;
1693 if (async) {
1694 continue;
1695 }
1696 nfs_buf_read_rpc_finish(req);
1697 if (ISSET(bp->nb_flags, NB_ERROR)) {
1698 error = bp->nb_error;
1699 break;
1700 }
1701 }
1702
1703 if (length > 0) {
1704 /*
1705 * Something bad happened while trying to send the RPC(s).
1706 * Wait for any outstanding requests to complete.
1707 */
1708 bp->nb_error = error;
1709 SET(bp->nb_flags, NB_ERROR);
1710 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
1711 nrpcs = (length + nmrsize - 1) / nmrsize;
1712 lck_mtx_lock(nfs_buf_mutex);
1713 bp->nb_rpcs -= nrpcs;
1714 if (bp->nb_rpcs == 0) {
1715 /* No RPCs left, so the buffer's done */
1716 lck_mtx_unlock(nfs_buf_mutex);
1717 nfs_buf_iodone(bp);
1718 } else {
1719 /* wait for the last RPC to mark it done */
1720 while (bp->nb_rpcs > 0) {
1721 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
1722 "nfs_buf_read_rpc_cancel", NULL);
1723 }
1724 lck_mtx_unlock(nfs_buf_mutex);
1725 }
1726 } else {
1727 nfs_buf_iodone(bp);
1728 }
1729 }
1730
1731 return error;
1732 }
1733
1734 /*
1735 * finish up an NFS READ RPC on a buffer
1736 */
1737 void
1738 nfs_buf_read_rpc_finish(struct nfsreq *req)
1739 {
1740 struct nfsmount *nmp;
1741 size_t rlen;
1742 struct nfsreq_cbinfo cb;
1743 struct nfsbuf *bp;
1744 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1745 void *wakeme = NULL;
1746 struct nfsreq *rreq = NULL;
1747 nfsnode_t np;
1748 thread_t thd;
1749 kauth_cred_t cred;
1750 uio_t auio;
1751 char uio_buf[UIO_SIZEOF(1)];
1752
1753 finish:
1754 np = req->r_np;
1755 thd = req->r_thread;
1756 cred = req->r_cred;
1757 if (IS_VALID_CRED(cred)) {
1758 kauth_cred_ref(cred);
1759 }
1760 cb = req->r_callback;
1761 bp = cb.rcb_bp;
1762 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
1763 nfs_request_ref(req, 0);
1764 }
1765
1766 nmp = NFSTONMP(np);
1767 if (nfs_mount_gone(nmp)) {
1768 SET(bp->nb_flags, NB_ERROR);
1769 bp->nb_error = error = ENXIO;
1770 }
1771 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1772 /* just drop it */
1773 nfs_request_async_cancel(req);
1774 goto out;
1775 }
1776
1777 nfsvers = nmp->nm_vers;
1778 offset = cb.rcb_args[0];
1779 rlen = length = cb.rcb_args[1];
1780
1781 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1782 UIO_READ, &uio_buf, sizeof(uio_buf));
1783 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1784
1785 /* finish the RPC */
1786 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1787 if ((error == EINPROGRESS) && cb.rcb_func) {
1788 /* async request restarted */
1789 if (cb.rcb_func) {
1790 nfs_request_rele(req);
1791 }
1792 if (IS_VALID_CRED(cred)) {
1793 kauth_cred_unref(&cred);
1794 }
1795 return;
1796 }
1797 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1798 lck_mtx_lock(&nmp->nm_lock);
1799 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1800 NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
1801 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
1802 nfs_need_recover(nmp, error);
1803 }
1804 lck_mtx_unlock(&nmp->nm_lock);
1805 if (np->n_flag & NREVOKE) {
1806 error = EIO;
1807 } else {
1808 if (error == NFSERR_GRACE) {
1809 if (cb.rcb_func) {
1810 /*
1811 * For an async I/O request, handle a grace delay just like
1812 * jukebox errors. Set the resend time and queue it up.
1813 */
1814 struct timeval now;
1815 if (req->r_nmrep.nmc_mhead) {
1816 mbuf_freem(req->r_nmrep.nmc_mhead);
1817 req->r_nmrep.nmc_mhead = NULL;
1818 }
1819 req->r_error = 0;
1820 microuptime(&now);
1821 lck_mtx_lock(&req->r_mtx);
1822 req->r_resendtime = now.tv_sec + 2;
1823 req->r_xid = 0; // get a new XID
1824 req->r_flags |= R_RESTART;
1825 req->r_start = 0;
1826 nfs_asyncio_resend(req);
1827 lck_mtx_unlock(&req->r_mtx);
1828 if (IS_VALID_CRED(cred)) {
1829 kauth_cred_unref(&cred);
1830 }
1831 /* Note: nfsreq reference taken will be dropped later when finished */
1832 return;
1833 }
1834 /* otherwise, just pause a couple seconds and retry */
1835 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
1836 }
1837 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1838 rlen = 0;
1839 goto readagain;
1840 }
1841 }
1842 }
1843 if (error) {
1844 SET(bp->nb_flags, NB_ERROR);
1845 bp->nb_error = error;
1846 goto out;
1847 }
1848
1849 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) {
1850 bp->nb_endio = offset + rlen;
1851 }
1852
1853 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1854 /* zero out the remaining data (up to EOF) */
1855 off_t rpcrem, eofrem, rem;
1856 rpcrem = (length - rlen);
1857 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1858 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1859 if (rem > 0) {
1860 bzero(bp->nb_data + offset + rlen, rem);
1861 }
1862 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1863 /*
1864 * short read
1865 *
1866 * We haven't hit EOF and we didn't get all the data
1867 * requested, so we need to issue another read for the rest.
1868 * (Don't bother if the buffer already hit an error.)
1869 */
1870 readagain:
1871 offset += rlen;
1872 length -= rlen;
1873 cb.rcb_args[0] = offset;
1874 cb.rcb_args[1] = length;
1875 if (nmp->nm_vers >= NFS_VER4) {
1876 cb.rcb_args[2] = nmp->nm_stategenid;
1877 }
1878 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1879 if (!error) {
1880 if (IS_VALID_CRED(cred)) {
1881 kauth_cred_unref(&cred);
1882 }
1883 if (!cb.rcb_func) {
1884 /* if !async we'll need to wait for this RPC to finish */
1885 req = rreq;
1886 rreq = NULL;
1887 goto finish;
1888 }
1889 nfs_request_rele(req);
1890 /*
1891 * We're done here.
1892 * Outstanding RPC count is unchanged.
1893 * Callback will be called when RPC is done.
1894 */
1895 return;
1896 }
1897 SET(bp->nb_flags, NB_ERROR);
1898 bp->nb_error = error;
1899 }
1900
1901 out:
1902 if (cb.rcb_func) {
1903 nfs_request_rele(req);
1904 }
1905 if (IS_VALID_CRED(cred)) {
1906 kauth_cred_unref(&cred);
1907 }
1908
1909 /*
1910 * Decrement outstanding RPC count on buffer
1911 * and call nfs_buf_read_finish on last RPC.
1912 *
1913 * (Note: when there are multiple async RPCs issued for a
1914 * buffer we need nfs_buffer_mutex to avoid problems when
1915 * aborting a partially-initiated set of RPCs)
1916 */
1917
1918 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1919 if (multasyncrpc) {
1920 lck_mtx_lock(nfs_buf_mutex);
1921 }
1922
1923 bp->nb_rpcs--;
1924 finished = (bp->nb_rpcs == 0);
1925
1926 if (multasyncrpc) {
1927 lck_mtx_unlock(nfs_buf_mutex);
1928 }
1929
1930 if (finished) {
1931 if (multasyncrpc) {
1932 wakeme = &bp->nb_rpcs;
1933 }
1934 nfs_buf_read_finish(bp);
1935 if (wakeme) {
1936 wakeup(wakeme);
1937 }
1938 }
1939 }
1940
1941 /*
1942 * Do buffer readahead.
1943 * Initiate async I/O to read buffers not in cache.
1944 */
1945 int
1946 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1947 {
1948 struct nfsmount *nmp = NFSTONMP(np);
1949 struct nfsbuf *bp;
1950 int error = 0;
1951 uint32_t nra;
1952
1953 if (nfs_mount_gone(nmp)) {
1954 return ENXIO;
1955 }
1956 if (nmp->nm_readahead <= 0) {
1957 return 0;
1958 }
1959 if (*rabnp > lastrabn) {
1960 return 0;
1961 }
1962
1963 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1964 /* check if block exists and is valid. */
1965 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1966 /* stop reading ahead if we're beyond EOF */
1967 *rabnp = lastrabn;
1968 break;
1969 }
1970 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp);
1971 if (error) {
1972 break;
1973 }
1974 nfs_node_lock_force(np);
1975 np->n_lastrahead = *rabnp;
1976 nfs_node_unlock(np);
1977 if (!bp) {
1978 continue;
1979 }
1980 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1981 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) {
1982 CLR(bp->nb_flags, NB_CACHE);
1983 bp->nb_valid = 0;
1984 bp->nb_validoff = bp->nb_validend = -1;
1985 }
1986 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1987 !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) {
1988 SET(bp->nb_flags, (NB_READ | NB_ASYNC));
1989 if (ioflag & IO_NOCACHE) {
1990 SET(bp->nb_flags, NB_NCRDAHEAD);
1991 }
1992 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
1993 kauth_cred_ref(cred);
1994 bp->nb_rcred = cred;
1995 }
1996 if ((error = nfs_buf_read(bp))) {
1997 break;
1998 }
1999 continue;
2000 }
2001 nfs_buf_release(bp, 1);
2002 }
2003 return error;
2004 }
2005
2006 /*
2007 * NFS buffer I/O for reading files.
2008 */
2009 int
2010 nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
2011 {
2012 vnode_t vp = NFSTOV(np);
2013 struct nfsbuf *bp = NULL;
2014 struct nfsmount *nmp = VTONMP(vp);
2015 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
2016 off_t diff;
2017 int error = 0, n = 0, on = 0;
2018 int nfsvers, biosize, modified, readaheads = 0;
2019 thread_t thd;
2020 kauth_cred_t cred;
2021 int64_t io_resid;
2022
2023 FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
2024
2025 nfsvers = nmp->nm_vers;
2026 biosize = nmp->nm_biosize;
2027 thd = vfs_context_thread(ctx);
2028 cred = vfs_context_ucred(ctx);
2029
2030 if (vnode_vtype(vp) != VREG) {
2031 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
2032 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
2033 return EINVAL;
2034 }
2035
2036 /*
2037 * For NFS, cache consistency can only be maintained approximately.
2038 * Although RFC1094 does not specify the criteria, the following is
2039 * believed to be compatible with the reference port.
2040 *
2041 * If the file has changed since the last read RPC or you have
2042 * written to the file, you may have lost data cache consistency
2043 * with the server. So, check for a change, and flush all of the
2044 * file's data out of the cache.
2045 * NB: This implies that cache data can be read when up to
2046 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2047 * need current attributes, nfs_getattr() can be forced to fetch
2048 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2049 */
2050
2051 if (ISSET(np->n_flag, NUPDATESIZE)) {
2052 nfs_data_update_size(np, 0);
2053 }
2054
2055 if ((error = nfs_node_lock(np))) {
2056 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
2057 return error;
2058 }
2059
2060 if (np->n_flag & NNEEDINVALIDATE) {
2061 np->n_flag &= ~NNEEDINVALIDATE;
2062 nfs_node_unlock(np);
2063 error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1);
2064 if (!error) {
2065 error = nfs_node_lock(np);
2066 }
2067 if (error) {
2068 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
2069 return error;
2070 }
2071 }
2072
2073 modified = (np->n_flag & NMODIFIED);
2074 nfs_node_unlock(np);
2075 /* nfs_getattr() will check changed and purge caches */
2076 error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
2077 if (error) {
2078 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
2079 return error;
2080 }
2081
2082 if (uio_resid(uio) == 0) {
2083 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
2084 return 0;
2085 }
2086 if (uio_offset(uio) < 0) {
2087 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
2088 return EINVAL;
2089 }
2090
2091 /*
2092 * set up readahead - which may be limited by:
2093 * + current request length (for IO_NOCACHE)
2094 * + readahead setting
2095 * + file size
2096 */
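/*
 * Illustrative example (assumed values): for a 64KB read at offset 0
 * of a 1MB file with biosize = 32KB and nm_readahead = 16, rabn = 0
 * and maxrabn = 1; since the read looks sequential (rabn == 0),
 * maxrabn is extended to 17, still short of the file's last block (31),
 * so up to 16 blocks may be read ahead.
 */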
2097 if (nmp->nm_readahead > 0) {
2098 off_t end = uio_offset(uio) + uio_resid(uio);
2099 if (end > (off_t)np->n_size) {
2100 end = np->n_size;
2101 }
2102 rabn = uio_offset(uio) / biosize;
2103 maxrabn = (end - 1) / biosize;
2104 nfs_node_lock_force(np);
2105 if (!(ioflag & IO_NOCACHE) &&
2106 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
2107 maxrabn += nmp->nm_readahead;
2108 if ((maxrabn * biosize) >= (off_t)np->n_size) {
2109 maxrabn = ((off_t)np->n_size - 1) / biosize;
2110 }
2111 }
2112 if (maxrabn < np->n_lastrahead) {
2113 np->n_lastrahead = -1;
2114 }
2115 if (rabn < np->n_lastrahead) {
2116 rabn = np->n_lastrahead + 1;
2117 }
2118 nfs_node_unlock(np);
2119 } else {
2120 rabn = maxrabn = 0;
2121 }
2122
2123 do {
2124 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2125 lbn = uio_offset(uio) / biosize;
2126
2127 /*
2128 * Copy directly from any cached pages without grabbing the bufs.
2129 * (If we are NOCACHE and we've issued readahead requests, we need
2130 * to grab the NB_NCRDAHEAD bufs to drop them.)
2131 */
2132 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2133 ((uio->uio_segflg == UIO_USERSPACE32 ||
2134 uio->uio_segflg == UIO_USERSPACE64 ||
2135 uio->uio_segflg == UIO_USERSPACE))) {
2136 io_resid = uio_resid(uio);
2137 diff = np->n_size - uio_offset(uio);
2138 if (diff < io_resid) {
2139 io_resid = diff;
2140 }
2141 if (io_resid > 0) {
2142 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2143 error = cluster_copy_ubc_data(vp, uio, &count, 0);
2144 if (error) {
2145 nfs_data_unlock(np);
2146 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
2147 return error;
2148 }
2149 }
2150 /* count any biocache reads that we just copied directly */
2151 if (lbn != (uio_offset(uio) / biosize)) {
2152 OSAddAtomic64((uio_offset(uio) / biosize) - lbn, &nfsstats.biocache_reads);
2153 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2154 }
2155 }
2156
2157 lbn = uio_offset(uio) / biosize;
2158 on = uio_offset(uio) % biosize;
2159 nfs_node_lock_force(np);
2160 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2161 nfs_node_unlock(np);
2162
2163 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2164 nfs_data_unlock(np);
2165 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2166 return 0;
2167 }
2168
2169 /* adjust readahead block number, if necessary */
2170 if (rabn < lbn) {
2171 rabn = lbn;
2172 }
2173 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2174 if (rabn <= lastrabn) { /* start readaheads */
2175 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2176 if (error) {
2177 nfs_data_unlock(np);
2178 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2179 return error;
2180 }
2181 readaheads = 1;
2182 }
2183
2184 OSAddAtomic64(1, &nfsstats.biocache_reads);
2185
2186 /*
2187 * If the block is in the cache and has the required data
2188 * in a valid region, just copy it out.
2189 * Otherwise, get the block and write back/read in,
2190 * as required.
2191 */
2192 again:
2193 io_resid = uio_resid(uio);
2194 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2195 diff = np->n_size - uio_offset(uio);
2196 if (diff < n) {
2197 n = diff;
2198 }
2199
2200 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2201 if (error) {
2202 nfs_data_unlock(np);
2203 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2204 return error;
2205 }
2206
2207 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2208 /*
2209 * IO_NOCACHE found a cached buffer.
2210 * Flush the buffer if it's dirty.
2211 * Invalidate the data if it wasn't just read
2212 * in as part of a "nocache readahead".
2213 */
2214 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2215 /* so write the buffer out and try again */
2216 SET(bp->nb_flags, NB_NOCACHE);
2217 goto flushbuffer;
2218 }
2219 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2220 CLR(bp->nb_flags, NB_NCRDAHEAD);
2221 SET(bp->nb_flags, NB_NOCACHE);
2222 }
2223 }
2224
2225 /* if any pages are valid... */
2226 if (bp->nb_valid) {
2227 /* ...check for any invalid pages in the read range */
2228 int pg, firstpg, lastpg, dirtypg;
2229 dirtypg = firstpg = lastpg = -1;
2230 pg = on / PAGE_SIZE;
2231 while (pg <= (on + n - 1) / PAGE_SIZE) {
2232 if (!NBPGVALID(bp, pg)) {
2233 if (firstpg < 0) {
2234 firstpg = pg;
2235 }
2236 lastpg = pg;
2237 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
2238 dirtypg = pg;
2239 }
2240 pg++;
2241 }
2242
2243 /* if there are no invalid pages, we're all set */
2244 if (firstpg < 0) {
2245 if (bp->nb_validoff < 0) {
2246 /* valid range isn't set up, so */
2247 /* set it to what we know is valid */
2248 bp->nb_validoff = trunc_page(on);
2249 bp->nb_validend = round_page(on + n);
2250 nfs_buf_normalize_valid_range(np, bp);
2251 }
2252 goto buffer_ready;
2253 }
2254
2255 /* there are invalid pages in the read range */
2256 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2257 (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
2258 /* there are also dirty page(s) (or range) in the read range, */
2259 /* so write the buffer out and try again */
2260 flushbuffer:
2261 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2262 SET(bp->nb_flags, NB_ASYNC);
2263 if (!IS_VALID_CRED(bp->nb_wcred)) {
2264 kauth_cred_ref(cred);
2265 bp->nb_wcred = cred;
2266 }
2267 error = nfs_buf_write(bp);
2268 if (error) {
2269 nfs_data_unlock(np);
2270 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2271 return error;
2272 }
2273 goto again;
2274 }
2275 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2276 (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
2277 /* we need to read in more than half the buffer and the */
2278 /* buffer's not dirty, so just fetch the whole buffer */
2279 bp->nb_valid = 0;
2280 } else {
2281 /* read the page range in */
2282 uio_t auio;
2283 char uio_buf[UIO_SIZEOF(1)];
2284
2285 NFS_BUF_MAP(bp);
2286 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2287 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2288 if (!auio) {
2289 error = ENOMEM;
2290 } else {
2291 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2292 ((lastpg - firstpg + 1) * PAGE_SIZE));
2293 error = nfs_read_rpc(np, auio, ctx);
2294 }
2295 if (error) {
2296 if (ioflag & IO_NOCACHE) {
2297 SET(bp->nb_flags, NB_NOCACHE);
2298 }
2299 nfs_buf_release(bp, 1);
2300 nfs_data_unlock(np);
2301 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2302 return error;
2303 }
2304 /* Make sure that the valid range is set to cover this read. */
2305 bp->nb_validoff = trunc_page_32(on);
2306 bp->nb_validend = round_page_32(on + n);
2307 nfs_buf_normalize_valid_range(np, bp);
2308 if (uio_resid(auio) > 0) {
2309 /* if short read, must have hit EOF, */
2310 /* so zero the rest of the range */
2311 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2312 }
2313 /* mark the pages (successfully read) as valid */
2314 for (pg = firstpg; pg <= lastpg; pg++) {
2315 NBPGVALID_SET(bp, pg);
2316 }
2317 }
2318 }
2319 /* if no pages are valid, read the whole block */
2320 if (!bp->nb_valid) {
2321 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2322 kauth_cred_ref(cred);
2323 bp->nb_rcred = cred;
2324 }
2325 SET(bp->nb_flags, NB_READ);
2326 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2327 error = nfs_buf_read(bp);
2328 if (ioflag & IO_NOCACHE) {
2329 SET(bp->nb_flags, NB_NOCACHE);
2330 }
2331 if (error) {
2332 nfs_data_unlock(np);
2333 nfs_buf_release(bp, 1);
2334 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2335 return error;
2336 }
2337 }
2338 buffer_ready:
2339 /* validate read range against valid range and clip */
2340 if (bp->nb_validend > 0) {
2341 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2342 if (diff < n) {
2343 n = diff;
2344 }
2345 }
2346 if (n > 0) {
2347 NFS_BUF_MAP(bp);
2348 error = uiomove(bp->nb_data + on, n, uio);
2349 }
2350
2351 nfs_buf_release(bp, 1);
2352 nfs_data_unlock(np);
2353 nfs_node_lock_force(np);
2354 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2355 nfs_node_unlock(np);
2356 } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2357 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2358 return error;
2359 }
2360
2361 /*
2362 * limit the number of outstanding async I/O writes
2363 */
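/*
 * Callers pair nfs_async_write_start() with nfs_async_write_done()
 * once the async write RPC completes (see nfs_buf_write_rpc() and
 * nfs_buf_write_rpc_finish() below).
 */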
2364 int
2365 nfs_async_write_start(struct nfsmount *nmp)
2366 {
2367 int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
2368 struct timespec ts = {1, 0};
2369
2370 if (nfs_max_async_writes <= 0) {
2371 return 0;
2372 }
2373 lck_mtx_lock(&nmp->nm_lock);
2374 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2375 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
2376 break;
2377 }
2378 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
2379 slpflag = 0;
2380 }
2381 if (!error) {
2382 nmp->nm_asyncwrites++;
2383 }
2384 lck_mtx_unlock(&nmp->nm_lock);
2385 return error;
2386 }
2387 void
2388 nfs_async_write_done(struct nfsmount *nmp)
2389 {
2390 if (nmp->nm_asyncwrites <= 0) {
2391 return;
2392 }
2393 lck_mtx_lock(&nmp->nm_lock);
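/* note: nm_asyncwrites is post-decremented; wake waiters only if it was at or above the limit */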
2394 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
2395 wakeup(&nmp->nm_asyncwrites);
2396 }
2397 lck_mtx_unlock(&nmp->nm_lock);
2398 }
2399
2400 /*
2401 * write (or commit) the given NFS buffer
2402 *
2403 * Commit the buffer if we can.
2404 * Write out any dirty range.
2405 * If any dirty pages remain, write them out.
2406 * Mark buffer done.
2407 *
2408 * For async requests, all the work beyond sending the initial
2409 * write RPC is handled in the RPC callback(s).
2410 */
2411 int
2412 nfs_buf_write(struct nfsbuf *bp)
2413 {
2414 int error = 0, oldflags, async;
2415 nfsnode_t np;
2416 thread_t thd;
2417 kauth_cred_t cred;
2418 proc_t p = current_proc();
2419 int iomode, doff, dend, firstpg, lastpg;
2420 uint32_t pagemask;
2421
2422 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2423
2424 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
2425 panic("nfs_buf_write: buffer is not busy???");
2426 }
2427
2428 np = bp->nb_np;
2429 async = ISSET(bp->nb_flags, NB_ASYNC);
2430 oldflags = bp->nb_flags;
2431
2432 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
2433 if (ISSET(oldflags, NB_DELWRI)) {
2434 lck_mtx_lock(nfs_buf_mutex);
2435 nfs_nbdwrite--;
2436 NFSBUFCNTCHK();
2437 lck_mtx_unlock(nfs_buf_mutex);
2438 wakeup(&nfs_nbdwrite);
2439 }
2440
2441 /* move to clean list */
2442 if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
2443 lck_mtx_lock(nfs_buf_mutex);
2444 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2445 LIST_REMOVE(bp, nb_vnbufs);
2446 }
2447 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2448 lck_mtx_unlock(nfs_buf_mutex);
2449 }
2450 nfs_node_lock_force(np);
2451 np->n_numoutput++;
2452 nfs_node_unlock(np);
2453 vnode_startwrite(NFSTOV(np));
2454
2455 if (p && p->p_stats) {
2456 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2457 }
2458
2459 cred = bp->nb_wcred;
2460 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
2461 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2462 }
2463 if (IS_VALID_CRED(cred)) {
2464 kauth_cred_ref(cred);
2465 }
2466 thd = async ? NULL : current_thread();
2467
2468 /* We need to make sure the pages are locked before doing I/O. */
2469 if (!ISSET(bp->nb_flags, NB_META)) {
2470 if (UBCINFOEXISTS(NFSTOV(np))) {
2471 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2472 error = nfs_buf_upl_setup(bp);
2473 if (error) {
2474 printf("nfs_buf_write: upl create failed %d\n", error);
2475 SET(bp->nb_flags, NB_ERROR);
2476 bp->nb_error = error = EIO;
2477 nfs_buf_iodone(bp);
2478 goto out;
2479 }
2480 nfs_buf_upl_check(bp);
2481 }
2482 } else {
2483 /* We should never be in nfs_buf_write() with no UBCINFO. */
2484 printf("nfs_buf_write: ubcinfo already gone\n");
2485 SET(bp->nb_flags, NB_ERROR);
2486 bp->nb_error = error = EIO;
2487 nfs_buf_iodone(bp);
2488 goto out;
2489 }
2490 }
2491
2492 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2493 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2494 nfs_buf_check_write_verifier(np, bp);
2495 }
2496 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2497 struct nfsmount *nmp = NFSTONMP(np);
2498 if (nfs_mount_gone(nmp)) {
2499 SET(bp->nb_flags, NB_ERROR);
2500 bp->nb_error = error = EIO;
2501 nfs_buf_iodone(bp);
2502 goto out;
2503 }
2504 SET(bp->nb_flags, NB_WRITEINPROG);
2505 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2506 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2507 CLR(bp->nb_flags, NB_WRITEINPROG);
2508 if (error) {
2509 if (error != NFSERR_STALEWRITEVERF) {
2510 SET(bp->nb_flags, NB_ERROR);
2511 bp->nb_error = error;
2512 }
2513 nfs_buf_iodone(bp);
2514 goto out;
2515 }
2516 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2517 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2518 nfs_node_lock_force(np);
2519 np->n_needcommitcnt--;
2520 CHECK_NEEDCOMMITCNT(np);
2521 nfs_node_unlock(np);
2522 }
2523 if (!error && (bp->nb_dirtyend > 0)) {
2524 /* sanity check the dirty range */
2525 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2526 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2527 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2528 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2529 }
2530 }
2531 }
2532 if (!error && (bp->nb_dirtyend > 0)) {
2533 /* there's a dirty range that needs to be written out */
2534 NFS_BUF_MAP(bp);
2535
2536 doff = bp->nb_dirtyoff;
2537 dend = bp->nb_dirtyend;
2538
2539 /* if doff page is dirty, move doff to start of page */
2540 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
2541 doff -= doff & PAGE_MASK;
2542 }
2543 /* try to expand write range to include preceding dirty pages */
2544 if (!(doff & PAGE_MASK)) {
2545 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
2546 doff -= PAGE_SIZE;
2547 }
2548 }
2549 /* if dend page is dirty, move dend to start of next page */
2550 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2551 dend = round_page_32(dend);
2552 }
2553 /* try to expand write range to include trailing dirty pages */
2554 if (!(dend & PAGE_MASK)) {
2555 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2556 dend += PAGE_SIZE;
2557 }
2558 }
2559 /* make sure to keep dend clipped to EOF */
2560 if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
2561 dend = np->n_size - NBOFF(bp);
2562 }
2563 /* calculate range of complete pages being written */
2564 firstpg = round_page_32(doff) / PAGE_SIZE;
2565 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2566 /* calculate mask for that page range */
2567 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
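/*
 * e.g. (illustrative values) firstpg = 1, lastpg = 3:
 * ((1 << 4) - 1) & ~((1 << 1) - 1) = 0x0f & ~0x01 = 0x0e,
 * i.e. the mask covers pages 1 through 3.
 */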
2568
2569 /*
2570 * Compare the page mask to nb_dirty: if any dirty pages fall outside
2571 * the range being written, use FILESYNC; otherwise use UNSTABLE for
2572 * async writes that aren't needcommit/stable, and FILESYNC for the rest.
2573 */
2574 if (bp->nb_dirty & ~pagemask) {
2575 iomode = NFS_WRITE_FILESYNC;
2576 } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
2577 iomode = NFS_WRITE_UNSTABLE;
2578 } else {
2579 iomode = NFS_WRITE_FILESYNC;
2580 }
2581
2582 /* write the whole contiguous dirty range */
2583 bp->nb_offio = doff;
2584 bp->nb_endio = dend;
2585
2586 OSAddAtomic64(1, &nfsstats.write_bios);
2587
2588 SET(bp->nb_flags, NB_WRITEINPROG);
2589 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2590 /*
2591 * For async I/O, the callbacks will finish up the
2592 * write and push out any dirty pages. Otherwise,
2593 * the write has already been finished and any dirty
2594 * pages pushed out.
2595 */
2596 } else {
2597 if (!error && bp->nb_dirty) { /* write out any dirty pages */
2598 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2599 }
2600 nfs_buf_iodone(bp);
2601 }
2602 /* note: bp is still valid only for !async case */
2603 out:
2604 if (!async) {
2605 error = nfs_buf_iowait(bp);
2606 /* move to clean list */
2607 if (oldflags & NB_DELWRI) {
2608 lck_mtx_lock(nfs_buf_mutex);
2609 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2610 LIST_REMOVE(bp, nb_vnbufs);
2611 }
2612 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2613 lck_mtx_unlock(nfs_buf_mutex);
2614 }
2615 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2616 nfs_buf_release(bp, 1);
2617 /* check if we need to invalidate (and we can) */
2618 if ((np->n_flag & NNEEDINVALIDATE) &&
2619 !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
2620 int invalidate = 0;
2621 nfs_node_lock_force(np);
2622 if (np->n_flag & NNEEDINVALIDATE) {
2623 invalidate = 1;
2624 np->n_flag &= ~NNEEDINVALIDATE;
2625 }
2626 nfs_node_unlock(np);
2627 if (invalidate) {
2628 /*
2629 * There was a write error and we need to
2630 * invalidate attrs and flush buffers in
2631 * order to sync up with the server.
2632 * (if this write was extending the file,
2633 * we may no longer know the correct size)
2634 *
2635 * But we couldn't call vinvalbuf while holding
2636 * the buffer busy. So we call vinvalbuf() after
2637 * releasing the buffer.
2638 */
2639 nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
2640 }
2641 }
2642 }
2643
2644 if (IS_VALID_CRED(cred)) {
2645 kauth_cred_unref(&cred);
2646 }
2647 return error;
2648 }
2649
2650 /*
2651 * finish the writing of a buffer
2652 */
2653 void
2654 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2655 {
2656 nfsnode_t np = bp->nb_np;
2657 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2658 int firstpg, lastpg;
2659 uint32_t pagemask;
2660
2661 if ((error == EINTR) || (error == ERESTART)) {
2662 CLR(bp->nb_flags, NB_ERROR);
2663 SET(bp->nb_flags, NB_EINTR);
2664 }
2665
2666 if (!error) {
2667 /* calculate range of complete pages being written */
2668 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2669 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2670 /* calculate mask for that page range written */
2671 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2672 /* clear dirty bits for pages we've written */
2673 bp->nb_dirty &= ~pagemask;
2674 }
2675
2676 /* manage needcommit state */
2677 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2678 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2679 nfs_node_lock_force(np);
2680 np->n_needcommitcnt++;
2681 nfs_node_unlock(np);
2682 SET(bp->nb_flags, NB_NEEDCOMMIT);
2683 }
2684 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2685 bp->nb_dirtyoff = bp->nb_offio;
2686 bp->nb_dirtyend = bp->nb_endio;
2687 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2688 nfs_node_lock_force(np);
2689 np->n_needcommitcnt--;
2690 CHECK_NEEDCOMMITCNT(np);
2691 nfs_node_unlock(np);
2692 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2693 }
2694
2695 CLR(bp->nb_flags, NB_WRITEINPROG);
2696
2697 /*
2698 * For an unstable write, the buffer is still treated as dirty until
2699 * a commit (or stable (re)write) is performed. Buffers needing only
2700 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2701 *
2702 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2703 * because that would cause the buffer to be dropped. The buffer is
2704 * still valid and simply needs to be written again.
2705 */
2706 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2707 CLR(bp->nb_flags, NB_INVAL);
2708 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2709 SET(bp->nb_flags, NB_DELWRI);
2710 lck_mtx_lock(nfs_buf_mutex);
2711 nfs_nbdwrite++;
2712 NFSBUFCNTCHK();
2713 lck_mtx_unlock(nfs_buf_mutex);
2714 }
2715 /*
2716 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2717 * clean list, we have to reassign it back to the dirty one. Ugh.
2718 */
2719 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2720 /* move to dirty list */
2721 lck_mtx_lock(nfs_buf_mutex);
2722 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2723 LIST_REMOVE(bp, nb_vnbufs);
2724 }
2725 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2726 lck_mtx_unlock(nfs_buf_mutex);
2727 }
2728 } else {
2729 /* either there's an error or we don't need to commit */
2730 if (error) {
2731 /*
2732 * There was a write error and we need to invalidate
2733 * attrs and flush buffers in order to sync up with the
2734 * server. (if this write was extending the file, we
2735 * may no longer know the correct size)
2736 *
2737 * But we can't call vinvalbuf while holding this
2738 * buffer busy. Set a flag to do it after releasing
2739 * the buffer.
2740 */
2741 nfs_node_lock_force(np);
2742 np->n_error = error;
2743 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2744 NATTRINVALIDATE(np);
2745 nfs_node_unlock(np);
2746 }
2747 /* clear the dirty range */
2748 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2749 }
2750
2751 if (!error && bp->nb_dirty) {
2752 nfs_buf_write_dirty_pages(bp, thd, cred);
2753 }
2754 nfs_buf_iodone(bp);
2755 }
2756
2757 /*
2758 * write out any pages marked dirty in a buffer
2759 *
2760 * We do use unstable writes and follow up with a commit.
2761 * If we catch the write verifier changing, we'll restart and
2762 * redo the writes FILESYNC.
2763 */
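/*
 * The "again:" loop below restarts with iomode = NFS_WRITE_FILESYNC if
 * the verifier changes mid-pass or the follow-up commit returns
 * NFSERR_STALEWRITEVERF.
 */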
2764 int
2765 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2766 {
2767 nfsnode_t np = bp->nb_np;
2768 struct nfsmount *nmp = NFSTONMP(np);
2769 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2770 uint32_t dirty = bp->nb_dirty;
2771 uint64_t wverf;
2772 uio_t auio;
2773 char uio_buf[UIO_SIZEOF(1)];
2774
2775 if (!bp->nb_dirty) {
2776 return 0;
2777 }
2778
2779 /* there are pages marked dirty that need to be written out */
2780 OSAddAtomic64(1, &nfsstats.write_bios);
2781 NFS_BUF_MAP(bp);
2782 SET(bp->nb_flags, NB_WRITEINPROG);
2783 npages = bp->nb_bufsize / PAGE_SIZE;
2784 iomode = NFS_WRITE_UNSTABLE;
2785
2786 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2787 &uio_buf, sizeof(uio_buf));
2788
2789 again:
2790 dirty = bp->nb_dirty;
2791 wverf = bp->nb_verf;
2792 commit = NFS_WRITE_FILESYNC;
2793 for (pg = 0; pg < npages; pg++) {
2794 if (!NBPGDIRTY(bp, pg)) {
2795 continue;
2796 }
2797 count = 1;
2798 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
2799 count++;
2800 }
2801 /* write count pages starting with page pg */
2802 off = pg * PAGE_SIZE;
2803 len = count * PAGE_SIZE;
2804 /* clip writes to EOF */
2805 if (NBOFF(bp) + off + len > (off_t) np->n_size) {
2806 len -= (NBOFF(bp) + off + len) - np->n_size;
2807 }
2808 if (len > 0) {
2809 iomode2 = iomode;
2810 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2811 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2812 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2813 if (error) {
2814 break;
2815 }
2816 if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
2817 commit = iomode2;
2818 }
2819 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2820 /* verifier changed, redo all the writes filesync */
2821 iomode = NFS_WRITE_FILESYNC;
2822 goto again;
2823 }
2824 }
2825 /* clear dirty bits */
2826 while (count--) {
2827 dirty &= ~(1 << pg);
2828 if (count) { /* leave pg on last page */
2829 pg++;
2830 }
2831 }
2832 }
2833 CLR(bp->nb_flags, NB_WRITEINPROG);
2834
2835 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2836 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2837 if (error == NFSERR_STALEWRITEVERF) {
2838 /* verifier changed, so we need to restart all the writes */
2839 iomode = NFS_WRITE_FILESYNC;
2840 goto again;
2841 }
2842 }
2843 if (!error) {
2844 bp->nb_dirty = dirty;
2845 } else {
2846 SET(bp->nb_flags, NB_ERROR);
2847 bp->nb_error = error;
2848 }
2849 return error;
2850 }
2851
2852 /*
2853 * initiate the NFS WRITE RPC(s) for a buffer
2854 */
2855 int
2856 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2857 {
2858 struct nfsmount *nmp;
2859 nfsnode_t np = bp->nb_np;
2860 int error = 0, nfsvers, async;
2861 int offset, nrpcs;
2862 uint32_t nmwsize, length, len;
2863 struct nfsreq *req;
2864 struct nfsreq_cbinfo cb;
2865 uio_t auio;
2866 char uio_buf[UIO_SIZEOF(1)];
2867
2868 nmp = NFSTONMP(np);
2869 if (nfs_mount_gone(nmp)) {
2870 bp->nb_error = error = ENXIO;
2871 SET(bp->nb_flags, NB_ERROR);
2872 nfs_buf_iodone(bp);
2873 return error;
2874 }
2875 nfsvers = nmp->nm_vers;
2876 nmwsize = nmp->nm_wsize;
2877
2878 offset = bp->nb_offio;
2879 length = bp->nb_endio - bp->nb_offio;
2880
2881 /* Note: Can only do async I/O if nfsiods are configured. */
2882 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2883 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2884 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2885 cb.rcb_bp = bp;
2886
2887 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2888 bp->nb_error = error = EFBIG;
2889 SET(bp->nb_flags, NB_ERROR);
2890 nfs_buf_iodone(bp);
2891 return error;
2892 }
2893
2894 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2895 UIO_WRITE, &uio_buf, sizeof(uio_buf));
2896 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2897
2898 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
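/* e.g. (illustrative values) a 20KB dirty range with nm_wsize = 8KB splits into 3 write RPCs */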
2899 if (async && (nrpcs > 1)) {
2900 SET(bp->nb_flags, NB_MULTASYNCRPC);
2901 } else {
2902 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2903 }
2904
2905 while (length > 0) {
2906 if (ISSET(bp->nb_flags, NB_ERROR)) {
2907 error = bp->nb_error;
2908 break;
2909 }
2910 len = (length > nmwsize) ? nmwsize : length;
2911 cb.rcb_args[0] = offset;
2912 cb.rcb_args[1] = len;
2913 if (nmp->nm_vers >= NFS_VER4) {
2914 cb.rcb_args[2] = nmp->nm_stategenid;
2915 }
2916 if (async && ((error = nfs_async_write_start(nmp)))) {
2917 break;
2918 }
2919 req = NULL;
2920 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2921 iomode, &cb, &req);
2922 if (error) {
2923 if (async) {
2924 nfs_async_write_done(nmp);
2925 }
2926 break;
2927 }
2928 offset += len;
2929 length -= len;
2930 if (async) {
2931 continue;
2932 }
2933 nfs_buf_write_rpc_finish(req);
2934 }
2935
2936 if (length > 0) {
2937 /*
2938 * Something bad happened while trying to send the RPCs.
2939 * Wait for any outstanding requests to complete.
2940 */
2941 bp->nb_error = error;
2942 SET(bp->nb_flags, NB_ERROR);
2943 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2944 nrpcs = (length + nmwsize - 1) / nmwsize;
2945 lck_mtx_lock(nfs_buf_mutex);
2946 bp->nb_rpcs -= nrpcs;
2947 if (bp->nb_rpcs == 0) {
2948 /* No RPCs left, so the buffer's done */
2949 lck_mtx_unlock(nfs_buf_mutex);
2950 nfs_buf_write_finish(bp, thd, cred);
2951 } else {
2952 /* wait for the last RPC to mark it done */
2953 while (bp->nb_rpcs > 0) {
2954 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2955 "nfs_buf_write_rpc_cancel", NULL);
2956 }
2957 lck_mtx_unlock(nfs_buf_mutex);
2958 }
2959 } else {
2960 nfs_buf_write_finish(bp, thd, cred);
2961 }
2962 /* It may have just been an interrupt... that's OK */
2963 if (!ISSET(bp->nb_flags, NB_ERROR)) {
2964 error = 0;
2965 }
2966 }
2967
2968 return error;
2969 }
2970
2971 /*
2972 * finish up an NFS WRITE RPC on a buffer
2973 */
2974 void
2975 nfs_buf_write_rpc_finish(struct nfsreq *req)
2976 {
2977 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2978 int committed = NFS_WRITE_FILESYNC;
2979 uint64_t wverf = 0;
2980 size_t rlen;
2981 void *wakeme = NULL;
2982 struct nfsreq_cbinfo cb;
2983 struct nfsreq *wreq = NULL;
2984 struct nfsbuf *bp;
2985 struct nfsmount *nmp;
2986 nfsnode_t np;
2987 thread_t thd;
2988 kauth_cred_t cred;
2989 uio_t auio;
2990 char uio_buf[UIO_SIZEOF(1)];
2991
2992 finish:
2993 np = req->r_np;
2994 thd = req->r_thread;
2995 cred = req->r_cred;
2996 if (IS_VALID_CRED(cred)) {
2997 kauth_cred_ref(cred);
2998 }
2999 cb = req->r_callback;
3000 bp = cb.rcb_bp;
3001 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
3002 nfs_request_ref(req, 0);
3003 }
3004
3005 nmp = NFSTONMP(np);
3006 if (nfs_mount_gone(nmp)) {
3007 SET(bp->nb_flags, NB_ERROR);
3008 bp->nb_error = error = ENXIO;
3009 }
3010 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
3011 /* just drop it */
3012 nfs_request_async_cancel(req);
3013 goto out;
3014 }
3015 nfsvers = nmp->nm_vers;
3016
3017 offset = cb.rcb_args[0];
3018 rlen = length = cb.rcb_args[1];
3019
3020 /* finish the RPC */
3021 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
3022 if ((error == EINPROGRESS) && cb.rcb_func) {
3023 /* async request restarted */
3024 if (cb.rcb_func) {
3025 nfs_request_rele(req);
3026 }
3027 if (IS_VALID_CRED(cred)) {
3028 kauth_cred_unref(&cred);
3029 }
3030 return;
3031 }
3032 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
3033 lck_mtx_lock(&nmp->nm_lock);
3034 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
3035 NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
3036 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
3037 nfs_need_recover(nmp, error);
3038 }
3039 lck_mtx_unlock(&nmp->nm_lock);
3040 if (np->n_flag & NREVOKE) {
3041 error = EIO;
3042 } else {
3043 if (error == NFSERR_GRACE) {
3044 if (cb.rcb_func) {
3045 /*
3046 * For an async I/O request, handle a grace delay just like
3047 * jukebox errors. Set the resend time and queue it up.
3048 */
3049 struct timeval now;
3050 if (req->r_nmrep.nmc_mhead) {
3051 mbuf_freem(req->r_nmrep.nmc_mhead);
3052 req->r_nmrep.nmc_mhead = NULL;
3053 }
3054 req->r_error = 0;
3055 microuptime(&now);
3056 lck_mtx_lock(&req->r_mtx);
3057 req->r_resendtime = now.tv_sec + 2;
3058 req->r_xid = 0; // get a new XID
3059 req->r_flags |= R_RESTART;
3060 req->r_start = 0;
3061 nfs_asyncio_resend(req);
3062 lck_mtx_unlock(&req->r_mtx);
3063 if (IS_VALID_CRED(cred)) {
3064 kauth_cred_unref(&cred);
3065 }
3066 /* Note: nfsreq reference taken will be dropped later when finished */
3067 return;
3068 }
3069 /* otherwise, just pause a couple seconds and retry */
3070 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
3071 }
3072 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
3073 rlen = 0;
3074 goto writeagain;
3075 }
3076 }
3077 }
3078 if (error) {
3079 SET(bp->nb_flags, NB_ERROR);
3080 bp->nb_error = error;
3081 }
3082 if (error || (nfsvers == NFS_VER2)) {
3083 goto out;
3084 }
3085 if (rlen <= 0) {
3086 SET(bp->nb_flags, NB_ERROR);
3087 bp->nb_error = error = EIO;
3088 goto out;
3089 }
3090
3091 /* save lowest commit level returned */
3092 if (committed < bp->nb_commitlevel) {
3093 bp->nb_commitlevel = committed;
3094 }
3095
3096 /* check the write verifier */
3097 if (!bp->nb_verf) {
3098 bp->nb_verf = wverf;
3099 } else if (bp->nb_verf != wverf) {
3100 /* verifier changed, so buffer will need to be rewritten */
3101 bp->nb_flags |= NB_STALEWVERF;
3102 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
3103 bp->nb_verf = wverf;
3104 }
3105
3106 /*
3107 * check for a short write
3108 *
3109 * If the server didn't write all the data, then we
3110 * need to issue another write for the rest of it.
3111 * (Don't bother if the buffer hit an error or stale wverf.)
3112 */
3113 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
3114 writeagain:
3115 offset += rlen;
3116 length -= rlen;
3117
3118 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
3119 UIO_WRITE, &uio_buf, sizeof(uio_buf));
3120 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
3121
3122 cb.rcb_args[0] = offset;
3123 cb.rcb_args[1] = length;
3124 if (nmp->nm_vers >= NFS_VER4) {
3125 cb.rcb_args[2] = nmp->nm_stategenid;
3126 }
3127
3128 // XXX iomode should really match the original request
3129 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
3130 NFS_WRITE_FILESYNC, &cb, &wreq);
3131 if (!error) {
3132 if (IS_VALID_CRED(cred)) {
3133 kauth_cred_unref(&cred);
3134 }
3135 if (!cb.rcb_func) {
3136 /* if !async we'll need to wait for this RPC to finish */
3137 req = wreq;
3138 wreq = NULL;
3139 goto finish;
3140 }
3141 nfs_request_rele(req);
3142 /*
3143 * We're done here.
3144 * Outstanding RPC count is unchanged.
3145 * Callback will be called when RPC is done.
3146 */
3147 return;
3148 }
3149 SET(bp->nb_flags, NB_ERROR);
3150 bp->nb_error = error;
3151 }
3152
3153 out:
3154 if (cb.rcb_func) {
3155 nfs_async_write_done(nmp);
3156 nfs_request_rele(req);
3157 }
3158 /*
3159 * Decrement outstanding RPC count on buffer
3160 * and call nfs_buf_write_finish on last RPC.
3161 *
3162 * (Note: when there are multiple async RPCs issued for a
3163 * buffer we need nfs_buffer_mutex to avoid problems when
3164 * aborting a partially-initiated set of RPCs)
3165 */
3166 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3167 if (multasyncrpc) {
3168 lck_mtx_lock(nfs_buf_mutex);
3169 }
3170
3171 bp->nb_rpcs--;
3172 finished = (bp->nb_rpcs == 0);
3173
3174 if (multasyncrpc) {
3175 lck_mtx_unlock(nfs_buf_mutex);
3176 }
3177
3178 if (finished) {
3179 if (multasyncrpc) {
3180 wakeme = &bp->nb_rpcs;
3181 }
3182 nfs_buf_write_finish(bp, thd, cred);
3183 if (wakeme) {
3184 wakeup(wakeme);
3185 }
3186 }
3187
3188 if (IS_VALID_CRED(cred)) {
3189 kauth_cred_unref(&cred);
3190 }
3191 }
3192
3193 /*
3194 * Send commit(s) for the given node's "needcommit" buffers
3195 */
3196 int
3197 nfs_flushcommits(nfsnode_t np, int nowait)
3198 {
3199 struct nfsmount *nmp;
3200 struct nfsbuf *bp, *prevlbp, *lbp;
3201 struct nfsbuflists blist, commitlist;
3202 int error = 0, retv, wcred_set, flags, dirty;
3203 u_quad_t off, endoff, toff;
3204 uint64_t wverf;
3205 u_int32_t count;
3206 kauth_cred_t wcred = NULL;
3207
3208 FSDBG_TOP(557, np, 0, 0, 0);
3209
3210 /*
3211 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3212 * server, but has not been committed to stable storage on the server
3213 * yet. The byte range is worked out for as many nfsbufs as we can handle
3214 * and the commit RPC is done.
3215 */
3216 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3217 error = nfs_node_lock(np);
3218 if (error) {
3219 goto done;
3220 }
3221 np->n_flag |= NMODIFIED;
3222 nfs_node_unlock(np);
3223 }
3224
3225 off = (u_quad_t)-1;
3226 endoff = 0;
3227 wcred_set = 0;
3228 LIST_INIT(&commitlist);
3229
3230 nmp = NFSTONMP(np);
3231 if (nfs_mount_gone(nmp)) {
3232 error = ENXIO;
3233 goto done;
3234 }
3235 if (nmp->nm_vers == NFS_VER2) {
3236 error = EINVAL;
3237 goto done;
3238 }
3239
3240 flags = NBI_DIRTY;
3241 if (nowait) {
3242 flags |= NBI_NOWAIT;
3243 }
3244 lck_mtx_lock(nfs_buf_mutex);
3245 wverf = nmp->nm_verf;
3246 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3247 while ((bp = LIST_FIRST(&blist))) {
3248 LIST_REMOVE(bp, nb_vnbufs);
3249 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3250 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3251 if (error) {
3252 continue;
3253 }
3254 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3255 nfs_buf_check_write_verifier(np, bp);
3256 }
3257 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3258 (bp->nb_verf != wverf)) {
3259 nfs_buf_drop(bp);
3260 continue;
3261 }
3262 nfs_buf_remfree(bp);
3263
3264 /* buffer UPLs will be grabbed *in order* below */
3265
3266 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3267 FSDBG(557, bp->nb_validoff, bp->nb_validend,
3268 bp->nb_dirtyoff, bp->nb_dirtyend);
3269
3270 /*
3271 * Work out if all buffers are using the same cred
3272 * so we can deal with them all with one commit.
3273 *
3274 * Note: creds in bp's must be obtained by kauth_cred_ref
3275 * on the same original cred in order for them to be equal.
3276 */
3277 if (wcred_set == 0) {
3278 wcred = bp->nb_wcred;
3279 if (!IS_VALID_CRED(wcred)) {
3280 panic("nfs: needcommit w/out wcred");
3281 }
3282 wcred_set = 1;
3283 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3284 wcred_set = -1;
3285 }
3286 SET(bp->nb_flags, NB_WRITEINPROG);
3287
3288 /*
3289 * Add this buffer to the list of buffers we are committing.
3290 * Buffers are inserted into the list in ascending order so that
3291 * we can take the UPLs in order after the list is complete.
3292 */
3293 prevlbp = NULL;
3294 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3295 if (bp->nb_lblkno < lbp->nb_lblkno) {
3296 break;
3297 }
3298 prevlbp = lbp;
3299 }
3300 LIST_REMOVE(bp, nb_vnbufs);
3301 if (prevlbp) {
3302 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3303 } else {
3304 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3305 }
3306
3307 /* update commit range start, end */
3308 toff = NBOFF(bp) + bp->nb_dirtyoff;
3309 if (toff < off) {
3310 off = toff;
3311 }
3312 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3313 if (toff > endoff) {
3314 endoff = toff;
3315 }
3316 }
3317 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3318 }
3319 lck_mtx_unlock(nfs_buf_mutex);
3320
3321 if (LIST_EMPTY(&commitlist)) {
3322 error = ENOBUFS;
3323 goto done;
3324 }
3325
3326 /*
3327 * We need a UPL to prevent others from accessing the buffers during
3328 * our commit RPC(s).
3329 *
3330 * We used to also check for dirty pages here; if there were any we'd
3331 * abort the commit and force the entire buffer to be written again.
3332 * Instead of doing that, we just go ahead and commit the dirty range,
3333 * and then leave the buffer around with dirty pages that will be
3334 * written out later.
3335 */
3336 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3337 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3338 retv = nfs_buf_upl_setup(bp);
3339 if (retv) {
3340 /* Unable to create the UPL, the VM object probably no longer exists. */
3341 printf("nfs_flushcommits: upl create failed %d\n", retv);
3342 bp->nb_valid = bp->nb_dirty = 0;
3343 }
3344 }
3345 nfs_buf_upl_check(bp);
3346 }
3347
3348 /*
3349 * Commit data on the server, as required.
3350 * If all bufs are using the same wcred, then use that with
3351 * one call for all of them, otherwise commit each one
3352 * separately.
3353 */
3354 if (wcred_set == 1) {
3355 /*
3356 * Note, it's possible the commit range could be >2^32-1.
3357 * If it is, we'll send one commit that covers the whole file.
3358 */
3359 if ((endoff - off) > 0xffffffff) {
3360 count = 0;
3361 } else {
3362 count = (endoff - off);
3363 }
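/*
 * A count of 0 in the COMMIT request asks the server to commit from
 * the given offset through the end of the file (per RFC 1813).
 */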
3364 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
3365 } else {
3366 retv = 0;
3367 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3368 toff = NBOFF(bp) + bp->nb_dirtyoff;
3369 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3370 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
3371 if (retv) {
3372 break;
3373 }
3374 }
3375 }
3376
3377 /*
3378 * Now, either mark the blocks I/O done or mark the
3379 * blocks dirty, depending on whether the commit
3380 * succeeded.
3381 */
3382 while ((bp = LIST_FIRST(&commitlist))) {
3383 LIST_REMOVE(bp, nb_vnbufs);
3384 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3385 nfs_node_lock_force(np);
3386 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3387 np->n_needcommitcnt--;
3388 CHECK_NEEDCOMMITCNT(np);
3389 nfs_node_unlock(np);
3390
3391 if (retv) {
3392 /* move back to dirty list */
3393 lck_mtx_lock(nfs_buf_mutex);
3394 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3395 lck_mtx_unlock(nfs_buf_mutex);
3396 nfs_buf_release(bp, 1);
3397 continue;
3398 }
3399
3400 nfs_node_lock_force(np);
3401 np->n_numoutput++;
3402 nfs_node_unlock(np);
3403 vnode_startwrite(NFSTOV(np));
3404 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3405 lck_mtx_lock(nfs_buf_mutex);
3406 nfs_nbdwrite--;
3407 NFSBUFCNTCHK();
3408 lck_mtx_unlock(nfs_buf_mutex);
3409 wakeup(&nfs_nbdwrite);
3410 }
3411 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
3412 /* if block still has dirty pages, we don't want it to */
3413 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3414 if (!(dirty = bp->nb_dirty)) {
3415 SET(bp->nb_flags, NB_ASYNC);
3416 } else {
3417 CLR(bp->nb_flags, NB_ASYNC);
3418 }
3419
3420 /* move to clean list */
3421 lck_mtx_lock(nfs_buf_mutex);
3422 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3423 lck_mtx_unlock(nfs_buf_mutex);
3424
3425 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3426
3427 nfs_buf_iodone(bp);
3428 if (dirty) {
3429 /* throw it back in as a delayed write buffer */
3430 CLR(bp->nb_flags, NB_DONE);
3431 nfs_buf_write_delayed(bp);
3432 }
3433 }
3434
3435 done:
3436 FSDBG_BOT(557, np, 0, 0, error);
3437 return error;
3438 }
3439
3440 /*
3441 * Flush all the blocks associated with a vnode.
3442 * Walk through the buffer pool and push any dirty pages
3443 * associated with the vnode.
3444 */
3445 int
3446 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3447 {
3448 struct nfsbuf *bp;
3449 struct nfsbuflists blist;
3450 struct nfsmount *nmp = NFSTONMP(np);
3451 int error = 0, error2, slptimeo = 0, slpflag = 0;
3452 int nfsvers, flags, passone = 1;
3453
3454 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3455
3456 if (nfs_mount_gone(nmp)) {
3457 error = ENXIO;
3458 goto out;
3459 }
3460 nfsvers = nmp->nm_vers;
3461 if (NMFLAG(nmp, INTR)) {
3462 slpflag = PCATCH;
3463 }
3464
3465 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3466 nfs_node_lock_force(np);
3467 np->n_flag |= NMODIFIED;
3468 nfs_node_unlock(np);
3469 }
3470
3471 lck_mtx_lock(nfs_buf_mutex);
3472 while (np->n_bflag & NBFLUSHINPROG) {
3473 np->n_bflag |= NBFLUSHWANT;
3474 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3475 if ((error && (error != EWOULDBLOCK)) ||
3476 ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
3477 lck_mtx_unlock(nfs_buf_mutex);
3478 goto out;
3479 }
3480 }
3481 np->n_bflag |= NBFLUSHINPROG;
3482
3483 /*
3484 * On the first pass, start async/unstable writes on all
3485 * delayed write buffers. Then wait for all writes to complete
3486 * and call nfs_flushcommits() to commit any uncommitted buffers.
3487 * On all subsequent passes, start STABLE writes on any remaining
3488 * dirty buffers. Then wait for all writes to complete.
3489 */
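/*
 * (passone is cleared after the first pass; the second pass sets
 * NB_STABLE on the buffers so nfs_buf_write() uses FILESYNC)
 */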
3490 again:
3491 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3492 if (!NFSTONMP(np)) {
3493 lck_mtx_unlock(nfs_buf_mutex);
3494 error = ENXIO;
3495 goto done;
3496 }
3497
3498 /* Start/do any write(s) that are required. */
3499 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3500 while ((bp = LIST_FIRST(&blist))) {
3501 LIST_REMOVE(bp, nb_vnbufs);
3502 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3503 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
3504 if (flags != NBAC_NOWAIT) {
3505 nfs_buf_refget(bp);
3506 }
3507 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3508 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3509 if (error == EBUSY) {
3510 break;
3511 }
3512 if (error) {
3513 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3514 if (error2) {
3515 if (flags != NBAC_NOWAIT) {
3516 nfs_buf_refrele(bp);
3517 }
3518 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3519 lck_mtx_unlock(nfs_buf_mutex);
3520 error = error2;
3521 goto done;
3522 }
3523 if (slpflag == PCATCH) {
3524 slpflag = 0;
3525 slptimeo = 2 * hz;
3526 }
3527 }
3528 }
3529 if (flags != NBAC_NOWAIT) {
3530 nfs_buf_refrele(bp);
3531 }
3532 if (error == EBUSY) {
3533 continue;
3534 }
3535 if (!bp->nb_np) {
3536 /* buffer is no longer valid */
3537 nfs_buf_drop(bp);
3538 continue;
3539 }
3540 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3541 nfs_buf_check_write_verifier(np, bp);
3542 }
3543 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3544 /* buffer is no longer dirty */
3545 nfs_buf_drop(bp);
3546 continue;
3547 }
3548 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3549 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3550 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3551 nfs_buf_drop(bp);
3552 continue;
3553 }
3554 nfs_buf_remfree(bp);
3555 lck_mtx_unlock(nfs_buf_mutex);
3556 if (ISSET(bp->nb_flags, NB_ERROR)) {
3557 nfs_node_lock_force(np);
3558 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3559 np->n_flag |= NWRITEERR;
3560 nfs_node_unlock(np);
3561 nfs_buf_release(bp, 1);
3562 lck_mtx_lock(nfs_buf_mutex);
3563 continue;
3564 }
3565 SET(bp->nb_flags, NB_ASYNC);
3566 if (!passone) {
3567 /* NB_STABLE forces this to be written FILESYNC */
3568 SET(bp->nb_flags, NB_STABLE);
3569 }
3570 nfs_buf_write(bp);
3571 lck_mtx_lock(nfs_buf_mutex);
3572 }
3573 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3574 }
3575 lck_mtx_unlock(nfs_buf_mutex);
3576
3577 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3578 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3579 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3580 if (error2) {
3581 error = error2;
3582 goto done;
3583 }
3584 if (slpflag == PCATCH) {
3585 slpflag = 0;
3586 slptimeo = 2 * hz;
3587 }
3588 }
3589 }
3590
3591 if (nfsvers != NFS_VER2) {
3592 /* loop while it looks like there are still buffers to be */
3593 /* committed and nfs_flushcommits() seems to be handling them. */
3594 while (np->n_needcommitcnt) {
3595 if (nfs_flushcommits(np, 0)) {
3596 break;
3597 }
3598 }
3599 }
3600
3601 if (passone) {
3602 passone = 0;
3603 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3604 nfs_node_lock_force(np);
3605 np->n_flag |= NMODIFIED;
3606 nfs_node_unlock(np);
3607 }
3608 lck_mtx_lock(nfs_buf_mutex);
3609 goto again;
3610 }
3611
3612 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3613 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3614 nfs_node_lock_force(np);
3615 np->n_flag |= NMODIFIED;
3616 nfs_node_unlock(np);
3617 }
3618 lck_mtx_lock(nfs_buf_mutex);
3619 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3620 goto again;
3621 }
3622 lck_mtx_unlock(nfs_buf_mutex);
3623 nfs_node_lock_force(np);
3624 /*
3625 * OK, it looks like there are no dirty blocks. If we have no
3626 * writes in flight and no one in the write code, we can clear
3627 * the modified flag. In order to make sure we see the latest
3628 * attributes and size, we also invalidate the attributes and
3629 * advance the attribute cache XID to guarantee that attributes
3630 * newer than our clearing of NMODIFIED will get loaded next.
3631 * (If we don't do this, it's possible for the flush's final
3632 * write/commit (xid1) to be executed in parallel with a subsequent
3633 * getattr request (xid2). The getattr could return attributes
3634 * from *before* the write/commit completed but the stale attributes
3635 * would be preferred because of the xid ordering.)
3636 */
3637 if (!np->n_wrbusy && !np->n_numoutput) {
3638 np->n_flag &= ~NMODIFIED;
3639 NATTRINVALIDATE(np);
3640 nfs_get_xid(&np->n_xid);
3641 }
3642 } else {
3643 nfs_node_lock_force(np);
3644 }
3645
3646 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3647 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3648 error = np->n_error;
3649 np->n_flag &= ~NWRITEERR;
3650 }
3651 nfs_node_unlock(np);
3652 done:
3653 lck_mtx_lock(nfs_buf_mutex);
3654 flags = np->n_bflag;
3655 np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
3656 lck_mtx_unlock(nfs_buf_mutex);
3657 if (flags & NBFLUSHWANT) {
3658 wakeup(&np->n_bflag);
3659 }
3660 out:
3661 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3662 return error;
3663 }
3664
3665 /*
3666 * Flush out and invalidate all buffers associated with a vnode.
3667 * Called with the underlying object locked.
3668 */
3669 int
3670 nfs_vinvalbuf_internal(
3671 nfsnode_t np,
3672 int flags,
3673 thread_t thd,
3674 kauth_cred_t cred,
3675 int slpflag,
3676 int slptimeo)
3677 {
3678 struct nfsbuf *bp;
3679 struct nfsbuflists blist;
3680 int list, error = 0;
3681
3682 if (flags & V_SAVE) {
3683 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
3684 return error;
3685 }
3686 }
3687
3688 lck_mtx_lock(nfs_buf_mutex);
3689 for (;;) {
3690 list = NBI_CLEAN;
3691 if (nfs_buf_iterprepare(np, &blist, list)) {
3692 list = NBI_DIRTY;
3693 if (nfs_buf_iterprepare(np, &blist, list)) {
3694 break;
3695 }
3696 }
3697 while ((bp = LIST_FIRST(&blist))) {
3698 LIST_REMOVE(bp, nb_vnbufs);
3699 if (list == NBI_CLEAN) {
3700 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3701 } else {
3702 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3703 }
3704 nfs_buf_refget(bp);
3705 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3706 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3707 if (error != EAGAIN) {
3708 FSDBG(554, np, bp, -1, error);
3709 nfs_buf_refrele(bp);
3710 nfs_buf_itercomplete(np, &blist, list);
3711 lck_mtx_unlock(nfs_buf_mutex);
3712 return error;
3713 }
3714 }
3715 nfs_buf_refrele(bp);
3716 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3717 lck_mtx_unlock(nfs_buf_mutex);
3718 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3719 (NBOFF(bp) < (off_t)np->n_size)) {
3720 /* extra paranoia: make sure we're not */
3721 /* somehow leaving any dirty data around */
3722 int mustwrite = 0;
3723 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3724 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3725 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3726 error = nfs_buf_upl_setup(bp);
3727 if (error == EINVAL) {
3728 /* vm object must no longer exist */
3729 /* hopefully we don't need to do */
3730 /* anything for this buffer */
3731 } else if (error) {
3732 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3733 }
3734 bp->nb_valid = bp->nb_dirty = 0;
3735 }
3736 nfs_buf_upl_check(bp);
3737 /* check for any dirty data before the EOF */
3738 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3739 /* clip dirty range to EOF */
3740 if (bp->nb_dirtyend > end) {
3741 bp->nb_dirtyend = end;
3742 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
3743 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3744 }
3745 }
3746 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3747 mustwrite++;
3748 }
3749 }
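/*
 * Drop dirty bits for pages entirely beyond EOF; e.g. (illustrative)
 * if "end" falls within the third page, round_page_32(end) / PAGE_SIZE
 * is 3 and the mask 0x7 keeps only pages 0-2.
 */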
3750 bp->nb_dirty &= (1 << (round_page_32(end) / PAGE_SIZE)) - 1;
3751 if (bp->nb_dirty) {
3752 mustwrite++;
3753 }
3754 /* also make sure we'll have a credential to do the write */
3755 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3756 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3757 mustwrite = 0;
3758 }
3759 if (mustwrite) {
3760 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3761 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3762 panic("nfs_vinvalbuf: dirty buffer without upl");
3763 }
3764 /* gotta write out dirty data before invalidating */
3765 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3766 /* (NB_NOCACHE indicates buffer should be discarded) */
3767 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3768 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3769 if (!IS_VALID_CRED(bp->nb_wcred)) {
3770 kauth_cred_ref(cred);
3771 bp->nb_wcred = cred;
3772 }
3773 error = nfs_buf_write(bp);
3774 // Note: bp has been released
3775 if (error) {
3776 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3777 nfs_node_lock_force(np);
3778 if ((error != EINTR) && (error != ERESTART)) {
3779 np->n_error = error;
3780 np->n_flag |= NWRITEERR;
3781 }
3782 /*
3783 * There was a write error and we need to
3784 * invalidate attrs to sync with server.
3785 * (if this write was extending the file,
3786 * we may no longer know the correct size)
3787 */
3788 NATTRINVALIDATE(np);
3789 nfs_node_unlock(np);
3790 if ((error == EINTR) || (error == ERESTART)) {
3791 /*
3792 * Abort on EINTR. If we don't, we could
3793 * be stuck in this loop forever because
3794 * the buffer will continue to stay dirty.
3795 */
3796 lck_mtx_lock(nfs_buf_mutex);
3797 nfs_buf_itercomplete(np, &blist, list);
3798 lck_mtx_unlock(nfs_buf_mutex);
3799 return error;
3800 }
3801 error = 0;
3802 }
3803 lck_mtx_lock(nfs_buf_mutex);
3804 continue;
3805 }
3806 }
3807 SET(bp->nb_flags, NB_INVAL);
3808 // hold off on FREEUPs until we're done here
3809 nfs_buf_release(bp, 0);
3810 lck_mtx_lock(nfs_buf_mutex);
3811 }
3812 nfs_buf_itercomplete(np, &blist, list);
3813 }
3814 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
3815 panic("nfs_vinvalbuf: flush/inval failed");
3816 }
3817 lck_mtx_unlock(nfs_buf_mutex);
3818 nfs_node_lock_force(np);
3819 if (!(flags & V_SAVE)) {
3820 np->n_flag &= ~NMODIFIED;
3821 }
3822 if (vnode_vtype(NFSTOV(np)) == VREG) {
3823 np->n_lastrahead = -1;
3824 }
3825 nfs_node_unlock(np);
3826 NFS_BUF_FREEUP();
3827 return 0;
3828 }
3829
3830
3831 /*
3832 * Flush and invalidate all dirty buffers. If another process is already
3833 * doing the flush, just wait for completion.
3834 */
3835 int
3836 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3837 {
3838 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3839 }
3840
3841 int
3842 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3843 {
3844 nfsnode_t np = VTONFS(vp);
3845 struct nfsmount *nmp = VTONMP(vp);
3846 int error, slpflag, slptimeo, nflags, retry = 0;
3847 int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
3848 struct timespec ts = { 2, 0 };
3849 off_t size;
3850
3851 FSDBG_TOP(554, np, flags, intrflg, 0);
3852
3853 /*
3854 * If the mount is gone, there's no sense trying to write anything
3855 * and hanging while trying to do I/O.
3856 */
3857 if (nfs_mount_gone(nmp)) {
3858 flags &= ~V_SAVE;
3859 ubcflags &= ~UBC_PUSHALL;
3860 }
3861
3862 if (nmp && !NMFLAG(nmp, INTR)) {
3863 intrflg = 0;
3864 }
3865 if (intrflg) {
3866 slpflag = PCATCH;
3867 slptimeo = 2 * hz;
3868 } else {
3869 slpflag = 0;
3870 slptimeo = 0;
3871 }
3872
3873 /* First wait for any other process doing a flush to complete. */
3874 lck_mtx_lock(nfs_buf_mutex);
3875 while (np->n_bflag & NBINVALINPROG) {
3876 np->n_bflag |= NBINVALWANT;
3877 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
3878 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3879 lck_mtx_unlock(nfs_buf_mutex);
3880 return error;
3881 }
3882 if (np->n_bflag & NBINVALINPROG) {
3883 slpflag = 0;
3884 }
3885 }
3886 np->n_bflag |= NBINVALINPROG;
3887 lck_mtx_unlock(nfs_buf_mutex);
3888
3889 /* Now, flush as required. */
3890 again:
3891 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3892 while (error) {
3893 FSDBG(554, np, 0, 0, error);
3894 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3895 goto done;
3896 }
3897 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3898 }
3899
3900 /* get the pages out of vm also */
3901 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
3902 if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
3903 if (error == EINVAL) {
3904 panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
3905 }
3906 if (retry++ < 10) { /* retry invalidating a few times */
3907 if (retry > 1 || error == ENXIO) {
3908 ubcflags &= ~UBC_PUSHALL;
3909 }
3910 goto again;
3911 }
3912 /* give up */
3913 printf("nfs_vinvalbuf(): ubc_msync failed, error %d\n", error);
3914 }
3915 }
3916 done:
3917 lck_mtx_lock(nfs_buf_mutex);
3918 nflags = np->n_bflag;
3919 np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
3920 lck_mtx_unlock(nfs_buf_mutex);
3921 if (nflags & NBINVALWANT) {
3922 wakeup(&np->n_bflag);
3923 }
3924
3925 FSDBG_BOT(554, np, flags, intrflg, error);
3926 return error;
3927 }
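/*
 * Editorial sketch, not part of the original source: the NBINVALINPROG /
 * NBINVALWANT handshake above is a flag-based serialization idiom under
 * nfs_buf_mutex.  In outline (interrupt handling omitted):
 *
 *	lck_mtx_lock(nfs_buf_mutex);
 *	while (np->n_bflag & NBINVALINPROG) {		// another flush in progress
 *		np->n_bflag |= NBINVALWANT;		// ask to be woken up
 *		msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
 *	}
 *	np->n_bflag |= NBINVALINPROG;			// we own the flush now
 *	lck_mtx_unlock(nfs_buf_mutex);
 *	// ... flush buffers and ubc_msync() the pages ...
 *	// on the way out: clear both flags and wakeup(&np->n_bflag) if wanted
 *
 * The ubc_msync() step is retried a few times, dropping UBC_PUSHALL after
 * the first failed retry or on ENXIO, before giving up.
 */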
3928
3929 /*
3930 * Wait for any busy buffers to complete.
3931 */
3932 void
3933 nfs_wait_bufs(nfsnode_t np)
3934 {
3935 struct nfsbuf *bp;
3936 struct nfsbuflists blist;
3937 int error = 0;
3938
3939 lck_mtx_lock(nfs_buf_mutex);
3940 if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
3941 while ((bp = LIST_FIRST(&blist))) {
3942 LIST_REMOVE(bp, nb_vnbufs);
3943 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3944 nfs_buf_refget(bp);
3945 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3946 if (error != EAGAIN) {
3947 nfs_buf_refrele(bp);
3948 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3949 lck_mtx_unlock(nfs_buf_mutex);
3950 return;
3951 }
3952 }
3953 nfs_buf_refrele(bp);
3954 nfs_buf_drop(bp);
3955 }
3956 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3957 }
3958 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3959 while ((bp = LIST_FIRST(&blist))) {
3960 LIST_REMOVE(bp, nb_vnbufs);
3961 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3962 nfs_buf_refget(bp);
3963 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3964 if (error != EAGAIN) {
3965 nfs_buf_refrele(bp);
3966 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3967 lck_mtx_unlock(nfs_buf_mutex);
3968 return;
3969 }
3970 }
3971 nfs_buf_refrele(bp);
3972 nfs_buf_drop(bp);
3973 }
3974 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3975 }
3976 lck_mtx_unlock(nfs_buf_mutex);
3977 }
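/*
 * Editorial note, not from the original source: nfs_wait_bufs() "waits" by
 * acquiring and immediately dropping every buffer on both lists; the acquire
 * is retried while it returns EAGAIN, so the loops only complete once no
 * buffer on the node is still busy.  Simplified shape of the inner wait
 * (the real code above cleans up and returns on hard errors):
 *
 *	while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
 *		if (error != EAGAIN)
 *			break;			// hard error, stop waiting
 *	}
 *	nfs_buf_drop(bp);			// we only wanted to wait, not keep it
 */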
3978
3979
3980 /*
3981 * Add an async I/O request to the mount's async I/O queue and make
3982 * sure that an nfsiod will service it.
3983 */
3984 void
3985 nfs_asyncio_finish(struct nfsreq *req)
3986 {
3987 struct nfsmount *nmp;
3988 struct nfsiod *niod;
3989 int started = 0;
3990
3991 FSDBG_TOP(552, req->r_nmp, 0, 0, 0);
3992 again:
3993 nmp = req->r_nmp;
3994
3995 if (nmp == NULL) {
3996 return;
3997 }
3998
3999 lck_mtx_lock(nfsiod_mutex);
4000 niod = nmp->nm_niod;
4001
4002 /* grab an nfsiod if we don't have one already */
4003 if (!niod) {
4004 niod = TAILQ_FIRST(&nfsiodfree);
4005 if (niod) {
4006 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
4007 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
4008 niod->niod_nmp = nmp;
4009 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
4010 /*
4011 * Try starting a new thread.
4012 * We may try a couple times if other callers
4013 * get the new threads before we do.
4014 */
4015 lck_mtx_unlock(nfsiod_mutex);
4016 started++;
4017 if (!nfsiod_start()) {
4018 goto again;
4019 }
4020 lck_mtx_lock(nfsiod_mutex);
4021 }
4022 }
4023
4024 /*
4025 * If we got here while the request is still on the resendq, we need to
4026 * get it off. This happens when the timer fires and nfs_sigintr errors
4027 * out requests, or when we receive a reply (UDP case) while on the resend
4028 * queue; in either case we're just finishing up and will not be resent.
4029 */
4030 lck_mtx_lock(&req->r_mtx);
4031 if (req->r_flags & R_RESENDQ) {
4032 lck_mtx_lock(&nmp->nm_lock);
4033 if (req->r_rchain.tqe_next != NFSREQNOLIST) {
4034 NFS_BIO_DBG("Proccessing async request on resendq. Removing");
4035 TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
4036 req->r_rchain.tqe_next = NFSREQNOLIST;
4037 assert(req->r_refs > 1);
4038 /* Remove resendq reference */
4039 req->r_refs--;
4040 }
4041 lck_mtx_unlock(&nmp->nm_lock);
4042 req->r_flags &= ~R_RESENDQ;
4043 }
4044 lck_mtx_unlock(&req->r_mtx);
4045
4046 if (req->r_achain.tqe_next == NFSREQNOLIST) {
4047 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
4048 }
4049
4050 /* If this mount doesn't already have an nfsiod working on it... */
4051 if (!nmp->nm_niod) {
4052 if (niod) { /* give it the nfsiod we just grabbed */
4053 nmp->nm_niod = niod;
4054 lck_mtx_unlock(nfsiod_mutex);
4055 wakeup(niod);
4056 } else if (nfsiod_thread_count > 0) {
4057 /* just queue it up on the nfsiod mounts queue if needed */
4058 if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
4059 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
4060 }
4061 lck_mtx_unlock(nfsiod_mutex);
4062 } else {
4063 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
4064 lck_mtx_unlock(nfsiod_mutex);
4065 /* we have no other option but to be persistent */
4066 started = 0;
4067 goto again;
4068 }
4069 } else {
4070 lck_mtx_unlock(nfsiod_mutex);
4071 }
4072
4073 FSDBG_BOT(552, nmp, 0, 0, 0);
4074 }
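/*
 * Editorial summary, not part of the original source: nfs_asyncio_finish()
 * hands an async request to the nfsiod machinery in three steps: make sure
 * the request is on the mount's nm_iodq, try to dedicate an idle nfsiod to
 * the mount (waking it up), and otherwise either queue the mount on the
 * global nfsiodmounts list or, if no nfsiod threads exist at all, retry
 * persistently until one can be started.
 */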
4075
4076 /*
4077 * Queue up an async I/O request for resend.
4078 */
4079 void
4080 nfs_asyncio_resend(struct nfsreq *req)
4081 {
4082 struct nfsmount *nmp = req->r_nmp;
4083
4084 if (nfs_mount_gone(nmp)) {
4085 return;
4086 }
4087
4088 nfs_gss_clnt_rpcdone(req);
4089 lck_mtx_lock(&nmp->nm_lock);
4090 if (!(req->r_flags & R_RESENDQ)) {
4091 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
4092 req->r_flags |= R_RESENDQ;
4093 /*
4094 * We take a reference on this request so that it can't be
4095 * destroyed while a resend is queued or in progress.
4096 */
4097 nfs_request_ref(req, 1);
4098 }
4099 nfs_mount_sock_thread_wake(nmp);
4100 lck_mtx_unlock(&nmp->nm_lock);
4101 }
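/*
 * Editorial note, not from the original source: the reference taken here via
 * nfs_request_ref() pairs with the reference dropped in nfs_asyncio_finish()
 * when a request still marked R_RESENDQ is pulled back off nm_resendq, so
 * the request cannot be destroyed while a resend is queued or in progress.
 */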
4102
4103 /*
4104 * Read directory data into a buffer.
4105 *
4106 * Buffer will be filled (unless EOF is hit).
4107 * Buffers after this one may also be completely/partially filled.
4108 */
4109 int
4110 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
4111 {
4112 nfsnode_t np = bp->nb_np;
4113 struct nfsmount *nmp = NFSTONMP(np);
4114 int error = 0;
4115
4116 if (nfs_mount_gone(nmp)) {
4117 return ENXIO;
4118 }
4119
4120 if (nmp->nm_vers < NFS_VER4) {
4121 error = nfs3_readdir_rpc(np, bp, ctx);
4122 } else {
4123 error = nfs4_readdir_rpc(np, bp, ctx);
4124 }
4125
4126 if (error && (error != NFSERR_DIRBUFDROPPED)) {
4127 SET(bp->nb_flags, NB_ERROR);
4128 bp->nb_error = error;
4129 }
4130 return error;
4131 }
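/*
 * Editorial note, not part of the original source: nfs_buf_readdir() simply
 * dispatches to the NFSv3 or NFSv4 readdir RPC based on the mount version.
 * Any error other than NFSERR_DIRBUFDROPPED is also latched on the buffer
 * via NB_ERROR / nb_error, so callers can check the buffer state as well as
 * the return value.
 */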