]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_bio.c
xnu-1504.15.3.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_bio.c
1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
66 */
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/resourcevar.h>
70 #include <sys/signalvar.h>
71 #include <sys/proc_internal.h>
72 #include <sys/kauth.h>
73 #include <sys/malloc.h>
74 #include <sys/vnode.h>
75 #include <sys/dirent.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/ubc_internal.h>
79 #include <sys/uio_internal.h>
80
81 #include <sys/vm.h>
82 #include <sys/vmparam.h>
83
84 #include <sys/time.h>
85 #include <kern/clock.h>
86 #include <libkern/OSAtomic.h>
87 #include <kern/kalloc.h>
88 #include <kern/thread_call.h>
89
90 #include <nfs/rpcv2.h>
91 #include <nfs/nfsproto.h>
92 #include <nfs/nfs.h>
93 #include <nfs/nfs_gss.h>
94 #include <nfs/nfsmount.h>
95 #include <nfs/nfsnode.h>
96 #include <sys/buf_internal.h>
97 #include <libkern/OSAtomic.h>
98
99 kern_return_t thread_terminate(thread_t); /* XXX */
100
/*
 * Map an (nfsnode pointer, logical block number) pair to the head of its
 * chain in nfsbufhashtbl.  nfsbufhash is applied as a bit mask to the sum.
 */
101 #define NFSBUFHASH(np, lbn) \
102 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
/* hash chains of nfsbufs; allocated by hashinit() in nfs_nbinit() */
103 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
/* queues of unbusy buffers: regular (LRU), metadata, and delayed-write */
104 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
/* mask for indexing nfsbufhashtbl; filled in by hashinit() */
105 u_long nfsbufhash;
/* allocated-buffer counts and the limits they are checked against */
106 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
/* lengths of the three queues above; nfsneedbuffer is set while a thread sleeps waiting for any buffer */
107 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
/* compared against NFS_A_LOT_OF_DELAYED_WRITES in nfs_buf_get(); presumably the number of delayed-write buffers -- TODO confirm */
108 int nfs_nbdwrite;
/* nonzero while the periodic nfs_buf_timer() has been scheduled */
109 int nfs_buf_timer_on = 0;
/* delayed-write service thread, or NULL when none is running; also used as its sleep/wakeup channel */
110 thread_t nfsbufdelwrithd = NULL;
111
/* lock group and mutex taken around nfsbuf list, hash and counter updates in this file */
112 lck_grp_t *nfs_buf_lck_grp;
113 lck_mtx_t *nfs_buf_mutex;
114
/* delay handed to nfs_interval_timer_start() (multiplied by 1000) when scheduling nfs_buf_timer() */
115 #define NFSBUF_FREE_PERIOD 30 /* seconds */
/*
 * Staleness thresholds, added to a buffer's nb_timestamp and compared with
 * uptime seconds (now.tv_sec): nfs_buf_get() may reuse a free buffer once it
 * is older than this, nfs_buf_freeup() frees it once it is older than twice this.
 */
116 #define NFSBUF_LRU_STALE 120
117 #define NFSBUF_META_STALE 240
118
119 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
120 #define LRU_TO_FREEUP 6
121 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
122 #define META_TO_FREEUP 3
123 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
124 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
125 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
126 #define LRU_FREEUP_FRAC_ON_TIMER 8
127 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
128 #define META_FREEUP_FRAC_ON_TIMER 16
129 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
130 #define LRU_FREEUP_MIN_FRAC 4
131 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
132 #define META_FREEUP_MIN_FRAC 2
133
/*
 * Non-timer trim: call nfs_buf_freeup(0) only when one of the free lists
 * holds more than its *_MIN_FRAC share of all nfsbufs AND freeing
 * TOTAL_TO_FREEUP buffers would still leave more than nfsbufmin allocated.
 * The counters are read here without taking nfs_buf_mutex.
 */
134 #define NFS_BUF_FREEUP() \
135 do { \
136 /* only call nfs_buf_freeup() if it has work to do: */ \
137 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
138 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
139 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
140 nfs_buf_freeup(0); \
141 } while (0)
142
143 /*
144 * Initialize nfsbuf lists
145 */
146 void
147 nfs_nbinit(void)
148 {
149 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
150 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
151
152 nfsbufcnt = nfsbufmetacnt =
153 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
154 nfsbufmin = 128;
155 /* size nfsbufmax to cover at most half sane_size (w/default buf size) */
156 nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
157 nfsbufmetamax = nfsbufmax / 4;
158 nfsneedbuffer = 0;
159 nfs_nbdwrite = 0;
160
161 nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
162 TAILQ_INIT(&nfsbuffree);
163 TAILQ_INIT(&nfsbuffreemeta);
164 TAILQ_INIT(&nfsbufdelwri);
165
166 }
167
168 /*
169 * Check periodically for stale/unused nfs bufs
170 */
171 void
172 nfs_buf_timer(__unused void *param0, __unused void *param1)
173 {
174 nfs_buf_freeup(1);
175
176 lck_mtx_lock(nfs_buf_mutex);
177 if (nfsbufcnt <= nfsbufmin) {
178 nfs_buf_timer_on = 0;
179 lck_mtx_unlock(nfs_buf_mutex);
180 return;
181 }
182 lck_mtx_unlock(nfs_buf_mutex);
183
184 nfs_interval_timer_start(nfs_buf_timer_call,
185 NFSBUF_FREE_PERIOD * 1000);
186 }
187
188 /*
189 * try to free up some excess, unused nfsbufs
190 */
191 void
192 nfs_buf_freeup(int timer)
193 {
194 struct nfsbuf *fbp;
195 struct timeval now;
196 int count;
197 struct nfsbuffreehead nfsbuffreeup;
198
199 TAILQ_INIT(&nfsbuffreeup);
200
201 lck_mtx_lock(nfs_buf_mutex);
202
203 microuptime(&now);
204
205 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
206
207 count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
208 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
209 fbp = TAILQ_FIRST(&nfsbuffree);
210 if (!fbp)
211 break;
212 if (fbp->nb_refs)
213 break;
214 if (NBUFSTAMPVALID(fbp) &&
215 (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
216 break;
217 nfs_buf_remfree(fbp);
218 /* disassociate buffer from any nfsnode */
219 if (fbp->nb_np) {
220 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
221 LIST_REMOVE(fbp, nb_vnbufs);
222 fbp->nb_vnbufs.le_next = NFSNOLIST;
223 }
224 fbp->nb_np = NULL;
225 }
226 LIST_REMOVE(fbp, nb_hash);
227 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
228 nfsbufcnt--;
229 }
230
231 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
232 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
233 fbp = TAILQ_FIRST(&nfsbuffreemeta);
234 if (!fbp)
235 break;
236 if (fbp->nb_refs)
237 break;
238 if (NBUFSTAMPVALID(fbp) &&
239 (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
240 break;
241 nfs_buf_remfree(fbp);
242 /* disassociate buffer from any nfsnode */
243 if (fbp->nb_np) {
244 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
245 LIST_REMOVE(fbp, nb_vnbufs);
246 fbp->nb_vnbufs.le_next = NFSNOLIST;
247 }
248 fbp->nb_np = NULL;
249 }
250 LIST_REMOVE(fbp, nb_hash);
251 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
252 nfsbufcnt--;
253 nfsbufmetacnt--;
254 }
255
256 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
257 NFSBUFCNTCHK();
258
259 lck_mtx_unlock(nfs_buf_mutex);
260
261 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
262 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
263 /* nuke any creds */
264 if (IS_VALID_CRED(fbp->nb_rcred))
265 kauth_cred_unref(&fbp->nb_rcred);
266 if (IS_VALID_CRED(fbp->nb_wcred))
267 kauth_cred_unref(&fbp->nb_wcred);
268 /* if buf was NB_META, dump buffer */
269 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
270 kfree(fbp->nb_data, fbp->nb_bufsize);
271 FREE(fbp, M_TEMP);
272 }
273
274 }
275
276 /*
277 * remove a buffer from the freelist
278 * (must be called with nfs_buf_mutex held)
279 */
280 void
281 nfs_buf_remfree(struct nfsbuf *bp)
282 {
283 if (bp->nb_free.tqe_next == NFSNOLIST)
284 panic("nfsbuf not on free list");
285 if (ISSET(bp->nb_flags, NB_DELWRI)) {
286 nfsbufdelwricnt--;
287 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
288 } else if (ISSET(bp->nb_flags, NB_META)) {
289 nfsbuffreemetacnt--;
290 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
291 } else {
292 nfsbuffreecnt--;
293 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
294 }
295 bp->nb_free.tqe_next = NFSNOLIST;
296 NFSBUFCNTCHK();
297 }
298
299 /*
300 * check for existence of nfsbuf in cache
301 */
302 boolean_t
303 nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
304 {
305 boolean_t rv;
306 lck_mtx_lock(nfs_buf_mutex);
307 if (nfs_buf_incore(np, blkno))
308 rv = TRUE;
309 else
310 rv = FALSE;
311 lck_mtx_unlock(nfs_buf_mutex);
312 return (rv);
313 }
314
315 /*
316 * return incore buffer (must be called with nfs_buf_mutex held)
317 */
318 struct nfsbuf *
319 nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
320 {
321 /* Search hash chain */
322 struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
323 for (; bp != NULL; bp = bp->nb_hash.le_next)
324 if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
325 if (!ISSET(bp->nb_flags, NB_INVAL)) {
326 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
327 return (bp);
328 }
329 }
330 return (NULL);
331 }
332
333 /*
334 * Check if it's OK to drop a page.
335 *
336 * Called by vnode_pager() on pageout request of non-dirty page.
337 * We need to make sure that it's not part of a delayed write.
338 * If it is, we can't let the VM drop it because we may need it
339 * later when/if we need to write the data (again).
340 */
341 int
342 nfs_buf_page_inval(vnode_t vp, off_t offset)
343 {
344 struct nfsmount *nmp = VTONMP(vp);
345 struct nfsbuf *bp;
346 int error = 0;
347
348 if (!nmp)
349 return (ENXIO);
350
351 lck_mtx_lock(nfs_buf_mutex);
352 bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
353 if (!bp)
354 goto out;
355 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
356 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
357 error = EBUSY;
358 goto out;
359 }
360 /*
361 * If there's a dirty range in the buffer, check to
362 * see if this page intersects with the dirty range.
363 * If it does, we can't let the pager drop the page.
364 */
365 if (bp->nb_dirtyend > 0) {
366 int start = offset - NBOFF(bp);
367 if ((bp->nb_dirtyend > start) &&
368 (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
369 /*
370 * Before returning the bad news, move the
371 * buffer to the start of the delwri list and
372 * give the list a push to try to flush the
373 * buffer out.
374 */
375 error = EBUSY;
376 nfs_buf_remfree(bp);
377 TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
378 nfsbufdelwricnt++;
379 nfs_buf_delwri_push(1);
380 }
381 }
382 out:
383 lck_mtx_unlock(nfs_buf_mutex);
384 return (error);
385 }
386
387 /*
388 * set up the UPL for a buffer
389 * (must NOT be called with nfs_buf_mutex held)
390 */
391 int
392 nfs_buf_upl_setup(struct nfsbuf *bp)
393 {
394 kern_return_t kret;
395 upl_t upl;
396 int upl_flags;
397
398 if (ISSET(bp->nb_flags, NB_PAGELIST))
399 return (0);
400
401 upl_flags = UPL_PRECIOUS;
402 if (!ISSET(bp->nb_flags, NB_READ)) {
403 /*
404 * We're doing a "write", so we intend to modify
405 * the pages we're gathering.
406 */
407 upl_flags |= UPL_WILL_MODIFY;
408 }
409 kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
410 &upl, NULL, upl_flags);
411 if (kret == KERN_INVALID_ARGUMENT) {
412 /* vm object probably doesn't exist any more */
413 bp->nb_pagelist = NULL;
414 return (EINVAL);
415 }
416 if (kret != KERN_SUCCESS) {
417 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
418 bp->nb_pagelist = NULL;
419 return (EIO);
420 }
421
422 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
423
424 bp->nb_pagelist = upl;
425 SET(bp->nb_flags, NB_PAGELIST);
426 return (0);
427 }
428
429 /*
430 * update buffer's valid/dirty info from UBC
431 * (must NOT be called with nfs_buf_mutex held)
432 */
433 void
434 nfs_buf_upl_check(struct nfsbuf *bp)
435 {
436 upl_page_info_t *pl;
437 off_t filesize, fileoffset;
438 int i, npages;
439
440 if (!ISSET(bp->nb_flags, NB_PAGELIST))
441 return;
442
443 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
444 filesize = ubc_getsize(NFSTOV(bp->nb_np));
445 fileoffset = NBOFF(bp);
446 if (fileoffset < filesize)
447 SET(bp->nb_flags, NB_CACHE);
448 else
449 CLR(bp->nb_flags, NB_CACHE);
450
451 pl = ubc_upl_pageinfo(bp->nb_pagelist);
452 bp->nb_valid = bp->nb_dirty = 0;
453
454 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
455 /* anything beyond the end of the file is not valid or dirty */
456 if (fileoffset >= filesize)
457 break;
458 if (!upl_valid_page(pl, i)) {
459 CLR(bp->nb_flags, NB_CACHE);
460 continue;
461 }
462 NBPGVALID_SET(bp,i);
463 if (upl_dirty_page(pl, i))
464 NBPGDIRTY_SET(bp, i);
465 }
466 fileoffset = NBOFF(bp);
467 if (ISSET(bp->nb_flags, NB_CACHE)) {
468 bp->nb_validoff = 0;
469 bp->nb_validend = bp->nb_bufsize;
470 if (fileoffset + bp->nb_validend > filesize)
471 bp->nb_validend = filesize - fileoffset;
472 } else {
473 bp->nb_validoff = bp->nb_validend = -1;
474 }
475 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
476 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
477 }
478
479 /*
480 * make sure that a buffer is mapped
481 * (must NOT be called with nfs_buf_mutex held)
482 */
483 int
484 nfs_buf_map(struct nfsbuf *bp)
485 {
486 kern_return_t kret;
487
488 if (bp->nb_data)
489 return (0);
490 if (!ISSET(bp->nb_flags, NB_PAGELIST))
491 return (EINVAL);
492
493 kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
494 if (kret != KERN_SUCCESS)
495 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
496 if (bp->nb_data == 0)
497 panic("ubc_upl_map mapped 0");
498 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
499 return (0);
500 }
501
502 /*
503 * normalize an nfsbuf's valid range
504 *
505 * the read/write code guarantees that we'll always have a valid
506 * region that is an integral number of pages. If either end
507 * of the valid range isn't page-aligned, it gets corrected
508 * here as we extend the valid range through all of the
509 * contiguous valid pages.
510 */
511 void
512 nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
513 {
514 int pg, npg;
515 /* pull validoff back to start of contiguous valid page range */
516 pg = bp->nb_validoff/PAGE_SIZE;
517 while (pg >= 0 && NBPGVALID(bp,pg))
518 pg--;
519 bp->nb_validoff = (pg+1) * PAGE_SIZE;
520 /* push validend forward to end of contiguous valid page range */
521 npg = bp->nb_bufsize/PAGE_SIZE;
522 pg = bp->nb_validend/PAGE_SIZE;
523 while (pg < npg && NBPGVALID(bp,pg))
524 pg++;
525 bp->nb_validend = pg * PAGE_SIZE;
526 /* clip to EOF */
527 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
528 bp->nb_validend = np->n_size % bp->nb_bufsize;
529 }
530
531 /*
532 * process some entries on the delayed write queue
533 * (must be called with nfs_buf_mutex held)
534 */
535 void
536 nfs_buf_delwri_service(void)
537 {
538 struct nfsbuf *bp;
539 nfsnode_t np;
540 int error, i = 0;
541
542 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
543 np = bp->nb_np;
544 nfs_buf_remfree(bp);
545 nfs_buf_refget(bp);
546 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
547 nfs_buf_refrele(bp);
548 if (error)
549 break;
550 if (!bp->nb_np) {
551 /* buffer is no longer valid */
552 nfs_buf_drop(bp);
553 continue;
554 }
555 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
556 nfs_buf_check_write_verifier(np, bp);
557 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
558 /* put buffer at end of delwri list */
559 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
560 nfsbufdelwricnt++;
561 nfs_buf_drop(bp);
562 lck_mtx_unlock(nfs_buf_mutex);
563 nfs_flushcommits(np, 1);
564 } else {
565 SET(bp->nb_flags, NB_ASYNC);
566 lck_mtx_unlock(nfs_buf_mutex);
567 nfs_buf_write(bp);
568 }
569 i++;
570 lck_mtx_lock(nfs_buf_mutex);
571 }
572 }
573
574 /*
575 * thread to service the delayed write queue when asked
576 */
577 void
578 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
579 {
580 struct timespec ts = { 30, 0 };
581 int error = 0;
582
583 lck_mtx_lock(nfs_buf_mutex);
584 while (!error) {
585 nfs_buf_delwri_service();
586 error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
587 }
588 nfsbufdelwrithd = NULL;
589 lck_mtx_unlock(nfs_buf_mutex);
590 thread_terminate(nfsbufdelwrithd);
591 }
592
593 /*
594 * try to push out some delayed/uncommitted writes
595 * ("locked" indicates whether nfs_buf_mutex is already held)
596 */
597 void
598 nfs_buf_delwri_push(int locked)
599 {
600 if (TAILQ_EMPTY(&nfsbufdelwri))
601 return;
602 if (!locked)
603 lck_mtx_lock(nfs_buf_mutex);
604 /* wake up the delayed write service thread */
605 if (nfsbufdelwrithd)
606 wakeup(&nfsbufdelwrithd);
607 else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS)
608 thread_deallocate(nfsbufdelwrithd);
609 /* otherwise, try to do some of the work ourselves */
610 if (!nfsbufdelwrithd)
611 nfs_buf_delwri_service();
612 if (!locked)
613 lck_mtx_unlock(nfs_buf_mutex);
614 }
615
616 /*
617 * Get an nfs buffer.
618 *
619 * Returns errno on error, 0 otherwise.
620 * Any buffer is returned in *bpp.
621 *
622 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
623 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
624 *
625 * Check for existence of buffer in cache.
626 * Or attempt to reuse a buffer from one of the free lists.
627 * Or allocate a new buffer if we haven't already hit max allocation.
628 * Or wait for a free buffer.
629 *
630 * If available buffer found, prepare it, and return it.
631 *
632 * If the calling process is interrupted by a signal for
633 * an interruptible mount point, return EINTR.
634 */
/*
 * Parameters, as used by the body below:
 *   np     nfsnode the block belongs to
 *   blkno  logical block number within that node
 *   size   requested buffer size; panics if larger than NFS_MAXBSIZE, and is
 *          raised to the mount's nm_biosize for nodes that have UBC info
 *   thd    thread passed to nfs_sigintr() after each sleep
 *   flags  operation (flags & NBLK_OPMASK) plus NBLK_NOWAIT/NBLK_ONLYVALID
 *   bpp    out: the busy buffer on success; left NULL on error and on the
 *          two "return 0 without a buffer" paths (NBLK_NOWAIT on a busy
 *          buffer, NBLK_ONLYVALID on a cache miss)
 */
635 int
636 nfs_buf_get(
637 nfsnode_t np,
638 daddr64_t blkno,
639 uint32_t size,
640 thread_t thd,
641 int flags,
642 struct nfsbuf **bpp)
643 {
644 vnode_t vp = NFSTOV(np);
645 struct nfsmount *nmp = VTONMP(vp);
646 struct nfsbuf *bp;
647 uint32_t bufsize;
648 int slpflag = PCATCH;
649 int operation = (flags & NBLK_OPMASK);
650 int error = 0;
651 struct timespec ts;
652
653 FSDBG_TOP(541, np, blkno, size, flags);
654 *bpp = NULL;
655
656 bufsize = size;
657 if (bufsize > NFS_MAXBSIZE)
658 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
659
660 if (!nmp) {
661 FSDBG_BOT(541, np, blkno, 0, ENXIO);
662 return (ENXIO);
663 }
664
/* a vnode without UBC info is always handled as a metadata request, whatever the caller asked for */
665 if (!UBCINFOEXISTS(vp)) {
666 operation = NBLK_META;
667 } else if (bufsize < (uint32_t)nmp->nm_biosize) {
668 /* reg files should always have biosize blocks */
669 bufsize = nmp->nm_biosize;
670 }
671
672 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
673 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
674 FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
675
676 /* poke the delwri list */
677 nfs_buf_delwri_push(0);
678
679 /* sleep to let other threads run... */
/* the tsleep() result is not checked; this happens once, before the retry loop */
680 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
681 FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
682 }
683
/* every "goto loop" below re-enters here and retakes nfs_buf_mutex */
684 loop:
685 lck_mtx_lock(nfs_buf_mutex);
686
687 /* check for existence of nfsbuf in cache */
688 if ((bp = nfs_buf_incore(np, blkno))) {
689 /* if busy, set wanted and wait */
690 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
691 if (flags & NBLK_NOWAIT) {
/* returns success with *bpp still NULL */
692 lck_mtx_unlock(nfs_buf_mutex);
693 FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
694 return (0);
695 }
696 FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
697 SET(bp->nb_lflags, NBL_WANTED);
698
/*
 * The first wait uses PCATCH with no timeout; slpflag is then cleared, so
 * any later wait on a busy buffer is made without PCATCH but with a
 * 2 second timeout.
 */
699 ts.tv_sec = 2;
700 ts.tv_nsec = 0;
701 msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
702 "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
703 slpflag = 0;
/* NOTE(review): bp->nb_flags is read here after msleep(..PDROP..); presumably the mutex is no longer held at this point -- confirm (debug trace only) */
704 FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
705 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
706 FSDBG_BOT(541, np, blkno, 0, error);
707 return (error);
708 }
709 goto loop;
710 }
/* cache hit on an unbusy buffer: claim it and take it off its free list */
711 if (bp->nb_bufsize != bufsize)
712 panic("nfsbuf size mismatch");
713 SET(bp->nb_lflags, NBL_BUSY);
714 SET(bp->nb_flags, NB_CACHE);
715 nfs_buf_remfree(bp);
716 /* additional paranoia: */
717 if (ISSET(bp->nb_flags, NB_PAGELIST))
718 panic("pagelist buffer was not busy");
719 goto buffer_setup;
720 }
721
722 if (flags & NBLK_ONLYVALID) {
/* cache miss and the caller only wanted a cached buffer: success with *bpp still NULL */
723 lck_mtx_unlock(nfs_buf_mutex);
724 FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
725 return (0);
726 }
727
728 /*
729 * where to get a free buffer:
730 * - if meta and maxmeta reached, must reuse meta
731 * - alloc new if we haven't reached min bufs
732 * - if free lists are NOT empty
733 * - if free list is stale, use it
734 * - else if freemeta list is stale, use it
735 * - else if max bufs allocated, use least-time-to-stale
736 * - alloc new if we haven't reached max allowed
737 * - start clearing out delwri list and try again
738 */
739
/* bp is NULL on entry to this selection (the cache lookup above missed) */
740 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
741 /* if we've hit max meta buffers, must reuse a meta buffer */
742 bp = TAILQ_FIRST(&nfsbuffreemeta);
743 } else if ((nfsbufcnt > nfsbufmin) &&
744 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
745 /* try to pull an nfsbuf off a free list */
746 struct nfsbuf *lrubp, *metabp;
747 struct timeval now;
748 microuptime(&now);
749
750 /* if the next LRU or META buffer is invalid or stale, use it */
751 lrubp = TAILQ_FIRST(&nfsbuffree);
752 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
753 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
754 bp = lrubp;
755 metabp = TAILQ_FIRST(&nfsbuffreemeta);
756 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
757 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
758 bp = metabp;
759
760 if (!bp && (nfsbufcnt >= nfsbufmax)) {
761 /* we've already allocated all bufs, so */
762 /* choose the buffer that'll go stale first */
763 if (!metabp)
764 bp = lrubp;
765 else if (!lrubp)
766 bp = metabp;
767 else {
768 int32_t lru_stale_time, meta_stale_time;
769 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
770 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
771 if (lru_stale_time <= meta_stale_time)
772 bp = lrubp;
773 else
774 bp = metabp;
775 }
776 }
777 }
778
779 if (bp) {
780 /* we have a buffer to reuse */
781 FSDBG(544, np, blkno, bp, bp->nb_flags);
782 nfs_buf_remfree(bp);
783 if (ISSET(bp->nb_flags, NB_DELWRI))
784 panic("nfs_buf_get: delwri");
785 SET(bp->nb_lflags, NBL_BUSY);
786 /* disassociate buffer from previous nfsnode */
787 if (bp->nb_np) {
788 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
789 LIST_REMOVE(bp, nb_vnbufs);
790 bp->nb_vnbufs.le_next = NFSNOLIST;
791 }
792 bp->nb_np = NULL;
793 }
794 LIST_REMOVE(bp, nb_hash);
795 /* nuke any creds we're holding */
796 if (IS_VALID_CRED(bp->nb_rcred))
797 kauth_cred_unref(&bp->nb_rcred);
798 if (IS_VALID_CRED(bp->nb_wcred))
799 kauth_cred_unref(&bp->nb_wcred);
800 /* if buf will no longer be NB_META, dump old buffer */
/* nfsbufmetacnt is adjusted here when the buffer changes between meta and non-meta use */
801 if (operation == NBLK_META) {
802 if (!ISSET(bp->nb_flags, NB_META))
803 nfsbufmetacnt++;
804 } else if (ISSET(bp->nb_flags, NB_META)) {
805 if (bp->nb_data) {
806 kfree(bp->nb_data, bp->nb_bufsize);
807 bp->nb_data = NULL;
808 }
809 nfsbufmetacnt--;
810 }
811 /* re-init buf fields */
812 bp->nb_error = 0;
813 bp->nb_validoff = bp->nb_validend = -1;
814 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
815 bp->nb_valid = 0;
816 bp->nb_dirty = 0;
817 bp->nb_verf = 0;
818 } else {
819 /* no buffer to reuse */
820 if ((nfsbufcnt < nfsbufmax) &&
821 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
822 /* just alloc a new one */
823 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
824 if (!bp) {
825 lck_mtx_unlock(nfs_buf_mutex);
/* NOTE(review): "error" is still 0 here, so this trace logs 0 although ENOMEM is returned */
826 FSDBG_BOT(541, np, blkno, 0, error);
827 return (ENOMEM);
828 }
829 nfsbufcnt++;
830
831 /*
832 * If any excess bufs, make sure the timer
833 * is running to free them up later.
834 */
835 if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
836 nfs_buf_timer_on = 1;
837 nfs_interval_timer_start(nfs_buf_timer_call,
838 NFSBUF_FREE_PERIOD * 1000);
839 }
840
841 if (operation == NBLK_META)
842 nfsbufmetacnt++;
843 NFSBUFCNTCHK();
844 /* init nfsbuf */
845 bzero(bp, sizeof(*bp));
846 bp->nb_free.tqe_next = NFSNOLIST;
847 bp->nb_validoff = bp->nb_validend = -1;
848 FSDBG(545, np, blkno, bp, 0);
849 } else {
850 /* too many bufs... wait for buffers to free up */
851 FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
852
853 /* poke the delwri list */
854 nfs_buf_delwri_push(1);
855
/* nfs_buf_release() clears nfsneedbuffer when it sees it set; sleep with no timeout, then retry from "loop" */
856 nfsneedbuffer = 1;
857 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL);
858 FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
859 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
860 FSDBG_BOT(541, np, blkno, 0, error);
861 return (error);
862 }
863 goto loop;
864 }
865 }
866
867 /* set up nfsbuf */
/* reached only for a reused or newly allocated buffer; all nb_flags are cleared here, while a cache hit jumps past this to buffer_setup */
868 SET(bp->nb_lflags, NBL_BUSY);
869 bp->nb_flags = 0;
870 bp->nb_lblkno = blkno;
871 /* insert buf in hash */
872 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
873 /* associate buffer with new nfsnode */
874 bp->nb_np = np;
875 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
876
877 buffer_setup:
878
/* from here on the buffer is touched without nfs_buf_mutex; it was marked NBL_BUSY above */
879 /* unlock hash */
880 lck_mtx_unlock(nfs_buf_mutex);
881
882 switch (operation) {
883 case NBLK_META:
884 SET(bp->nb_flags, NB_META);
/* a meta buffer's data comes from kalloc(); existing data of a different size is freed and its valid/dirty state reset */
885 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
886 kfree(bp->nb_data, bp->nb_bufsize);
887 bp->nb_data = NULL;
888 bp->nb_validoff = bp->nb_validend = -1;
889 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
890 bp->nb_valid = 0;
891 bp->nb_dirty = 0;
892 CLR(bp->nb_flags, NB_CACHE);
893 }
894 if (!bp->nb_data)
895 bp->nb_data = kalloc(bufsize);
896 if (!bp->nb_data) {
897 /* Ack! couldn't allocate the data buffer! */
898 /* clean up buffer and return error */
/*
 * NOTE(review): the buffer still carries NB_META (set at the top of this
 * case) but is queued on nfsbuffree and counted in nfsbuffreecnt, whereas
 * nfs_buf_remfree() picks the list and counter from NB_META.  NBL_BUSY is
 * also left set and nfsbufmetacnt is not adjusted.  Confirm whether this
 * cleanup is consistent with the free-list accounting.
 */
899 lck_mtx_lock(nfs_buf_mutex);
900 LIST_REMOVE(bp, nb_vnbufs);
901 bp->nb_vnbufs.le_next = NFSNOLIST;
902 bp->nb_np = NULL;
903 /* invalidate usage timestamp to allow immediate freeing */
904 NBUFSTAMPINVALIDATE(bp);
905 if (bp->nb_free.tqe_next != NFSNOLIST)
906 panic("nfsbuf on freelist");
907 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
908 nfsbuffreecnt++;
909 lck_mtx_unlock(nfs_buf_mutex);
910 FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
911 return (ENOMEM);
912 }
913 bp->nb_bufsize = bufsize;
914 break;
915
916 case NBLK_READ:
917 case NBLK_WRITE:
918 /*
919 * Set or clear NB_READ now to let the UPL subsystem know
920 * if we intend to modify the pages or not.
921 */
922 if (operation == NBLK_READ) {
923 SET(bp->nb_flags, NB_READ);
924 } else {
925 CLR(bp->nb_flags, NB_READ);
926 }
/* read/write buffers are at least one page, and their valid range is reset on every get (recomputed by nfs_buf_upl_check() below when a UPL exists) */
927 if (bufsize < PAGE_SIZE)
928 bufsize = PAGE_SIZE;
929 bp->nb_bufsize = bufsize;
930 bp->nb_validoff = bp->nb_validend = -1;
931
932 if (UBCINFOEXISTS(vp)) {
933 /* set up upl */
934 if (nfs_buf_upl_setup(bp)) {
935 /* unable to create upl */
936 /* vm object must no longer exist */
937 /* clean up buffer and return error */
/* EIO is returned whichever error nfs_buf_upl_setup() reported (it can return EINVAL or EIO); NBL_BUSY is left set on the buffer */
938 lck_mtx_lock(nfs_buf_mutex);
939 LIST_REMOVE(bp, nb_vnbufs);
940 bp->nb_vnbufs.le_next = NFSNOLIST;
941 bp->nb_np = NULL;
942 /* invalidate usage timestamp to allow immediate freeing */
943 NBUFSTAMPINVALIDATE(bp);
944 if (bp->nb_free.tqe_next != NFSNOLIST)
945 panic("nfsbuf on freelist");
946 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
947 nfsbuffreecnt++;
948 lck_mtx_unlock(nfs_buf_mutex);
949 FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
950 return (EIO);
951 }
952 nfs_buf_upl_check(bp);
953 }
954 break;
955
956 default:
957 panic("nfs_buf_get: %d unknown operation", operation);
958 }
959
/* success: hand the busy buffer back to the caller */
960 *bpp = bp;
961
962 FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
963
964 return (0);
965 }
966
967 void
968 nfs_buf_release(struct nfsbuf *bp, int freeup)
969 {
970 nfsnode_t np = bp->nb_np;
971 vnode_t vp;
972 struct timeval now;
973 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
974
975 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
976 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
977 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
978
979 vp = np ? NFSTOV(np) : NULL;
980 if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
981 int upl_flags, rv;
982 upl_t upl;
983 uint32_t i;
984
985 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
986 rv = nfs_buf_upl_setup(bp);
987 if (rv)
988 printf("nfs_buf_release: upl create failed %d\n", rv);
989 else
990 nfs_buf_upl_check(bp);
991 }
992 upl = bp->nb_pagelist;
993 if (!upl)
994 goto pagelist_cleanup_done;
995 if (bp->nb_data) {
996 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
997 panic("ubc_upl_unmap failed");
998 bp->nb_data = NULL;
999 }
1000 /*
1001 * Abort the pages on error or: if this is an invalid or
1002 * non-needcommit nocache buffer AND no pages are dirty.
1003 */
1004 if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
1005 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
1006 if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE)))
1007 upl_flags = UPL_ABORT_DUMP_PAGES;
1008 else
1009 upl_flags = 0;
1010 ubc_upl_abort(upl, upl_flags);
1011 goto pagelist_cleanup_done;
1012 }
1013 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
1014 if (!NBPGVALID(bp,i))
1015 ubc_upl_abort_range(upl,
1016 i*PAGE_SIZE, PAGE_SIZE,
1017 UPL_ABORT_DUMP_PAGES |
1018 UPL_ABORT_FREE_ON_EMPTY);
1019 else {
1020 if (NBPGDIRTY(bp,i))
1021 upl_flags = UPL_COMMIT_SET_DIRTY;
1022 else
1023 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1024
1025 if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))
1026 upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
1027
1028 ubc_upl_commit_range(upl,
1029 i*PAGE_SIZE, PAGE_SIZE,
1030 upl_flags |
1031 UPL_COMMIT_INACTIVATE |
1032 UPL_COMMIT_FREE_ON_EMPTY);
1033 }
1034 }
1035 pagelist_cleanup_done:
1036 /* invalidate any pages past EOF */
1037 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
1038 off_t start, end;
1039 start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
1040 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1041 if (start < NBOFF(bp))
1042 start = NBOFF(bp);
1043 if (end > start) {
1044 if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
1045 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1046 }
1047 }
1048 CLR(bp->nb_flags, NB_PAGELIST);
1049 bp->nb_pagelist = NULL;
1050 }
1051
1052 lck_mtx_lock(nfs_buf_mutex);
1053
1054 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1055
1056 /* Wake up any processes waiting for any buffer to become free. */
1057 if (nfsneedbuffer) {
1058 nfsneedbuffer = 0;
1059 wakeup_needbuffer = 1;
1060 }
1061 /* Wake up any processes waiting for _this_ buffer to become free. */
1062 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1063 CLR(bp->nb_lflags, NBL_WANTED);
1064 wakeup_buffer = 1;
1065 }
1066
1067 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1068 if (ISSET(bp->nb_flags, NB_ERROR) ||
1069 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
1070 SET(bp->nb_flags, NB_INVAL);
1071
1072 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1073 /* If it's invalid or empty, dissociate it from its nfsnode */
1074 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1075 LIST_REMOVE(bp, nb_vnbufs);
1076 bp->nb_vnbufs.le_next = NFSNOLIST;
1077 }
1078 bp->nb_np = NULL;
1079 /* if this was a delayed write, wakeup anyone */
1080 /* waiting for delayed writes to complete */
1081 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1082 CLR(bp->nb_flags, NB_DELWRI);
1083 nfs_nbdwrite--;
1084 NFSBUFCNTCHK();
1085 wakeup_nbdwrite = 1;
1086 }
1087 /* invalidate usage timestamp to allow immediate freeing */
1088 NBUFSTAMPINVALIDATE(bp);
1089 /* put buffer at head of free list */
1090 if (bp->nb_free.tqe_next != NFSNOLIST)
1091 panic("nfsbuf on freelist");
1092 SET(bp->nb_flags, NB_INVAL);
1093 if (ISSET(bp->nb_flags, NB_META)) {
1094 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1095 nfsbuffreemetacnt++;
1096 } else {
1097 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1098 nfsbuffreecnt++;
1099 }
1100 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1101 /* put buffer at end of delwri list */
1102 if (bp->nb_free.tqe_next != NFSNOLIST)
1103 panic("nfsbuf on freelist");
1104 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1105 nfsbufdelwricnt++;
1106 freeup = 0;
1107 } else {
1108 /* update usage timestamp */
1109 microuptime(&now);
1110 bp->nb_timestamp = now.tv_sec;
1111 /* put buffer at end of free list */
1112 if (bp->nb_free.tqe_next != NFSNOLIST)
1113 panic("nfsbuf on freelist");
1114 if (ISSET(bp->nb_flags, NB_META)) {
1115 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1116 nfsbuffreemetacnt++;
1117 } else {
1118 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1119 nfsbuffreecnt++;
1120 }
1121 }
1122
1123 NFSBUFCNTCHK();
1124
1125 /* Unlock the buffer. */
1126 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1127 CLR(bp->nb_lflags, NBL_BUSY);
1128
1129 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1130
1131 lck_mtx_unlock(nfs_buf_mutex);
1132
1133 if (wakeup_needbuffer)
1134 wakeup(&nfsneedbuffer);
1135 if (wakeup_buffer)
1136 wakeup(bp);
1137 if (wakeup_nbdwrite)
1138 wakeup(&nfs_nbdwrite);
1139 if (freeup)
1140 NFS_BUF_FREEUP();
1141 }
1142
1143 /*
1144 * Wait for operations on the buffer to complete.
1145 * When they do, extract and return the I/O's error value.
1146 */
1147 int
1148 nfs_buf_iowait(struct nfsbuf *bp)
1149 {
1150 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1151
1152 lck_mtx_lock(nfs_buf_mutex);
1153
1154 while (!ISSET(bp->nb_flags, NB_DONE))
1155 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
1156
1157 lck_mtx_unlock(nfs_buf_mutex);
1158
1159 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1160
1161 /* check for interruption of I/O, then errors. */
1162 if (ISSET(bp->nb_flags, NB_EINTR)) {
1163 CLR(bp->nb_flags, NB_EINTR);
1164 return (EINTR);
1165 } else if (ISSET(bp->nb_flags, NB_ERROR))
1166 return (bp->nb_error ? bp->nb_error : EIO);
1167 return (0);
1168 }
1169
1170 /*
1171 * Mark I/O complete on a buffer.
1172 */
1173 void
1174 nfs_buf_iodone(struct nfsbuf *bp)
1175 {
1176
1177 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1178
1179 if (ISSET(bp->nb_flags, NB_DONE))
1180 panic("nfs_buf_iodone already");
1181
1182 if (!ISSET(bp->nb_flags, NB_READ)) {
1183 CLR(bp->nb_flags, NB_WRITEINPROG);
1184 /*
1185 * vnode_writedone() takes care of waking up
1186 * any throttled write operations
1187 */
1188 vnode_writedone(NFSTOV(bp->nb_np));
1189 nfs_node_lock_force(bp->nb_np);
1190 bp->nb_np->n_numoutput--;
1191 nfs_node_unlock(bp->nb_np);
1192 }
1193 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1194 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1195 nfs_buf_release(bp, 1);
1196 } else { /* or just wakeup the buffer */
1197 lck_mtx_lock(nfs_buf_mutex);
1198 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1199 CLR(bp->nb_lflags, NBL_WANTED);
1200 lck_mtx_unlock(nfs_buf_mutex);
1201 wakeup(bp);
1202 }
1203
1204 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1205 }
1206
1207 void
1208 nfs_buf_write_delayed(struct nfsbuf *bp)
1209 {
1210 nfsnode_t np = bp->nb_np;
1211
1212 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1213 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1214
1215 /*
1216 * If the block hasn't been seen before:
1217 * (1) Mark it as having been seen,
1218 * (2) Make sure it's on its node's correct block list,
1219 */
1220 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1221 SET(bp->nb_flags, NB_DELWRI);
1222 /* move to dirty list */
1223 lck_mtx_lock(nfs_buf_mutex);
1224 nfs_nbdwrite++;
1225 NFSBUFCNTCHK();
1226 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1227 LIST_REMOVE(bp, nb_vnbufs);
1228 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
1229 lck_mtx_unlock(nfs_buf_mutex);
1230 }
1231
1232 /*
1233 * If the vnode has "too many" write operations in progress
1234 * wait for them to finish the IO
1235 */
1236 vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1237
1238 /* the file is in a modified state, so make sure the flag's set */
1239 nfs_node_lock_force(np);
1240 np->n_flag |= NMODIFIED;
1241 nfs_node_unlock(np);
1242
1243 /*
1244 * If we have too many delayed write buffers,
1245 * just fall back to doing the async write.
1246 */
1247 if (nfs_nbdwrite < 0)
1248 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1249 if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
1250 /* issue async write */
1251 SET(bp->nb_flags, NB_ASYNC);
1252 nfs_buf_write(bp);
1253 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1254 return;
1255 }
1256
1257 /* Otherwise, the "write" is done, so mark and release the buffer. */
1258 SET(bp->nb_flags, NB_DONE);
1259 nfs_buf_release(bp, 1);
1260 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1261 return;
1262 }
1263
1264 /*
1265 * Check that a "needcommit" buffer can still be committed.
1266 * If the write verifier has changed, we need to clear the
1267 * the needcommit flag.
1268 */
1269 void
1270 nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
1271 {
1272 struct nfsmount *nmp;
1273
1274 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
1275 return;
1276
1277 nmp = NFSTONMP(np);
1278 if (!nmp)
1279 return;
1280 if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
1281 return;
1282
1283 /* write verifier changed, clear commit/wverf flags */
1284 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1285 bp->nb_verf = 0;
1286 nfs_node_lock_force(np);
1287 np->n_needcommitcnt--;
1288 CHECK_NEEDCOMMITCNT(np);
1289 nfs_node_unlock(np);
1290 }
1291
1292 /*
1293 * add a reference to a buffer so it doesn't disappear while being used
1294 * (must be called with nfs_buf_mutex held)
1295 */
1296 void
1297 nfs_buf_refget(struct nfsbuf *bp)
1298 {
1299 bp->nb_refs++;
1300 }
1301 /*
1302 * release a reference on a buffer
1303 * (must be called with nfs_buf_mutex held)
1304 */
1305 void
1306 nfs_buf_refrele(struct nfsbuf *bp)
1307 {
1308 bp->nb_refs--;
1309 }
1310
1311 /*
1312 * mark a particular buffer as BUSY
1313 * (must be called with nfs_buf_mutex held)
1314 */
1315 errno_t
1316 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1317 {
1318 errno_t error;
1319 struct timespec ts;
1320
1321 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1322 /*
1323 * since the lck_mtx_lock may block, the buffer
1324 * may become BUSY, so we need to recheck for
1325 * a NOWAIT request
1326 */
1327 if (flags & NBAC_NOWAIT)
1328 return (EBUSY);
1329 SET(bp->nb_lflags, NBL_WANTED);
1330
1331 ts.tv_sec = (slptimeo/100);
1332 /* the hz value is 100; which leads to 10ms */
1333 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1334
1335 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1336 "nfs_buf_acquire", &ts);
1337 if (error)
1338 return (error);
1339 return (EAGAIN);
1340 }
1341 if (flags & NBAC_REMOVE)
1342 nfs_buf_remfree(bp);
1343 SET(bp->nb_lflags, NBL_BUSY);
1344
1345 return (0);
1346 }
1347
1348 /*
1349 * simply drop the BUSY status of a buffer
1350 * (must be called with nfs_buf_mutex held)
1351 */
1352 void
1353 nfs_buf_drop(struct nfsbuf *bp)
1354 {
1355 int need_wakeup = 0;
1356
1357 if (!ISSET(bp->nb_lflags, NBL_BUSY))
1358 panic("nfs_buf_drop: buffer not busy!");
1359 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1360 /* delay the actual wakeup until after we clear NBL_BUSY */
1361 need_wakeup = 1;
1362 }
1363 /* Unlock the buffer. */
1364 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1365
1366 if (need_wakeup)
1367 wakeup(bp);
1368 }
1369
1370 /*
1371 * prepare for iterating over an nfsnode's buffer list
1372 * this lock protects the queue manipulation
1373 * (must be called with nfs_buf_mutex held)
1374 */
1375 int
1376 nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1377 {
1378 struct nfsbuflists *listheadp;
1379
1380 if (flags & NBI_DIRTY)
1381 listheadp = &np->n_dirtyblkhd;
1382 else
1383 listheadp = &np->n_cleanblkhd;
1384
1385 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1386 LIST_INIT(iterheadp);
1387 return(EWOULDBLOCK);
1388 }
1389
1390 while (np->n_bufiterflags & NBI_ITER) {
1391 np->n_bufiterflags |= NBI_ITERWANT;
1392 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
1393 }
1394 if (LIST_EMPTY(listheadp)) {
1395 LIST_INIT(iterheadp);
1396 return(EINVAL);
1397 }
1398 np->n_bufiterflags |= NBI_ITER;
1399
1400 iterheadp->lh_first = listheadp->lh_first;
1401 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1402 LIST_INIT(listheadp);
1403
1404 return(0);
1405 }
1406
1407 /*
1408 * clean up after iterating over an nfsnode's buffer list
1409 * this lock protects the queue manipulation
1410 * (must be called with nfs_buf_mutex held)
1411 */
1412 void
1413 nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1414 {
1415 struct nfsbuflists * listheadp;
1416 struct nfsbuf *bp;
1417
1418 if (flags & NBI_DIRTY)
1419 listheadp = &np->n_dirtyblkhd;
1420 else
1421 listheadp = &np->n_cleanblkhd;
1422
1423 while (!LIST_EMPTY(iterheadp)) {
1424 bp = LIST_FIRST(iterheadp);
1425 LIST_REMOVE(bp, nb_vnbufs);
1426 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1427 }
1428
1429 np->n_bufiterflags &= ~NBI_ITER;
1430 if (np->n_bufiterflags & NBI_ITERWANT) {
1431 np->n_bufiterflags &= ~NBI_ITERWANT;
1432 wakeup(&np->n_bufiterflags);
1433 }
1434 }
1435
1436
1437 /*
1438 * Read an NFS buffer for a file.
1439 */
1440 int
1441 nfs_buf_read(struct nfsbuf *bp)
1442 {
1443 int error = 0;
1444 nfsnode_t np;
1445 thread_t thd;
1446 kauth_cred_t cred;
1447
1448 np = bp->nb_np;
1449 cred = bp->nb_rcred;
1450 if (IS_VALID_CRED(cred))
1451 kauth_cred_ref(cred);
1452 thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1453
1454 /* sanity checks */
1455 if (!ISSET(bp->nb_flags, NB_READ))
1456 panic("nfs_buf_read: !NB_READ");
1457 if (ISSET(bp->nb_flags, NB_DONE))
1458 CLR(bp->nb_flags, NB_DONE);
1459
1460 NFS_BUF_MAP(bp);
1461
1462 OSAddAtomic(1, &nfsstats.read_bios);
1463
1464 error = nfs_buf_read_rpc(bp, thd, cred);
1465 /*
1466 * For async I/O, the callbacks will finish up the
1467 * read. Otherwise, the read has already been finished.
1468 */
1469
1470 if (IS_VALID_CRED(cred))
1471 kauth_cred_unref(&cred);
1472 return (error);
1473 }
1474
1475 /*
1476 * finish the reading of a buffer
1477 */
1478 void
1479 nfs_buf_read_finish(struct nfsbuf *bp)
1480 {
1481 nfsnode_t np = bp->nb_np;
1482 struct nfsmount *nmp;
1483
1484 if (!ISSET(bp->nb_flags, NB_ERROR)) {
1485 /* update valid range */
1486 bp->nb_validoff = 0;
1487 bp->nb_validend = bp->nb_endio;
1488 if (bp->nb_endio < (int)bp->nb_bufsize) {
1489 /*
1490 * The read may be short because we have unflushed writes
1491 * that are extending the file size and the reads hit the
1492 * (old) EOF on the server. So, just make sure nb_validend
1493 * correctly tracks EOF.
1494 * Note that the missing data should have already been zeroed
1495 * in nfs_buf_read_rpc_finish().
1496 */
1497 off_t boff = NBOFF(bp);
1498 if ((off_t)np->n_size >= (boff + bp->nb_bufsize))
1499 bp->nb_validend = bp->nb_bufsize;
1500 else if ((off_t)np->n_size >= boff)
1501 bp->nb_validend = np->n_size - boff;
1502 else
1503 bp->nb_validend = 0;
1504 }
1505 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
1506 ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL))
1507 bp->nb_validend = 0x100000000LL - NBOFF(bp);
1508 bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
1509 if (bp->nb_validend & PAGE_MASK) {
1510 /* zero-fill remainder of last page */
1511 bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend);
1512 }
1513 }
1514 nfs_buf_iodone(bp);
1515 }
1516
1517 /*
1518 * initiate the NFS READ RPC(s) for a buffer
 *
 * The buffer's range is split into requests of at most nm_rsize bytes
 * and bp->nb_rpcs is set to the number of requests needed.  When the
 * buffer is NB_ASYNC each request completes via the
 * nfs_buf_read_rpc_finish callback; otherwise nfs_buf_read_rpc_finish
 * is called inline after each request is issued.
 *
 * Returns 0 or an errno value: ENXIO if the node has no mount, EFBIG
 * for an NFSv2 buffer offset above 0xffffffff, or the error from
 * issuing/finishing a request.  On those two early errors the buffer is
 * flagged NB_ERROR and nfs_buf_iodone() is called before returning.
1519 */
1520 int
1521 nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1522 {
1523 struct nfsmount *nmp;
1524 nfsnode_t np = bp->nb_np;
1525 int error = 0, nfsvers, async;
1526 int offset, nrpcs;
1527 uint32_t nmrsize, length, len;
1528 off_t boff;
1529 struct nfsreq *req;
1530 struct nfsreq_cbinfo cb;
1531
1532 nmp = NFSTONMP(np);
1533 if (!nmp) {
1534 bp->nb_error = error = ENXIO;
1535 SET(bp->nb_flags, NB_ERROR);
1536 nfs_buf_iodone(bp);
1537 return (error);
1538 }
1539 nfsvers = nmp->nm_vers;
1540 nmrsize = nmp->nm_rsize;
1541
1542 boff = NBOFF(bp);
1543 offset = 0;
1544 length = bp->nb_bufsize;
1545
1546 if (nfsvers == NFS_VER2) {
1547 if (boff > 0xffffffffLL) {
1548 bp->nb_error = error = EFBIG;
1549 SET(bp->nb_flags, NB_ERROR);
1550 nfs_buf_iodone(bp);
1551 return (error);
1552 }
/* NFSv2: clip the request so that it ends at the 4GB boundary */
1553 if ((boff + length - 1) > 0xffffffffLL)
1554 length = 0x100000000LL - boff;
1555 }
1556
1557 /* Note: Can only do async I/O if nfsiods are configured. */
1558 async = (bp->nb_flags & NB_ASYNC);
1559 cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1560 cb.rcb_bp = bp;
1561
1562 bp->nb_offio = bp->nb_endio = 0;
/* number of RPCs needed: length divided by nm_rsize, rounded up */
1563 bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1564 if (async && (nrpcs > 1)) {
1565 SET(bp->nb_flags, NB_MULTASYNCRPC);
1566 } else {
1567 CLR(bp->nb_flags, NB_MULTASYNCRPC);
1568 }
1569
/* issue one RPC per nm_rsize-sized piece until the whole length is covered */
1570 while (length > 0) {
1571 if (ISSET(bp->nb_flags, NB_ERROR)) {
1572 error = bp->nb_error;
1573 break;
1574 }
1575 len = (length > nmrsize) ? nmrsize : length;
/* callback args: [0] = offset within the buffer, [1] = request length */
1576 cb.rcb_args[0] = offset;
1577 cb.rcb_args[1] = len;
1578 if (nmp->nm_vers >= NFS_VER4)
1579 cb.rcb_args[2] = nmp->nm_stategenid;
1580 req = NULL;
1581 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
1582 if (error)
1583 break;
1584 offset += len;
1585 length -= len;
/* async: the callback will finish this RPC, so go issue the next one */
1586 if (async)
1587 continue;
1588 nfs_buf_read_rpc_finish(req);
1589 if (ISSET(bp->nb_flags, NB_ERROR)) {
1590 error = bp->nb_error;
1591 break;
1592 }
1593 }
1594
1595 if (length > 0) {
1596 /*
1597 * Something bad happened while trying to send the RPC(s).
1598 * Wait for any outstanding requests to complete.
1599 */
1600 bp->nb_error = error;
1601 SET(bp->nb_flags, NB_ERROR);
1602 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
/* take the RPCs that were never sent out of the outstanding count */
1603 nrpcs = (length + nmrsize - 1) / nmrsize;
1604 lck_mtx_lock(nfs_buf_mutex);
1605 bp->nb_rpcs -= nrpcs;
1606 if (bp->nb_rpcs == 0) {
1607 /* No RPCs left, so the buffer's done */
1608 lck_mtx_unlock(nfs_buf_mutex);
1609 nfs_buf_iodone(bp);
1610 } else {
1611 /* wait for the last RPC to mark it done */
1612 while (bp->nb_rpcs > 0)
1613 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
1614 "nfs_buf_read_rpc_cancel", NULL);
1615 lck_mtx_unlock(nfs_buf_mutex);
1616 }
1617 } else {
1618 nfs_buf_iodone(bp);
1619 }
1620 }
1621
1622 return (error);
1623 }
1624
1625 /*
1626 * finish up an NFS READ RPC on a buffer
 *
 * Installed by nfs_buf_read_rpc() as the completion callback for async
 * reads and called directly by it for non-async reads.
 *
 * Finishes the request into the buffer's data at the offset recorded in
 * the request's callback info, advances nb_endio, zero-fills past short
 * data where appropriate, and issues a follow-up READ for the remainder
 * of a short read.  Once no further RPC is needed for this request, the
 * buffer's outstanding RPC count is decremented and the last RPC calls
 * nfs_buf_read_finish().  Errors are recorded on the buffer
 * (NB_ERROR / nb_error) rather than returned.
1627 */
1628 void
1629 nfs_buf_read_rpc_finish(struct nfsreq *req)
1630 {
1631 struct nfsmount *nmp;
1632 size_t rlen;
1633 struct nfsreq_cbinfo cb;
1634 struct nfsbuf *bp;
1635 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1636 void *wakeme = NULL;
1637 struct nfsreq *rreq = NULL;
1638 nfsnode_t np;
1639 thread_t thd;
1640 kauth_cred_t cred;
1641 uio_t auio;
1642 char uio_buf [ UIO_SIZEOF(1) ];
1643
/* also re-entered via goto to finish a follow-up RPC issued for a non-async read */
1644 finish:
1645 np = req->r_np;
1646 thd = req->r_thread;
1647 cred = req->r_cred;
1648 if (IS_VALID_CRED(cred))
1649 kauth_cred_ref(cred);
1650 cb = req->r_callback;
1651 bp = cb.rcb_bp;
1652
1653 nmp = NFSTONMP(np);
1654 if (!nmp) {
1655 SET(bp->nb_flags, NB_ERROR);
1656 bp->nb_error = error = ENXIO;
1657 }
1658 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1659 /* just drop it */
1660 nfs_request_async_cancel(req);
1661 goto out;
1662 }
1663
1664 nfsvers = nmp->nm_vers;
/* callback args: [0] = offset within the buffer, [1] = requested length */
1665 offset = cb.rcb_args[0];
1666 rlen = length = cb.rcb_args[1];
1667
1668 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
1669 UIO_READ, &uio_buf, sizeof(uio_buf));
1670 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
1671
1672 /* finish the RPC */
1673 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
1674 if ((error == EINPROGRESS) && cb.rcb_func) {
1675 /* async request restarted */
1676 if (IS_VALID_CRED(cred))
1677 kauth_cred_unref(&cred);
1678 return;
1679 }
/* NFSv4 state error: kick off recovery if needed, wait for it, then retry the read */
1680 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1681 lck_mtx_lock(&nmp->nm_lock);
1682 if ((error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) {
1683 printf("nfs_buf_read_rpc_finish: error %d, initiating recovery\n", error);
1684 nmp->nm_state |= NFSSTA_RECOVER;
1685 nfs_mount_sock_thread_wake(nmp);
1686 }
1687 lck_mtx_unlock(&nmp->nm_lock);
1688 if (error == NFSERR_GRACE)
1689 tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
1690 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
/* nothing was read, so the retry covers the same offset and length */
1691 rlen = 0;
1692 goto readagain;
1693 }
1694 }
1695 if (error) {
1696 SET(bp->nb_flags, NB_ERROR);
1697 bp->nb_error = error;
1698 goto out;
1699 }
1700
/* advance nb_endio if this RPC's data reaches past what has been read so far */
1701 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen)))
1702 bp->nb_endio = offset + rlen;
1703
1704 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1705 /* zero out the remaining data (up to EOF) */
1706 off_t rpcrem, eofrem, rem;
1707 rpcrem = (length - rlen);
1708 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1709 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1710 if (rem > 0)
1711 bzero(bp->nb_data + offset + rlen, rem);
1712 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1713 /*
1714 * short read
1715 *
1716 * We haven't hit EOF and we didn't get all the data
1717 * requested, so we need to issue another read for the rest.
1718 * (Don't bother if the buffer already hit an error.)
1719 */
1720 readagain:
1721 offset += rlen;
1722 length -= rlen;
1723 cb.rcb_args[0] = offset;
1724 cb.rcb_args[1] = length;
1725 if (nmp->nm_vers >= NFS_VER4)
1726 cb.rcb_args[2] = nmp->nm_stategenid;
1727 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1728 if (!error) {
1729 if (IS_VALID_CRED(cred))
1730 kauth_cred_unref(&cred);
1731 if (!cb.rcb_func) {
1732 /* if !async we'll need to wait for this RPC to finish */
1733 req = rreq;
1734 rreq = NULL;
1735 goto finish;
1736 }
1737 /*
1738 * We're done here.
1739 * Outstanding RPC count is unchanged.
1740 * Callback will be called when RPC is done.
1741 */
1742 return;
1743 }
1744 SET(bp->nb_flags, NB_ERROR);
1745 bp->nb_error = error;
1746 }
1747
1748 out:
1749 if (IS_VALID_CRED(cred))
1750 kauth_cred_unref(&cred);
1751
1752 /*
1753 * Decrement outstanding RPC count on buffer
1754 * and call nfs_buf_read_finish on last RPC.
1755 *
1756 * (Note: when there are multiple async RPCs issued for a
1757 * buffer we need nfs_buf_mutex to avoid problems when
1758 * aborting a partially-initiated set of RPCs)
1759 */
1760
1761 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1762 if (multasyncrpc)
1763 lck_mtx_lock(nfs_buf_mutex);
1764
1765 bp->nb_rpcs--;
1766 finished = (bp->nb_rpcs == 0);
1767
1768 if (multasyncrpc)
1769 lck_mtx_unlock(nfs_buf_mutex);
1770
1771 if (finished) {
/* nfs_buf_read_rpc() may be sleeping on &bp->nb_rpcs after a failed send; wake it once the buffer is finished */
1772 if (multasyncrpc)
1773 wakeme = &bp->nb_rpcs;
1774 nfs_buf_read_finish(bp);
1775 if (wakeme)
1776 wakeup(wakeme);
1777 }
1778 }
1779
1780 /*
1781 * Do buffer readahead.
1782 * Initiate async I/O to read buffers not in cache.
1783 */
1784 int
1785 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1786 {
1787 struct nfsmount *nmp = NFSTONMP(np);
1788 struct nfsbuf *bp;
1789 int error = 0, nra;
1790
1791 if (!nmp)
1792 return (ENXIO);
1793 if (nmp->nm_readahead <= 0)
1794 return (0);
1795 if (*rabnp > lastrabn)
1796 return (0);
1797
1798 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1799 /* check if block exists and is valid. */
1800 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1801 /* stop reading ahead if we're beyond EOF */
1802 *rabnp = lastrabn;
1803 break;
1804 }
1805 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
1806 if (error)
1807 break;
1808 nfs_node_lock_force(np);
1809 np->n_lastrahead = *rabnp;
1810 nfs_node_unlock(np);
1811 if (!bp)
1812 continue;
1813 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1814 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) {
1815 CLR(bp->nb_flags, NB_CACHE);
1816 bp->nb_valid = 0;
1817 bp->nb_validoff = bp->nb_validend = -1;
1818 }
1819 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1820 !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1821 SET(bp->nb_flags, (NB_READ|NB_ASYNC));
1822 if (ioflag & IO_NOCACHE)
1823 SET(bp->nb_flags, NB_NCRDAHEAD);
1824 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
1825 kauth_cred_ref(cred);
1826 bp->nb_rcred = cred;
1827 }
1828 if ((error = nfs_buf_read(bp)))
1829 break;
1830 continue;
1831 }
1832 nfs_buf_release(bp, 1);
1833 }
1834 return (error);
1835 }
1836
1837 /*
1838 * NFS buffer I/O for reading files.
 *
 * Copies the data described by uio out of the node's cached data,
 * reading buffers in (and starting readahead) as needed.  Only regular
 * files (VREG) are handled; other vnode types get EINVAL.
 *
 * For each block in the request the loop below: copies what it can
 * straight from cached UBC pages (userspace uios only), starts any
 * readaheads, gets the block's buffer, brings any invalid pages in the
 * read range up to date (writing the buffer out first if dirty data is
 * in the way), and then copies the data out with uiomove().
 *
 * Returns 0 on success (including a zero-length request or reaching
 * EOF), EINVAL for a negative offset, or the error from a failed
 * lock, getattr, invalidate, buffer, or RPC operation.
1839 */
1840 int
1841 nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
1842 {
1843 vnode_t vp = NFSTOV(np);
1844 struct nfsbuf *bp = NULL;
1845 struct nfs_vattr nvattr;
1846 struct nfsmount *nmp = VTONMP(vp);
1847 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
1848 off_t diff;
1849 int error = 0, n = 0, on = 0;
1850 int nfsvers, biosize, modified, readaheads = 0;
1851 thread_t thd;
1852 kauth_cred_t cred;
1853 int64_t io_resid;
1854
1855 FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
1856
/* NOTE(review): nmp is dereferenced without a NULL check here, whereas nfs_buf_readahead() returns ENXIO for a missing mount - confirm callers guarantee the mount exists */
1857 nfsvers = nmp->nm_vers;
1858 biosize = nmp->nm_biosize;
1859 thd = vfs_context_thread(ctx);
1860 cred = vfs_context_ucred(ctx);
1861
1862 if (vnode_vtype(vp) != VREG) {
1863 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
1864 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
1865 return (EINVAL);
1866 }
1867
1868 /*
1869 * For NFS, cache consistency can only be maintained approximately.
1870 * Although RFC1094 does not specify the criteria, the following is
1871 * believed to be compatible with the reference port.
1872 *
1873 * If the file has changed since the last read RPC or you have
1874 * written to the file, you may have lost data cache consistency
1875 * with the server. So, check for a change, and flush all of the
1876 * file's data out of the cache.
1877 * NB: This implies that cache data can be read when up to
1878 * NFS_MAXATTRTIMO seconds out of date. If you find that you
1879 * need current attributes, nfs_getattr() can be forced to fetch
1880 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
1881 */
1882
1883 if (ISSET(np->n_flag, NUPDATESIZE))
1884 nfs_data_update_size(np, 0);
1885
1886 if ((error = nfs_node_lock(np))) {
1887 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
1888 return (error);
1889 }
1890
/* a pending invalidation request is handled now; the node lock is dropped around nfs_vinvalbuf() and retaken afterwards */
1891 if (np->n_flag & NNEEDINVALIDATE) {
1892 np->n_flag &= ~NNEEDINVALIDATE;
1893 nfs_node_unlock(np);
1894 error = nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
1895 if (!error)
1896 error = nfs_node_lock(np);
1897 if (error) {
1898 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
1899 return (error);
1900 }
1901 }
1902
1903 modified = (np->n_flag & NMODIFIED);
1904 nfs_node_unlock(np);
1905 /* nfs_getattr() will check changed and purge caches */
1906 error = nfs_getattr(np, &nvattr, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
1907 if (error) {
1908 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
1909 return (error);
1910 }
1911
1912 if (uio_resid(uio) == 0) {
1913 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
1914 return (0);
1915 }
1916 if (uio_offset(uio) < 0) {
1917 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
1918 return (EINVAL);
1919 }
1920
1921 /*
1922 * set up readahead - which may be limited by:
1923 * + current request length (for IO_NOCACHE)
1924 * + readahead setting
1925 * + file size
1926 */
1927 if (nmp->nm_readahead > 0) {
1928 off_t end = uio_offset(uio) + uio_resid(uio);
1929 if (end > (off_t)np->n_size)
1930 end = np->n_size;
1931 rabn = uio_offset(uio) / biosize;
1932 maxrabn = (end - 1) / biosize;
1933 nfs_node_lock_force(np);
/* extend readahead past the request only for cached I/O that starts at block 0, the last block read, or the block after it */
1934 if (!(ioflag & IO_NOCACHE) &&
1935 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
1936 maxrabn += nmp->nm_readahead;
1937 if ((maxrabn * biosize) >= (off_t)np->n_size)
1938 maxrabn = ((off_t)np->n_size - 1)/biosize;
1939 }
1940 if (maxrabn < np->n_lastrahead)
1941 np->n_lastrahead = -1;
1942 if (rabn < np->n_lastrahead)
1943 rabn = np->n_lastrahead + 1;
1944 nfs_node_unlock(np);
1945 } else {
1946 rabn = maxrabn = 0;
1947 }
1948
/* one pass per block of the request; the shared data lock is held for each pass */
1949 do {
1950
1951 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
1952 lbn = uio_offset(uio) / biosize;
1953
1954 /*
1955 * Copy directly from any cached pages without grabbing the bufs.
1956 * (If we are NOCACHE and we've issued readahead requests, we need
1957 * to grab the NB_NCRDAHEAD bufs to drop them.)
1958 */
1959 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
1960 ((uio->uio_segflg == UIO_USERSPACE32 ||
1961 uio->uio_segflg == UIO_USERSPACE64 ||
1962 uio->uio_segflg == UIO_USERSPACE))) {
1963 io_resid = uio_resid(uio);
1964 diff = np->n_size - uio_offset(uio);
1965 if (diff < io_resid)
1966 io_resid = diff;
1967 if (io_resid > 0) {
1968 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
1969 error = cluster_copy_ubc_data(vp, uio, &count, 0);
1970 if (error) {
1971 nfs_data_unlock(np);
1972 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
1973 return (error);
1974 }
1975 }
1976 /* count any biocache reads that we just copied directly */
1977 if (lbn != (uio_offset(uio)/biosize)) {
1978 OSAddAtomic((uio_offset(uio)/biosize) - lbn, &nfsstats.biocache_reads);
1979 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
1980 }
1981 }
1982
/* lbn = logical block number, on = byte offset within that block */
1983 lbn = uio_offset(uio) / biosize;
1984 on = uio_offset(uio) % biosize;
1985 nfs_node_lock_force(np);
1986 np->n_lastread = (uio_offset(uio) - 1) / biosize;
1987 nfs_node_unlock(np);
1988
1989 /* adjust readahead block number, if necessary */
1990 if (rabn < lbn)
1991 rabn = lbn;
1992 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
1993 if (rabn <= lastrabn) { /* start readaheads */
1994 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
1995 if (error) {
1996 nfs_data_unlock(np);
1997 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
1998 return (error);
1999 }
2000 readaheads = 1;
2001 }
2002
/* done if the request has been satisfied or we've reached EOF */
2003 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2004 nfs_data_unlock(np);
2005 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
2006 return (0);
2007 }
2008
2009 OSAddAtomic(1, &nfsstats.biocache_reads);
2010
2011 /*
2012 * If the block is in the cache and has the required data
2013 * in a valid region, just copy it out.
2014 * Otherwise, get the block and write back/read in,
2015 * as required.
2016 */
2017 again:
/* n = bytes to copy from this block, limited by the request, the block, and EOF */
2018 io_resid = uio_resid(uio);
2019 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2020 diff = np->n_size - uio_offset(uio);
2021 if (diff < n)
2022 n = diff;
2023
2024 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2025 if (error) {
2026 nfs_data_unlock(np);
2027 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2028 return (error);
2029 }
2030
2031 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2032 /*
2033 * IO_NOCACHE found a cached buffer.
2034 * Flush the buffer if it's dirty.
2035 * Invalidate the data if it wasn't just read
2036 * in as part of a "nocache readahead".
2037 */
2038 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2039 /* so write the buffer out and try again */
2040 SET(bp->nb_flags, NB_NOCACHE);
2041 goto flushbuffer;
2042 }
2043 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2044 CLR(bp->nb_flags, NB_NCRDAHEAD);
2045 SET(bp->nb_flags, NB_NOCACHE);
2046 }
2047 }
2048
2049 /* if any pages are valid... */
2050 if (bp->nb_valid) {
2051 /* ...check for any invalid pages in the read range */
/* firstpg/lastpg bracket the invalid pages; dirtypg is the first dirty page found after firstpg */
2052 int pg, firstpg, lastpg, dirtypg;
2053 dirtypg = firstpg = lastpg = -1;
2054 pg = on/PAGE_SIZE;
2055 while (pg <= (on + n - 1)/PAGE_SIZE) {
2056 if (!NBPGVALID(bp,pg)) {
2057 if (firstpg < 0)
2058 firstpg = pg;
2059 lastpg = pg;
2060 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
2061 dirtypg = pg;
2062 pg++;
2063 }
2064
2065 /* if there are no invalid pages, we're all set */
2066 if (firstpg < 0) {
2067 if (bp->nb_validoff < 0) {
2068 /* valid range isn't set up, so */
2069 /* set it to what we know is valid */
2070 bp->nb_validoff = trunc_page(on);
2071 bp->nb_validend = round_page(on+n);
2072 nfs_buf_normalize_valid_range(np, bp);
2073 }
2074 goto buffer_ready;
2075 }
2076
2077 /* there are invalid pages in the read range */
2078 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2079 (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) {
2080 /* there are also dirty page(s) (or range) in the read range, */
2081 /* so write the buffer out and try again */
2082 flushbuffer:
2083 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2084 SET(bp->nb_flags, NB_ASYNC);
2085 if (!IS_VALID_CRED(bp->nb_wcred)) {
2086 kauth_cred_ref(cred);
2087 bp->nb_wcred = cred;
2088 }
2089 error = nfs_buf_write(bp);
2090 if (error) {
2091 nfs_data_unlock(np);
2092 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2093 return (error);
2094 }
2095 goto again;
2096 }
2097 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2098 (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) {
2099 /* we need to read in more than half the buffer and the */
2100 /* buffer's not dirty, so just fetch the whole buffer */
2101 bp->nb_valid = 0;
2102 } else {
2103 /* read the page range in */
2104 uio_t auio;
2105 char uio_buf[ UIO_SIZEOF(1) ];
2106
2107 NFS_BUF_MAP(bp);
2108 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2109 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2110 if (!auio) {
2111 error = ENOMEM;
2112 } else {
2113 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
2114 ((lastpg - firstpg + 1) * PAGE_SIZE));
2115 error = nfs_read_rpc(np, auio, ctx);
2116 }
2117 if (error) {
2118 if (ioflag & IO_NOCACHE)
2119 SET(bp->nb_flags, NB_NOCACHE);
2120 nfs_buf_release(bp, 1);
2121 nfs_data_unlock(np);
2122 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2123 return (error);
2124 }
2125 /* Make sure that the valid range is set to cover this read. */
2126 bp->nb_validoff = trunc_page_32(on);
2127 bp->nb_validend = round_page_32(on+n);
2128 nfs_buf_normalize_valid_range(np, bp);
2129 if (uio_resid(auio) > 0) {
2130 /* if short read, must have hit EOF, */
2131 /* so zero the rest of the range */
2132 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2133 }
2134 /* mark the pages (successfully read) as valid */
2135 for (pg=firstpg; pg <= lastpg; pg++)
2136 NBPGVALID_SET(bp,pg);
2137 }
2138 }
2139 /* if no pages are valid, read the whole block */
2140 if (!bp->nb_valid) {
2141 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2142 kauth_cred_ref(cred);
2143 bp->nb_rcred = cred;
2144 }
2145 SET(bp->nb_flags, NB_READ);
2146 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2147 error = nfs_buf_read(bp);
2148 if (ioflag & IO_NOCACHE)
2149 SET(bp->nb_flags, NB_NOCACHE);
2150 if (error) {
2151 nfs_data_unlock(np);
2152 nfs_buf_release(bp, 1);
2153 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2154 return (error);
2155 }
2156 }
2157 buffer_ready:
2158 /* validate read range against valid range and clip */
2159 if (bp->nb_validend > 0) {
2160 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2161 if (diff < n)
2162 n = diff;
2163 }
2164 if (n > 0) {
2165 NFS_BUF_MAP(bp);
2166 error = uiomove(bp->nb_data + on, n, uio);
2167 }
2168
2169 nfs_buf_release(bp, 1);
2170 nfs_data_unlock(np);
2171 nfs_node_lock_force(np);
2172 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2173 nfs_node_unlock(np);
/* keep going while there was no error, data is still wanted, and this pass copied something */
2174 } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2175 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
2176 return (error);
2177 }
2178
2179 /*
2180 * limit the number of outstanding async I/O writes
2181 */
2182 int
2183 nfs_async_write_start(struct nfsmount *nmp)
2184 {
2185 int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
2186 struct timespec ts = {1, 0};
2187
2188 if (nfs_max_async_writes <= 0)
2189 return (0);
2190 lck_mtx_lock(&nmp->nm_lock);
2191 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2192 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
2193 break;
2194 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts);
2195 slpflag = 0;
2196 }
2197 if (!error)
2198 nmp->nm_asyncwrites++;
2199 lck_mtx_unlock(&nmp->nm_lock);
2200 return (error);
2201 }
2202 void
2203 nfs_async_write_done(struct nfsmount *nmp)
2204 {
2205 if (nmp->nm_asyncwrites <= 0)
2206 return;
2207 lck_mtx_lock(&nmp->nm_lock);
2208 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes)
2209 wakeup(&nmp->nm_asyncwrites);
2210 lck_mtx_unlock(&nmp->nm_lock);
2211 }
2212
2213 /*
2214 * write (or commit) the given NFS buffer
2215 *
2216 * Commit the buffer if we can.
2217 * Write out any dirty range.
2218 * If any dirty pages remain, write them out.
2219 * Mark buffer done.
2220 *
2221 * For async requests, all the work beyond sending the initial
2222 * write RPC is handled in the RPC callback(s).
2223 */
2224 int
2225 nfs_buf_write(struct nfsbuf *bp)
2226 {
2227 int error = 0, oldflags, async;
2228 nfsnode_t np;
2229 thread_t thd;
2230 kauth_cred_t cred;
2231 proc_t p = current_proc();
2232 int iomode, doff, dend, firstpg, lastpg;
2233 uint32_t pagemask;
2234
2235 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2236
2237 if (!ISSET(bp->nb_lflags, NBL_BUSY))
2238 panic("nfs_buf_write: buffer is not busy???");
2239
2240 np = bp->nb_np;
2241 async = ISSET(bp->nb_flags, NB_ASYNC);
2242 oldflags = bp->nb_flags;
2243
2244 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
2245 if (ISSET(oldflags, NB_DELWRI)) {
2246 lck_mtx_lock(nfs_buf_mutex);
2247 nfs_nbdwrite--;
2248 NFSBUFCNTCHK();
2249 lck_mtx_unlock(nfs_buf_mutex);
2250 wakeup(&nfs_nbdwrite);
2251 }
2252
2253 /* move to clean list */
2254 if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) {
2255 lck_mtx_lock(nfs_buf_mutex);
2256 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2257 LIST_REMOVE(bp, nb_vnbufs);
2258 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2259 lck_mtx_unlock(nfs_buf_mutex);
2260 }
2261 nfs_node_lock_force(np);
2262 np->n_numoutput++;
2263 nfs_node_unlock(np);
2264 vnode_startwrite(NFSTOV(np));
2265
2266 if (p && p->p_stats)
2267 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
2268
2269 cred = bp->nb_wcred;
2270 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
2271 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2272 if (IS_VALID_CRED(cred))
2273 kauth_cred_ref(cred);
2274 thd = async ? NULL : current_thread();
2275
2276 /* We need to make sure the pages are locked before doing I/O. */
2277 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
2278 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2279 error = nfs_buf_upl_setup(bp);
2280 if (error) {
2281 printf("nfs_buf_write: upl create failed %d\n", error);
2282 SET(bp->nb_flags, NB_ERROR);
2283 bp->nb_error = error = EIO;
2284 nfs_buf_iodone(bp);
2285 goto out;
2286 }
2287 nfs_buf_upl_check(bp);
2288 }
2289 }
2290
2291 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2292 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2293 nfs_buf_check_write_verifier(np, bp);
2294 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2295 struct nfsmount *nmp = NFSTONMP(np);
2296 if (!nmp) {
2297 SET(bp->nb_flags, NB_ERROR);
2298 bp->nb_error = error = EIO;
2299 nfs_buf_iodone(bp);
2300 goto out;
2301 }
2302 SET(bp->nb_flags, NB_WRITEINPROG);
2303 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2304 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
2305 CLR(bp->nb_flags, NB_WRITEINPROG);
2306 if (error) {
2307 if (error != NFSERR_STALEWRITEVERF) {
2308 SET(bp->nb_flags, NB_ERROR);
2309 bp->nb_error = error;
2310 }
2311 nfs_buf_iodone(bp);
2312 goto out;
2313 }
2314 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2315 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2316 nfs_node_lock_force(np);
2317 np->n_needcommitcnt--;
2318 CHECK_NEEDCOMMITCNT(np);
2319 nfs_node_unlock(np);
2320 }
2321 if (!error && (bp->nb_dirtyend > 0)) {
2322 /* sanity check the dirty range */
2323 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2324 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2325 if (bp->nb_dirtyoff >= bp->nb_dirtyend)
2326 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2327 }
2328 }
2329 if (!error && (bp->nb_dirtyend > 0)) {
2330 /* there's a dirty range that needs to be written out */
2331 NFS_BUF_MAP(bp);
2332
2333 doff = bp->nb_dirtyoff;
2334 dend = bp->nb_dirtyend;
2335
2336 /* if doff page is dirty, move doff to start of page */
2337 if (NBPGDIRTY(bp, doff / PAGE_SIZE))
2338 doff -= doff & PAGE_MASK;
2339 /* try to expand write range to include preceding dirty pages */
2340 if (!(doff & PAGE_MASK))
2341 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
2342 doff -= PAGE_SIZE;
2343 /* if dend page is dirty, move dend to start of next page */
2344 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2345 dend = round_page_32(dend);
2346 /* try to expand write range to include trailing dirty pages */
2347 if (!(dend & PAGE_MASK))
2348 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2349 dend += PAGE_SIZE;
2350 /* make sure to keep dend clipped to EOF */
2351 if ((NBOFF(bp) + dend) > (off_t) np->n_size)
2352 dend = np->n_size - NBOFF(bp);
2353 /* calculate range of complete pages being written */
2354 firstpg = round_page_32(doff) / PAGE_SIZE;
2355 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2356 /* calculate mask for that page range */
2357 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2358
2359 /*
2360 * compare page mask to nb_dirty; if there are other dirty pages
2361 * then write FILESYNC; otherwise, write UNSTABLE if async and
2362 * not needcommit/stable; otherwise write FILESYNC
2363 */
2364 if (bp->nb_dirty & ~pagemask)
2365 iomode = NFS_WRITE_FILESYNC;
2366 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
2367 iomode = NFS_WRITE_UNSTABLE;
2368 else
2369 iomode = NFS_WRITE_FILESYNC;
2370
2371 /* write the whole contiguous dirty range */
2372 bp->nb_offio = doff;
2373 bp->nb_endio = dend;
2374
2375 OSAddAtomic(1, &nfsstats.write_bios);
2376
2377 SET(bp->nb_flags, NB_WRITEINPROG);
2378 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2379 /*
2380 * For async I/O, the callbacks will finish up the
2381 * write and push out any dirty pages. Otherwise,
2382 * the write has already been finished and any dirty
2383 * pages pushed out.
2384 */
2385 } else {
2386 if (!error && bp->nb_dirty) /* write out any dirty pages */
2387 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2388 nfs_buf_iodone(bp);
2389 }
2390 /* note: bp is still valid only for !async case */
2391 out:
2392 if (!async) {
2393 error = nfs_buf_iowait(bp);
2394 /* move to clean list */
2395 if (oldflags & NB_DELWRI) {
2396 lck_mtx_lock(nfs_buf_mutex);
2397 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2398 LIST_REMOVE(bp, nb_vnbufs);
2399 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2400 lck_mtx_unlock(nfs_buf_mutex);
2401 }
2402 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2403 nfs_buf_release(bp, 1);
2404 /* check if we need to invalidate (and we can) */
2405 if ((np->n_flag & NNEEDINVALIDATE) &&
2406 !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
2407 int invalidate = 0;
2408 nfs_node_lock_force(np);
2409 if (np->n_flag & NNEEDINVALIDATE) {
2410 invalidate = 1;
2411 np->n_flag &= ~NNEEDINVALIDATE;
2412 }
2413 nfs_node_unlock(np);
2414 if (invalidate) {
2415 /*
2416 * There was a write error and we need to
2417 * invalidate attrs and flush buffers in
2418 * order to sync up with the server.
2419 * (if this write was extending the file,
2420 * we may no longer know the correct size)
2421 *
2422 * But we couldn't call vinvalbuf while holding
2423 * the buffer busy. So we call vinvalbuf() after
2424 * releasing the buffer.
2425 */
2426 nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);
2427 }
2428 }
2429 }
2430
2431 if (IS_VALID_CRED(cred))
2432 kauth_cred_unref(&cred);
2433 return (error);
2434 }
2435
2436 /*
2437 * finish the writing of a buffer
2438 */
2439 void
2440 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2441 {
2442 nfsnode_t np = bp->nb_np;
2443 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2444 int firstpg, lastpg;
2445 uint32_t pagemask;
2446
2447 if ((error == EINTR) || (error == ERESTART)) {
2448 CLR(bp->nb_flags, NB_ERROR);
2449 SET(bp->nb_flags, NB_EINTR);
2450 }
2451
2452 if (!error) {
2453 /* calculate range of complete pages being written */
2454 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2455 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2456 /* calculate mask for that page range written */
2457 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2458 /* clear dirty bits for pages we've written */
2459 bp->nb_dirty &= ~pagemask;
2460 }
2461
2462 /* manage needcommit state */
2463 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2464 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2465 nfs_node_lock_force(np);
2466 np->n_needcommitcnt++;
2467 nfs_node_unlock(np);
2468 SET(bp->nb_flags, NB_NEEDCOMMIT);
2469 }
2470 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2471 bp->nb_dirtyoff = bp->nb_offio;
2472 bp->nb_dirtyend = bp->nb_endio;
2473 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2474 nfs_node_lock_force(np);
2475 np->n_needcommitcnt--;
2476 CHECK_NEEDCOMMITCNT(np);
2477 nfs_node_unlock(np);
2478 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2479 }
2480
2481 CLR(bp->nb_flags, NB_WRITEINPROG);
2482
2483 /*
2484 * For an unstable write, the buffer is still treated as dirty until
2485 * a commit (or stable (re)write) is performed. Buffers needing only
2486 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2487 *
2488 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2489 * because that would cause the buffer to be dropped. The buffer is
2490 * still valid and simply needs to be written again.
2491 */
2492 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2493 CLR(bp->nb_flags, NB_INVAL);
2494 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2495 SET(bp->nb_flags, NB_DELWRI);
2496 lck_mtx_lock(nfs_buf_mutex);
2497 nfs_nbdwrite++;
2498 NFSBUFCNTCHK();
2499 lck_mtx_unlock(nfs_buf_mutex);
2500 }
2501 /*
2502 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2503 * clean list, we have to reassign it back to the dirty one. Ugh.
2504 */
2505 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2506 /* move to dirty list */
2507 lck_mtx_lock(nfs_buf_mutex);
2508 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2509 LIST_REMOVE(bp, nb_vnbufs);
2510 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2511 lck_mtx_unlock(nfs_buf_mutex);
2512 }
2513 } else {
2514 /* either there's an error or we don't need to commit */
2515 if (error) {
2516 /*
2517 * There was a write error and we need to invalidate
2518 * attrs and flush buffers in order to sync up with the
2519 * server. (if this write was extending the file, we
2520 * may no longer know the correct size)
2521 *
2522 * But we can't call vinvalbuf while holding this
2523 * buffer busy. Set a flag to do it after releasing
2524 * the buffer.
2525 */
2526 nfs_node_lock_force(np);
2527 np->n_error = error;
2528 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2529 NATTRINVALIDATE(np);
2530 nfs_node_unlock(np);
2531 }
2532 /* clear the dirty range */
2533 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2534 }
2535
2536 if (!error && bp->nb_dirty)
2537 nfs_buf_write_dirty_pages(bp, thd, cred);
2538 nfs_buf_iodone(bp);
2539 }
2540
2541 /*
2542 * write out any pages marked dirty in a buffer
2543 *
2544 * We do use unstable writes and follow up with a commit.
2545 * If we catch the write verifier changing we'll restart
2546 * do the writes filesync.
2547 */
2548 int
2549 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2550 {
2551 nfsnode_t np = bp->nb_np;
2552 struct nfsmount *nmp = NFSTONMP(np);
2553 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2554 uint32_t dirty = bp->nb_dirty;
2555 uint64_t wverf;
2556 uio_t auio;
2557 char uio_buf [ UIO_SIZEOF(1) ];
2558
2559 if (!bp->nb_dirty)
2560 return (0);
2561
2562 /* there are pages marked dirty that need to be written out */
2563 OSAddAtomic(1, &nfsstats.write_bios);
2564 NFS_BUF_MAP(bp);
2565 SET(bp->nb_flags, NB_WRITEINPROG);
2566 npages = bp->nb_bufsize / PAGE_SIZE;
2567 iomode = NFS_WRITE_UNSTABLE;
2568
2569 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
2570 &uio_buf, sizeof(uio_buf));
2571
2572 again:
2573 dirty = bp->nb_dirty;
2574 wverf = bp->nb_verf;
2575 commit = NFS_WRITE_FILESYNC;
2576 for (pg = 0; pg < npages; pg++) {
2577 if (!NBPGDIRTY(bp, pg))
2578 continue;
2579 count = 1;
2580 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
2581 count++;
2582 /* write count pages starting with page pg */
2583 off = pg * PAGE_SIZE;
2584 len = count * PAGE_SIZE;
2585 /* clip writes to EOF */
2586 if (NBOFF(bp) + off + len > (off_t) np->n_size)
2587 len -= (NBOFF(bp) + off + len) - np->n_size;
2588 if (len > 0) {
2589 iomode2 = iomode;
2590 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2591 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2592 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
2593 if (error)
2594 break;
2595 if (iomode2 < commit) /* Retain the lowest commitment level returned. */
2596 commit = iomode2;
2597 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2598 /* verifier changed, redo all the writes filesync */
2599 iomode = NFS_WRITE_FILESYNC;
2600 goto again;
2601 }
2602 }
2603 /* clear dirty bits */
2604 while (count--) {
2605 dirty &= ~(1 << pg);
2606 if (count) /* leave pg on last page */
2607 pg++;
2608 }
2609 }
2610 CLR(bp->nb_flags, NB_WRITEINPROG);
2611
2612 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2613 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
2614 if (error == NFSERR_STALEWRITEVERF) {
2615 /* verifier changed, so we need to restart all the writes */
2616 iomode = NFS_WRITE_FILESYNC;
2617 goto again;
2618 }
2619 }
2620 if (!error) {
2621 bp->nb_dirty = dirty;
2622 } else {
2623 SET(bp->nb_flags, NB_ERROR);
2624 bp->nb_error = error;
2625 }
2626 return (error);
2627 }
2628
2629 /*
2630 * initiate the NFS WRITE RPC(s) for a buffer
2631 */
2632 int
2633 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2634 {
2635 struct nfsmount *nmp;
2636 nfsnode_t np = bp->nb_np;
2637 int error = 0, nfsvers, async;
2638 int offset, nrpcs;
2639 uint32_t nmwsize, length, len;
2640 struct nfsreq *req;
2641 struct nfsreq_cbinfo cb;
2642 uio_t auio;
2643 char uio_buf [ UIO_SIZEOF(1) ];
2644
2645 nmp = NFSTONMP(np);
2646 if (!nmp) {
2647 bp->nb_error = error = ENXIO;
2648 SET(bp->nb_flags, NB_ERROR);
2649 nfs_buf_iodone(bp);
2650 return (error);
2651 }
2652 nfsvers = nmp->nm_vers;
2653 nmwsize = nmp->nm_wsize;
2654
2655 offset = bp->nb_offio;
2656 length = bp->nb_endio - bp->nb_offio;
2657
2658 /* Note: Can only do async I/O if nfsiods are configured. */
2659 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2660 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2661 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2662 cb.rcb_bp = bp;
2663
2664 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2665 bp->nb_error = error = EFBIG;
2666 SET(bp->nb_flags, NB_ERROR);
2667 nfs_buf_iodone(bp);
2668 return (error);
2669 }
2670
2671 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2672 UIO_WRITE, &uio_buf, sizeof(uio_buf));
2673 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2674
2675 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
2676 if (async && (nrpcs > 1)) {
2677 SET(bp->nb_flags, NB_MULTASYNCRPC);
2678 } else {
2679 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2680 }
2681
2682 while (length > 0) {
2683 if (ISSET(bp->nb_flags, NB_ERROR)) {
2684 error = bp->nb_error;
2685 break;
2686 }
2687 len = (length > nmwsize) ? nmwsize : length;
2688 cb.rcb_args[0] = offset;
2689 cb.rcb_args[1] = len;
2690 if (nmp->nm_vers >= NFS_VER4)
2691 cb.rcb_args[2] = nmp->nm_stategenid;
2692 if (async && ((error = nfs_async_write_start(nmp))))
2693 break;
2694 req = NULL;
2695 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
2696 iomode, &cb, &req);
2697 if (error) {
2698 if (async)
2699 nfs_async_write_done(nmp);
2700 break;
2701 }
2702 offset += len;
2703 length -= len;
2704 if (async)
2705 continue;
2706 nfs_buf_write_rpc_finish(req);
2707 }
2708
2709 if (length > 0) {
2710 /*
2711 * Something bad happened while trying to send the RPCs.
2712 * Wait for any outstanding requests to complete.
2713 */
2714 bp->nb_error = error;
2715 SET(bp->nb_flags, NB_ERROR);
2716 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2717 nrpcs = (length + nmwsize - 1) / nmwsize;
2718 lck_mtx_lock(nfs_buf_mutex);
2719 bp->nb_rpcs -= nrpcs;
2720 if (bp->nb_rpcs == 0) {
2721 /* No RPCs left, so the buffer's done */
2722 lck_mtx_unlock(nfs_buf_mutex);
2723 nfs_buf_write_finish(bp, thd, cred);
2724 } else {
2725 /* wait for the last RPC to mark it done */
2726 while (bp->nb_rpcs > 0)
2727 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2728 "nfs_buf_write_rpc_cancel", NULL);
2729 lck_mtx_unlock(nfs_buf_mutex);
2730 }
2731 } else {
2732 nfs_buf_write_finish(bp, thd, cred);
2733 }
2734 }
2735
2736 return (error);
2737 }
2738
2739 /*
2740 * finish up an NFS WRITE RPC on a buffer
2741 */
2742 void
2743 nfs_buf_write_rpc_finish(struct nfsreq *req)
2744 {
2745 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2746 int committed = NFS_WRITE_FILESYNC;
2747 uint64_t wverf = 0;
2748 size_t rlen;
2749 void *wakeme = NULL;
2750 struct nfsreq_cbinfo cb;
2751 struct nfsreq *wreq = NULL;
2752 struct nfsbuf *bp;
2753 struct nfsmount *nmp;
2754 nfsnode_t np;
2755 thread_t thd;
2756 kauth_cred_t cred;
2757 uio_t auio;
2758 char uio_buf [ UIO_SIZEOF(1) ];
2759
2760 finish:
2761 np = req->r_np;
2762 thd = req->r_thread;
2763 cred = req->r_cred;
2764 if (IS_VALID_CRED(cred))
2765 kauth_cred_ref(cred);
2766 cb = req->r_callback;
2767 bp = cb.rcb_bp;
2768
2769 nmp = NFSTONMP(np);
2770 if (!nmp) {
2771 SET(bp->nb_flags, NB_ERROR);
2772 bp->nb_error = error = ENXIO;
2773 }
2774 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
2775 /* just drop it */
2776 nfs_request_async_cancel(req);
2777 goto out;
2778 }
2779 nfsvers = nmp->nm_vers;
2780
2781 offset = cb.rcb_args[0];
2782 rlen = length = cb.rcb_args[1];
2783
2784 /* finish the RPC */
2785 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
2786 if ((error == EINPROGRESS) && cb.rcb_func) {
2787 /* async request restarted */
2788 if (IS_VALID_CRED(cred))
2789 kauth_cred_unref(&cred);
2790 return;
2791 }
2792 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
2793 lck_mtx_lock(&nmp->nm_lock);
2794 if ((error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) {
2795 printf("nfs_buf_write_rpc_finish: error %d, initiating recovery\n", error);
2796 nmp->nm_state |= NFSSTA_RECOVER;
2797 nfs_mount_sock_thread_wake(nmp);
2798 }
2799 lck_mtx_unlock(&nmp->nm_lock);
2800 if (error == NFSERR_GRACE)
2801 tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz);
2802 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
2803 rlen = 0;
2804 goto writeagain;
2805 }
2806 }
2807 if (error) {
2808 SET(bp->nb_flags, NB_ERROR);
2809 bp->nb_error = error;
2810 }
2811 if (error || (nfsvers == NFS_VER2))
2812 goto out;
2813 if (rlen <= 0) {
2814 SET(bp->nb_flags, NB_ERROR);
2815 bp->nb_error = error = EIO;
2816 goto out;
2817 }
2818
2819 /* save lowest commit level returned */
2820 if (committed < bp->nb_commitlevel)
2821 bp->nb_commitlevel = committed;
2822
2823 /* check the write verifier */
2824 if (!bp->nb_verf) {
2825 bp->nb_verf = wverf;
2826 } else if (bp->nb_verf != wverf) {
2827 /* verifier changed, so buffer will need to be rewritten */
2828 bp->nb_flags |= NB_STALEWVERF;
2829 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
2830 bp->nb_verf = wverf;
2831 }
2832
2833 /*
2834 * check for a short write
2835 *
2836 * If the server didn't write all the data, then we
2837 * need to issue another write for the rest of it.
2838 * (Don't bother if the buffer hit an error or stale wverf.)
2839 */
2840 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
2841 writeagain:
2842 offset += rlen;
2843 length -= rlen;
2844
2845 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
2846 UIO_WRITE, &uio_buf, sizeof(uio_buf));
2847 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2848
2849 cb.rcb_args[0] = offset;
2850 cb.rcb_args[1] = length;
2851 if (nmp->nm_vers >= NFS_VER4)
2852 cb.rcb_args[2] = nmp->nm_stategenid;
2853
2854 // XXX iomode should really match the original request
2855 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
2856 NFS_WRITE_FILESYNC, &cb, &wreq);
2857 if (!error) {
2858 if (IS_VALID_CRED(cred))
2859 kauth_cred_unref(&cred);
2860 if (!cb.rcb_func) {
2861 /* if !async we'll need to wait for this RPC to finish */
2862 req = wreq;
2863 wreq = NULL;
2864 goto finish;
2865 }
2866 /*
2867 * We're done here.
2868 * Outstanding RPC count is unchanged.
2869 * Callback will be called when RPC is done.
2870 */
2871 return;
2872 }
2873 SET(bp->nb_flags, NB_ERROR);
2874 bp->nb_error = error;
2875 }
2876
2877 out:
2878 if (cb.rcb_func)
2879 nfs_async_write_done(nmp);
2880 /*
2881 * Decrement outstanding RPC count on buffer
2882 * and call nfs_buf_write_finish on last RPC.
2883 *
2884 * (Note: when there are multiple async RPCs issued for a
2885 * buffer we need nfs_buffer_mutex to avoid problems when
2886 * aborting a partially-initiated set of RPCs)
2887 */
2888 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
2889 if (multasyncrpc)
2890 lck_mtx_lock(nfs_buf_mutex);
2891
2892 bp->nb_rpcs--;
2893 finished = (bp->nb_rpcs == 0);
2894
2895 if (multasyncrpc)
2896 lck_mtx_unlock(nfs_buf_mutex);
2897
2898 if (finished) {
2899 if (multasyncrpc)
2900 wakeme = &bp->nb_rpcs;
2901 nfs_buf_write_finish(bp, thd, cred);
2902 if (wakeme)
2903 wakeup(wakeme);
2904 }
2905
2906 if (IS_VALID_CRED(cred))
2907 kauth_cred_unref(&cred);
2908 }
2909
2910 /*
2911 * Send commit(s) for the given node's "needcommit" buffers
 *
 * Gathers the node's dirty buffers that are marked both NB_DELWRI and
 * NB_NEEDCOMMIT (and that can be acquired without waiting) onto a private
 * list ordered by logical block number, sets up their UPLs, and then
 * commits them: with a single COMMIT covering the whole byte range if all
 * of the buffers share the same write cred, otherwise one COMMIT per buffer.
 * Afterwards each buffer is either completed and moved to the clean list
 * (commit succeeded) or put back on the dirty list (commit failed).
 *
 * If "nowait" is set, NBI_NOWAIT is passed to nfs_buf_iterprepare().
 *
 * Returns ENXIO if the node has no mount, EINVAL for an NFSv2 mount,
 * ENOBUFS if no buffers were gathered, or an error from nfs_node_lock().
 * NOTE(review): the commit RPC status (retv) is never copied into the
 * return value, and "error" still holds the result of the last
 * nfs_buf_acquire() call when the gather loop ends - confirm that callers
 * expect this.
2912 */
2913 int
2914 nfs_flushcommits(nfsnode_t np, int nowait)
2915 {
2916 struct nfsmount *nmp;
2917 struct nfsbuf *bp, *prevlbp, *lbp;
2918 struct nfsbuflists blist, commitlist;
2919 int error = 0, retv, wcred_set, flags, dirty;
2920 u_quad_t off, endoff, toff;
2921 u_int32_t count;
2922 kauth_cred_t wcred = NULL;
2923
2924 FSDBG_TOP(557, np, 0, 0, 0);
2925
2926 /*
2927 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
2928 * server, but has not been committed to stable storage on the server
2929 * yet. The byte range is worked out for as many nfsbufs as we can handle
2930 * and the commit rpc is done.
2931 */
2932 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
2933 error = nfs_node_lock(np);
2934 if (error)
2935 goto done;
2936 np->n_flag |= NMODIFIED;
2937 nfs_node_unlock(np);
2938 }
2939
/* start with an empty commit range: off at the maximum value, endoff at 0 */
2940 off = (u_quad_t)-1;
2941 endoff = 0;
/* wcred_set: 0 = no cred seen yet, 1 = all buffers share wcred, -1 = creds differ */
2942 wcred_set = 0;
2943 LIST_INIT(&commitlist);
2944
2945 nmp = NFSTONMP(np);
2946 if (!nmp) {
2947 error = ENXIO;
2948 goto done;
2949 }
2950 if (nmp->nm_vers == NFS_VER2) {
2951 error = EINVAL;
2952 goto done;
2953 }
2954
2955 flags = NBI_DIRTY;
2956 if (nowait)
2957 flags |= NBI_NOWAIT;
2958 lck_mtx_lock(nfs_buf_mutex);
2959 if (!nfs_buf_iterprepare(np, &blist, flags)) {
2960 while ((bp = LIST_FIRST(&blist))) {
/* put the buffer back on the node's dirty list before trying to acquire it */
2961 LIST_REMOVE(bp, nb_vnbufs);
2962 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2963 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
2964 if (error)
2965 continue;
2966 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2967 nfs_buf_check_write_verifier(np, bp);
/* only buffers that are both delayed-write and needcommit are of interest */
2968 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT))
2969 != (NB_DELWRI | NB_NEEDCOMMIT))) {
2970 nfs_buf_drop(bp);
2971 continue;
2972 }
2973 nfs_buf_remfree(bp);
2974
2975 /* buffer UPLs will be grabbed *in order* below */
2976
2977 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
2978 FSDBG(557, bp->nb_validoff, bp->nb_validend,
2979 bp->nb_dirtyoff, bp->nb_dirtyend);
2980
2981 /*
2982 * Work out if all buffers are using the same cred
2983 * so we can deal with them all with one commit.
2984 *
2985 * Note: creds in bp's must be obtained by kauth_cred_ref
2986 * on the same original cred in order for them to be equal.
2987 */
2988 if (wcred_set == 0) {
2989 wcred = bp->nb_wcred;
2990 if (!IS_VALID_CRED(wcred))
2991 panic("nfs: needcommit w/out wcred");
2992 wcred_set = 1;
2993 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
2994 wcred_set = -1;
2995 }
2996 SET(bp->nb_flags, NB_WRITEINPROG);
2997
2998 /*
2999 * Add this buffer to the list of buffers we are committing.
3000 * Buffers are inserted into the list in ascending order so that
3001 * we can take the UPLs in order after the list is complete.
3002 */
3003 prevlbp = NULL;
3004 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
3005 if (bp->nb_lblkno < lbp->nb_lblkno)
3006 break;
3007 prevlbp = lbp;
3008 }
3009 LIST_REMOVE(bp, nb_vnbufs);
3010 if (prevlbp)
3011 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
3012 else
3013 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3014
3015 /* update commit range start, end */
3016 toff = NBOFF(bp) + bp->nb_dirtyoff;
3017 if (toff < off)
3018 off = toff;
3019 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3020 if (toff > endoff)
3021 endoff = toff;
3022 }
3023 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3024 }
3025 lck_mtx_unlock(nfs_buf_mutex);
3026
3027 if (LIST_EMPTY(&commitlist)) {
3028 error = ENOBUFS;
3029 goto done;
3030 }
3031
3032 /*
3033 * We need a UPL to prevent others from accessing the buffers during
3034 * our commit RPC(s).
3035 *
3036 * We used to also check for dirty pages here; if there were any we'd
3037 * abort the commit and force the entire buffer to be written again.
3038 * Instead of doing that, we just go ahead and commit the dirty range,
3039 * and then leave the buffer around with dirty pages that will be
3040 * written out later.
3041 */
3042 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3043 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3044 retv = nfs_buf_upl_setup(bp);
3045 if (retv) {
3046 /* Unable to create the UPL, the VM object probably no longer exists. */
3047 printf("nfs_flushcommits: upl create failed %d\n", retv);
3048 bp->nb_valid = bp->nb_dirty = 0;
3049 }
3050 }
3051 nfs_buf_upl_check(bp);
3052 }
3053
3054 /*
3055 * Commit data on the server, as required.
3056 * If all bufs are using the same wcred, then use that with
3057 * one call for all of them, otherwise commit each one
3058 * separately.
3059 */
3060 if (wcred_set == 1) {
3061 /*
3062 * Note, it's possible the commit range could be >2^32-1.
3063 * If it is, we'll send one commit that covers the whole file.
3064 */
3065 if ((endoff - off) > 0xffffffff)
3066 count = 0;
3067 else
3068 count = (endoff - off);
3069 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred);
3070 } else {
3071 retv = 0;
/* stop at the first failing commit; retv then applies to every buffer below */
3072 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3073 toff = NBOFF(bp) + bp->nb_dirtyoff;
3074 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3075 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred);
3076 if (retv)
3077 break;
3078 }
3079 }
3080
3081 /*
3082 * Now, either mark the blocks I/O done or mark the
3083 * blocks dirty, depending on whether the commit
3084 * succeeded.
3085 */
3086 while ((bp = LIST_FIRST(&commitlist))) {
3087 LIST_REMOVE(bp, nb_vnbufs);
3088 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
/* NB_NEEDCOMMIT is cleared and the count dropped whether or not the commit succeeded */
3089 nfs_node_lock_force(np);
3090 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3091 np->n_needcommitcnt--;
3092 CHECK_NEEDCOMMITCNT(np);
3093 nfs_node_unlock(np);
3094
3095 if (retv) {
3096 /* move back to dirty list */
3097 lck_mtx_lock(nfs_buf_mutex);
3098 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3099 lck_mtx_unlock(nfs_buf_mutex);
3100 nfs_buf_release(bp, 1);
3101 continue;
3102 }
3103
/* commit succeeded: count the buffer as an output in progress, then complete it below */
3104 nfs_node_lock_force(np);
3105 np->n_numoutput++;
3106 nfs_node_unlock(np);
3107 vnode_startwrite(NFSTOV(np));
3108 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3109 lck_mtx_lock(nfs_buf_mutex);
3110 nfs_nbdwrite--;
3111 NFSBUFCNTCHK();
3112 lck_mtx_unlock(nfs_buf_mutex);
3113 wakeup(&nfs_nbdwrite);
3114 }
3115 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
3116 /* if block still has dirty pages, we don't want it to */
3117 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3118 if (!(dirty = bp->nb_dirty))
3119 SET(bp->nb_flags, NB_ASYNC);
3120 else
3121 CLR(bp->nb_flags, NB_ASYNC);
3122
3123 /* move to clean list */
3124 lck_mtx_lock(nfs_buf_mutex);
3125 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3126 lck_mtx_unlock(nfs_buf_mutex);
3127
3128 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3129
3130 nfs_buf_iodone(bp);
3131 if (dirty) {
3132 /* throw it back in as a delayed write buffer */
3133 CLR(bp->nb_flags, NB_DONE);
3134 nfs_buf_write_delayed(bp);
3135 }
3136 }
3137
3138 done:
3139 FSDBG_BOT(557, np, 0, 0, error);
3140 return (error);
3141 }
3142
3143 /*
3144 * Flush all the blocks associated with a vnode.
3145 * Walk through the buffer pool and push any dirty pages
3146 * associated with the vnode.
3147 */
3148 int
3149 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3150 {
3151 struct nfsbuf *bp;
3152 struct nfsbuflists blist;
3153 struct nfsmount *nmp = NFSTONMP(np);
3154 int error = 0, error2, slptimeo = 0, slpflag = 0;
3155 int nfsvers, flags, passone = 1;
3156
3157 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3158
3159 if (!nmp) {
3160 error = ENXIO;
3161 goto out;
3162 }
3163 nfsvers = nmp->nm_vers;
3164 if (nmp->nm_flag & NFSMNT_INT)
3165 slpflag = PCATCH;
3166
3167 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3168 nfs_node_lock_force(np);
3169 np->n_flag |= NMODIFIED;
3170 nfs_node_unlock(np);
3171 }
3172
3173 lck_mtx_lock(nfs_buf_mutex);
3174 while (np->n_bflag & NBFLUSHINPROG) {
3175 np->n_bflag |= NBFLUSHWANT;
3176 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3177 if ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0))) {
3178 lck_mtx_unlock(nfs_buf_mutex);
3179 goto out;
3180 }
3181 }
3182 np->n_bflag |= NBFLUSHINPROG;
3183
3184 /*
3185 * On the first pass, start async/unstable writes on all
3186 * delayed write buffers. Then wait for all writes to complete
3187 * and call nfs_flushcommits() to commit any uncommitted buffers.
3188 * On all subsequent passes, start STABLE writes on any remaining
3189 * dirty buffers. Then wait for all writes to complete.
3190 */
3191 again:
3192 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3193 if (!NFSTONMP(np)) {
3194 lck_mtx_unlock(nfs_buf_mutex);
3195 error = ENXIO;
3196 goto done;
3197 }
3198
3199 /* Start/do any write(s) that are required. */
3200 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3201 while ((bp = LIST_FIRST(&blist))) {
3202 LIST_REMOVE(bp, nb_vnbufs);
3203 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3204 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
3205 if (flags != NBAC_NOWAIT)
3206 nfs_buf_refget(bp);
3207 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3208 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3209 if (error == EBUSY)
3210 break;
3211 if (error) {
3212 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3213 if (error2) {
3214 if (flags != NBAC_NOWAIT)
3215 nfs_buf_refrele(bp);
3216 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3217 lck_mtx_unlock(nfs_buf_mutex);
3218 error = error2;
3219 goto done;
3220 }
3221 if (slpflag == PCATCH) {
3222 slpflag = 0;
3223 slptimeo = 2 * hz;
3224 }
3225 }
3226 }
3227 if (flags != NBAC_NOWAIT)
3228 nfs_buf_refrele(bp);
3229 if (error == EBUSY)
3230 continue;
3231 if (!bp->nb_np) {
3232 /* buffer is no longer valid */
3233 nfs_buf_drop(bp);
3234 continue;
3235 }
3236 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3237 nfs_buf_check_write_verifier(np, bp);
3238 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3239 /* buffer is no longer dirty */
3240 nfs_buf_drop(bp);
3241 continue;
3242 }
3243 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3244 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
3245 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3246 nfs_buf_drop(bp);
3247 continue;
3248 }
3249 nfs_buf_remfree(bp);
3250 lck_mtx_unlock(nfs_buf_mutex);
3251 if (ISSET(bp->nb_flags, NB_ERROR)) {
3252 nfs_node_lock_force(np);
3253 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3254 np->n_flag |= NWRITEERR;
3255 nfs_node_unlock(np);
3256 nfs_buf_release(bp, 1);
3257 lck_mtx_lock(nfs_buf_mutex);
3258 continue;
3259 }
3260 SET(bp->nb_flags, NB_ASYNC);
3261 if (!passone) {
3262 /* NB_STABLE forces this to be written FILESYNC */
3263 SET(bp->nb_flags, NB_STABLE);
3264 }
3265 nfs_buf_write(bp);
3266 lck_mtx_lock(nfs_buf_mutex);
3267 }
3268 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3269 }
3270 lck_mtx_unlock(nfs_buf_mutex);
3271
3272 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3273 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3274 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3275 if (error2) {
3276 error = error2;
3277 goto done;
3278 }
3279 if (slpflag == PCATCH) {
3280 slpflag = 0;
3281 slptimeo = 2 * hz;
3282 }
3283 }
3284 }
3285
3286 if (nfsvers != NFS_VER2) {
3287 /* loop while it looks like there are still buffers to be */
3288 /* commited and nfs_flushcommits() seems to be handling them. */
3289 while (np->n_needcommitcnt)
3290 if (nfs_flushcommits(np, 0))
3291 break;
3292 }
3293
3294 if (passone) {
3295 passone = 0;
3296 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3297 nfs_node_lock_force(np);
3298 np->n_flag |= NMODIFIED;
3299 nfs_node_unlock(np);
3300 }
3301 lck_mtx_lock(nfs_buf_mutex);
3302 goto again;
3303 }
3304
3305 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
3306 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3307 nfs_node_lock_force(np);
3308 np->n_flag |= NMODIFIED;
3309 nfs_node_unlock(np);
3310 }
3311 lck_mtx_lock(nfs_buf_mutex);
3312 if (!LIST_EMPTY(&np->n_dirtyblkhd))
3313 goto again;
3314 lck_mtx_unlock(nfs_buf_mutex);
3315 nfs_node_lock_force(np);
3316 /*
3317 * OK, it looks like there are no dirty blocks. If we have no
3318 * writes in flight and no one in the write code, we can clear
3319 * the modified flag. In order to make sure we see the latest
3320 * attributes and size, we also invalidate the attributes and
3321 * advance the attribute cache XID to guarantee that attributes
3322 * newer than our clearing of NMODIFIED will get loaded next.
3323 * (If we don't do this, it's possible for the flush's final
3324 * write/commit (xid1) to be executed in parallel with a subsequent
3325 * getattr request (xid2). The getattr could return attributes
3326 * from *before* the write/commit completed but the stale attributes
3327 * would be preferred because of the xid ordering.)
3328 */
3329 if (!np->n_wrbusy && !np->n_numoutput) {
3330 np->n_flag &= ~NMODIFIED;
3331 NATTRINVALIDATE(np);
3332 nfs_get_xid(&np->n_xid);
3333 }
3334 } else {
3335 nfs_node_lock_force(np);
3336 }
3337
3338 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3339 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3340 error = np->n_error;
3341 np->n_flag &= ~NWRITEERR;
3342 }
3343 nfs_node_unlock(np);
3344 done:
3345 lck_mtx_lock(nfs_buf_mutex);
3346 flags = np->n_bflag;
3347 np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT);
3348 lck_mtx_unlock(nfs_buf_mutex);
3349 if (flags & NBFLUSHWANT)
3350 wakeup(&np->n_bflag);
3351 out:
3352 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3353 return (error);
3354 }
3355
3356 /*
3357 * Flush out and invalidate all buffers associated with a vnode.
3358 * Called with the underlying object locked.
3359 */
3360 int
3361 nfs_vinvalbuf_internal(
3362 nfsnode_t np,
3363 int flags,
3364 thread_t thd,
3365 kauth_cred_t cred,
3366 int slpflag,
3367 int slptimeo)
3368 {
3369 struct nfsbuf *bp;
3370 struct nfsbuflists blist;
3371 int list, error = 0;
3372
3373 if (flags & V_SAVE) {
3374 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR))))
3375 return (error);
3376 }
3377
3378 lck_mtx_lock(nfs_buf_mutex);
3379 for (;;) {
3380 list = NBI_CLEAN;
3381 if (nfs_buf_iterprepare(np, &blist, list)) {
3382 list = NBI_DIRTY;
3383 if (nfs_buf_iterprepare(np, &blist, list))
3384 break;
3385 }
3386 while ((bp = LIST_FIRST(&blist))) {
3387 LIST_REMOVE(bp, nb_vnbufs);
3388 if (list == NBI_CLEAN)
3389 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3390 else
3391 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3392 nfs_buf_refget(bp);
3393 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3394 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3395 if (error != EAGAIN) {
3396 FSDBG(554, np, bp, -1, error);
3397 nfs_buf_refrele(bp);
3398 nfs_buf_itercomplete(np, &blist, list);
3399 lck_mtx_unlock(nfs_buf_mutex);
3400 return (error);
3401 }
3402 }
3403 nfs_buf_refrele(bp);
3404 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3405 lck_mtx_unlock(nfs_buf_mutex);
3406 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3407 (NBOFF(bp) < (off_t)np->n_size)) {
3408 /* extra paranoia: make sure we're not */
3409 /* somehow leaving any dirty data around */
3410 int mustwrite = 0;
3411 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3412 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3413 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3414 error = nfs_buf_upl_setup(bp);
3415 if (error == EINVAL) {
3416 /* vm object must no longer exist */
3417 /* hopefully we don't need to do */
3418 /* anything for this buffer */
3419 } else if (error)
3420 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3421 bp->nb_valid = bp->nb_dirty = 0;
3422 }
3423 nfs_buf_upl_check(bp);
3424 /* check for any dirty data before the EOF */
3425 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3426 /* clip dirty range to EOF */
3427 if (bp->nb_dirtyend > end) {
3428 bp->nb_dirtyend = end;
3429 if (bp->nb_dirtyoff >= bp->nb_dirtyend)
3430 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3431 }
3432 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end))
3433 mustwrite++;
3434 }
3435 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
3436 if (bp->nb_dirty)
3437 mustwrite++;
3438 /* also make sure we'll have a credential to do the write */
3439 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3440 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3441 mustwrite = 0;
3442 }
3443 if (mustwrite) {
3444 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3445 if (!ISSET(bp->nb_flags, NB_PAGELIST))
3446 panic("nfs_vinvalbuf: dirty buffer without upl");
3447 /* gotta write out dirty data before invalidating */
3448 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3449 /* (NB_NOCACHE indicates buffer should be discarded) */
3450 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3451 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3452 if (!IS_VALID_CRED(bp->nb_wcred)) {
3453 kauth_cred_ref(cred);
3454 bp->nb_wcred = cred;
3455 }
3456 error = nfs_buf_write(bp);
3457 // Note: bp has been released
3458 if (error) {
3459 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3460 nfs_node_lock_force(np);
3461 np->n_error = error;
3462 np->n_flag |= NWRITEERR;
3463 /*
3464 * There was a write error and we need to
3465 * invalidate attrs to sync with server.
3466 * (if this write was extending the file,
3467 * we may no longer know the correct size)
3468 */
3469 NATTRINVALIDATE(np);
3470 nfs_node_unlock(np);
3471 if (error == EINTR) {
3472 /*
3473 * Abort on EINTR. If we don't, we could
3474 * be stuck in this loop forever because
3475 * the buffer will continue to stay dirty.
3476 */
3477 lck_mtx_lock(nfs_buf_mutex);
3478 nfs_buf_itercomplete(np, &blist, list);
3479 lck_mtx_unlock(nfs_buf_mutex);
3480 return (error);
3481 }
3482 error = 0;
3483 }
3484 lck_mtx_lock(nfs_buf_mutex);
3485 continue;
3486 }
3487 }
3488 SET(bp->nb_flags, NB_INVAL);
3489 // hold off on FREEUPs until we're done here
3490 nfs_buf_release(bp, 0);
3491 lck_mtx_lock(nfs_buf_mutex);
3492 }
3493 nfs_buf_itercomplete(np, &blist, list);
3494 }
3495 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
3496 panic("nfs_vinvalbuf: flush/inval failed");
3497 lck_mtx_unlock(nfs_buf_mutex);
3498 nfs_node_lock_force(np);
3499 if (!(flags & V_SAVE))
3500 np->n_flag &= ~NMODIFIED;
3501 if (vnode_vtype(NFSTOV(np)) == VREG)
3502 np->n_lastrahead = -1;
3503 nfs_node_unlock(np);
3504 NFS_BUF_FREEUP();
3505 return (0);
3506 }
3507
3508
3509 /*
3510 * Flush and invalidate all dirty buffers. If another process is already
3511 * doing the flush, just wait for completion.
3512 */
3513 int
3514 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3515 {
3516 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3517 }
3518
3519 int
3520 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3521 {
3522 nfsnode_t np = VTONFS(vp);
3523 struct nfsmount *nmp = VTONMP(vp);
3524 int error, rv, slpflag, slptimeo, nflags;
3525 off_t size;
3526
3527 FSDBG_TOP(554, np, flags, intrflg, 0);
3528
3529 if (nmp && !(nmp->nm_flag & NFSMNT_INT))
3530 intrflg = 0;
3531 if (intrflg) {
3532 slpflag = PCATCH;
3533 slptimeo = 2 * hz;
3534 } else {
3535 slpflag = 0;
3536 slptimeo = 0;
3537 }
3538
3539 /* First wait for any other process doing a flush to complete. */
3540 lck_mtx_lock(nfs_buf_mutex);
3541 while (np->n_bflag & NBINVALINPROG) {
3542 np->n_bflag |= NBINVALWANT;
3543 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
3544 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
3545 lck_mtx_unlock(nfs_buf_mutex);
3546 return (error);
3547 }
3548 }
3549 np->n_bflag |= NBINVALINPROG;
3550 lck_mtx_unlock(nfs_buf_mutex);
3551
3552 /* Now, flush as required. */
3553 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3554 while (error) {
3555 FSDBG(554, np, 0, 0, error);
3556 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
3557 goto done;
3558 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3559 }
3560
3561 /* get the pages out of vm also */
3562 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
3563 if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
3564 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
3565 done:
3566 lck_mtx_lock(nfs_buf_mutex);
3567 nflags = np->n_bflag;
3568 np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
3569 lck_mtx_unlock(nfs_buf_mutex);
3570 if (nflags & NBINVALWANT)
3571 wakeup(&np->n_bflag);
3572
3573 FSDBG_BOT(554, np, flags, intrflg, error);
3574 return (error);
3575 }
3576
3577 /*
3578 * Add an async I/O request to the mount's async I/O queue and make
3579 * sure that an nfsiod will service it.
3580 */
3581 void
3582 nfs_asyncio_finish(struct nfsreq *req)
3583 {
3584 struct nfsmount *nmp;
3585 struct nfsiod *niod;
3586 int started = 0;
3587
3588 FSDBG_TOP(552, nmp, 0, 0, 0);
3589 again:
3590 if (((nmp = req->r_nmp)) == NULL)
3591 return;
3592 lck_mtx_lock(nfsiod_mutex);
3593 niod = nmp->nm_niod;
3594
3595 /* grab an nfsiod if we don't have one already */
3596 if (!niod) {
3597 niod = TAILQ_FIRST(&nfsiodfree);
3598 if (niod) {
3599 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
3600 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
3601 niod->niod_nmp = nmp;
3602 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
3603 /*
3604 * Try starting a new thread.
3605 * We may try a couple times if other callers
3606 * get the new threads before we do.
3607 */
3608 lck_mtx_unlock(nfsiod_mutex);
3609 started++;
3610 if (!nfsiod_start())
3611 goto again;
3612 lck_mtx_lock(nfsiod_mutex);
3613 }
3614 }
3615
3616 if (req->r_achain.tqe_next == NFSREQNOLIST)
3617 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
3618
3619 /* If this mount doesn't already have an nfsiod working on it... */
3620 if (!nmp->nm_niod) {
3621 if (niod) { /* give it the nfsiod we just grabbed */
3622 nmp->nm_niod = niod;
3623 lck_mtx_unlock(nfsiod_mutex);
3624 wakeup(niod);
3625 } else if (nfsiod_thread_count > 0) {
3626 /* just queue it up on nfsiod mounts queue */
3627 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
3628 lck_mtx_unlock(nfsiod_mutex);
3629 } else {
3630 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
3631 lck_mtx_unlock(nfsiod_mutex);
3632 /* we have no other option but to be persistent */
3633 started = 0;
3634 goto again;
3635 }
3636 } else {
3637 lck_mtx_unlock(nfsiod_mutex);
3638 }
3639
3640 FSDBG_BOT(552, nmp, 0, 0, 0);
3641 }
3642
3643 /*
3644 * queue up async I/O request for resend
3645 */
3646 void
3647 nfs_asyncio_resend(struct nfsreq *req)
3648 {
3649 struct nfsmount *nmp = req->r_nmp;
3650
3651 if (!nmp)
3652 return;
3653 nfs_gss_clnt_rpcdone(req);
3654 lck_mtx_lock(&nmp->nm_lock);
3655 if (!(req->r_flags & R_RESENDQ)) {
3656 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
3657 req->r_flags |= R_RESENDQ;
3658 }
3659 nfs_mount_sock_thread_wake(nmp);
3660 lck_mtx_unlock(&nmp->nm_lock);
3661 }
3662
3663 /*
3664 * Read directory data into a buffer.
3665 *
3666 * Buffer will be filled (unless EOF is hit).
3667 * Buffers after this one may also be completely/partially filled.
3668 */
3669 int
3670 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
3671 {
3672 nfsnode_t np = bp->nb_np;
3673 struct nfsmount *nmp = NFSTONMP(np);
3674 int error = 0;
3675
3676 if (!nmp)
3677 return (ENXIO);
3678
3679 if (nmp->nm_vers < NFS_VER4)
3680 error = nfs3_readdir_rpc(np, bp, ctx);
3681 else
3682 error = nfs4_readdir_rpc(np, bp, ctx);
3683
3684 if (error && (error != NFSERR_DIRBUFDROPPED)) {
3685 SET(bp->nb_flags, NB_ERROR);
3686 bp->nb_error = error;
3687 }
3688 return (error);
3689 }