apple/xnu.git (xnu-1228.3.13) - bsd/nfs/nfs_bio.c
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
66 */
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/resourcevar.h>
70 #include <sys/signalvar.h>
71 #include <sys/proc_internal.h>
72 #include <sys/kauth.h>
73 #include <sys/malloc.h>
74 #include <sys/vnode.h>
75 #include <sys/dirent.h>
76 #include <sys/mount_internal.h>
77 #include <sys/kernel.h>
78 #include <sys/ubc_internal.h>
79 #include <sys/uio_internal.h>
80
81 #include <sys/vm.h>
82 #include <sys/vmparam.h>
83
84 #include <sys/time.h>
85 #include <kern/clock.h>
86 #include <libkern/OSAtomic.h>
87 #include <kern/kalloc.h>
88 #include <kern/thread_call.h>
89
90 #include <nfs/rpcv2.h>
91 #include <nfs/nfsproto.h>
92 #include <nfs/nfs.h>
93 #include <nfs/nfs_gss.h>
94 #include <nfs/nfsmount.h>
95 #include <nfs/nfsnode.h>
96 #include <sys/buf_internal.h>
97 #include <libkern/OSAtomic.h>
98
99 kern_return_t thread_terminate(thread_t); /* XXX */
100
101 #define NFSBUFHASH(np, lbn) \
102 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
103 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
104 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
105 u_long nfsbufhash;
106 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
107 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
108 int nfs_nbdwrite;
109 int nfs_buf_timer_on = 0;
110 thread_t nfsbufdelwrithd = NULL;
111
112 lck_grp_t *nfs_buf_lck_grp;
113 lck_mtx_t *nfs_buf_mutex;
114
115 #define NFSBUF_FREE_PERIOD 30 /* seconds */
116 #define NFSBUF_LRU_STALE 120
117 #define NFSBUF_META_STALE 240
118
119 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
120 #define LRU_TO_FREEUP 6
121 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
122 #define META_TO_FREEUP 3
123 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
124 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
125 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
126 #define LRU_FREEUP_FRAC_ON_TIMER 8
127 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
128 #define META_FREEUP_FRAC_ON_TIMER 16
129 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
130 #define LRU_FREEUP_MIN_FRAC 4
131 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
132 #define META_FREEUP_MIN_FRAC 2
133
134 #define NFS_BUF_FREEUP() \
135 do { \
136 /* only call nfs_buf_freeup() if it has work to do: */ \
137 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
138 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
139 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
140 nfs_buf_freeup(0); \
141 } while (0)
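/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * A minimal standalone restatement of the NFS_BUF_FREEUP() predicate above,
 * using plain parameters in place of the kernel globals (nfsbufcnt, nfsbufmin,
 * nfsbuffreecnt, nfsbuffreemetacnt): a freeup pass is only worth doing when the
 * free data bufs exceed 1/4 of the pool or the free meta bufs exceed 1/2, and
 * freeing TOTAL_TO_FREEUP bufs would still leave the pool above the minimum.
 */
#include <stdio.h>
#include <stdbool.h>

#define SK_LRU_TO_FREEUP	6
#define SK_META_TO_FREEUP	3
#define SK_TOTAL_TO_FREEUP	(SK_LRU_TO_FREEUP + SK_META_TO_FREEUP)
#define SK_LRU_FREEUP_MIN_FRAC	4
#define SK_META_FREEUP_MIN_FRAC	2

static bool
freeup_has_work(int bufcnt, int bufmin, int freecnt, int freemetacnt)
{
	return (((freecnt > bufcnt / SK_LRU_FREEUP_MIN_FRAC) ||
	    (freemetacnt > bufcnt / SK_META_FREEUP_MIN_FRAC)) &&
	    ((bufcnt - SK_TOTAL_TO_FREEUP) > bufmin));
}

int
main(void)
{
	/* plenty of free data bufs and room above the minimum: worth a freeup pass */
	printf("%d\n", freeup_has_work(1024, 128, 300, 10));	/* 1 */
	/* pool already sitting near the minimum: skip the freeup pass */
	printf("%d\n", freeup_has_work(130, 128, 100, 10));	/* 0 */
	return 0;
}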
142
143 /*
144 * Initialize nfsbuf lists
145 */
146 void
147 nfs_nbinit(void)
148 {
149 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
150 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
151
152 nfsbufcnt = nfsbufmetacnt =
153 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
154 nfsbufmin = 128;
155 /* size nfsbufmax to cover at most half sane_size (w/default buf size) */
156 nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
157 nfsbufmetamax = nfsbufmax / 4;
158 nfsneedbuffer = 0;
159 nfs_nbdwrite = 0;
160
161 nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
162 TAILQ_INIT(&nfsbuffree);
163 TAILQ_INIT(&nfsbuffreemeta);
164 TAILQ_INIT(&nfsbufdelwri);
165
166 }
167
168 /*
169 * Check periodically for stale/unused nfs bufs
170 */
171 void
172 nfs_buf_timer(__unused void *param0, __unused void *param1)
173 {
174 nfs_buf_freeup(1);
175
176 lck_mtx_lock(nfs_buf_mutex);
177 if (nfsbufcnt <= nfsbufmin) {
178 nfs_buf_timer_on = 0;
179 lck_mtx_unlock(nfs_buf_mutex);
180 return;
181 }
182 lck_mtx_unlock(nfs_buf_mutex);
183
184 nfs_interval_timer_start(nfs_buf_timer_call,
185 NFSBUF_FREE_PERIOD * 1000);
186 }
187
188 /*
189 * try to free up some excess, unused nfsbufs
190 */
191 void
192 nfs_buf_freeup(int timer)
193 {
194 struct nfsbuf *fbp;
195 struct timeval now;
196 int count;
197 struct nfsbuffreehead nfsbuffreeup;
198
199 TAILQ_INIT(&nfsbuffreeup);
200
201 lck_mtx_lock(nfs_buf_mutex);
202
203 microuptime(&now);
204
205 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
206
207 count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
208 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
209 fbp = TAILQ_FIRST(&nfsbuffree);
210 if (!fbp)
211 break;
212 if (fbp->nb_refs)
213 break;
214 if (NBUFSTAMPVALID(fbp) &&
215 (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
216 break;
217 nfs_buf_remfree(fbp);
218 /* disassociate buffer from any nfsnode */
219 if (fbp->nb_np) {
220 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
221 LIST_REMOVE(fbp, nb_vnbufs);
222 fbp->nb_vnbufs.le_next = NFSNOLIST;
223 }
224 fbp->nb_np = NULL;
225 }
226 LIST_REMOVE(fbp, nb_hash);
227 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
228 nfsbufcnt--;
229 }
230
231 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
232 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
233 fbp = TAILQ_FIRST(&nfsbuffreemeta);
234 if (!fbp)
235 break;
236 if (fbp->nb_refs)
237 break;
238 if (NBUFSTAMPVALID(fbp) &&
239 (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
240 break;
241 nfs_buf_remfree(fbp);
242 /* disassociate buffer from any nfsnode */
243 if (fbp->nb_np) {
244 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
245 LIST_REMOVE(fbp, nb_vnbufs);
246 fbp->nb_vnbufs.le_next = NFSNOLIST;
247 }
248 fbp->nb_np = NULL;
249 }
250 LIST_REMOVE(fbp, nb_hash);
251 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
252 nfsbufcnt--;
253 nfsbufmetacnt--;
254 }
255
256 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
257 NFSBUFCNTCHK();
258
259 lck_mtx_unlock(nfs_buf_mutex);
260
261 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
262 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
263 /* nuke any creds */
264 if (IS_VALID_CRED(fbp->nb_rcred))
265 kauth_cred_unref(&fbp->nb_rcred);
266 if (IS_VALID_CRED(fbp->nb_wcred))
267 kauth_cred_unref(&fbp->nb_wcred);
268 /* if buf was NB_META, dump buffer */
269 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
270 kfree(fbp->nb_data, fbp->nb_bufsize);
271 FREE(fbp, M_TEMP);
272 }
273
274 }
275
276 /*
277 * remove a buffer from the freelist
278 * (must be called with nfs_buf_mutex held)
279 */
280 void
281 nfs_buf_remfree(struct nfsbuf *bp)
282 {
283 if (bp->nb_free.tqe_next == NFSNOLIST)
284 panic("nfsbuf not on free list");
285 if (ISSET(bp->nb_flags, NB_DELWRI)) {
286 nfsbufdelwricnt--;
287 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
288 } else if (ISSET(bp->nb_flags, NB_META)) {
289 nfsbuffreemetacnt--;
290 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
291 } else {
292 nfsbuffreecnt--;
293 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
294 }
295 bp->nb_free.tqe_next = NFSNOLIST;
296 NFSBUFCNTCHK();
297 }
298
299 /*
300 * check for existence of nfsbuf in cache
301 */
302 boolean_t
303 nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
304 {
305 boolean_t rv;
306 lck_mtx_lock(nfs_buf_mutex);
307 if (nfs_buf_incore(np, blkno))
308 rv = TRUE;
309 else
310 rv = FALSE;
311 lck_mtx_unlock(nfs_buf_mutex);
312 return (rv);
313 }
314
315 /*
316 * return incore buffer (must be called with nfs_buf_mutex held)
317 */
318 struct nfsbuf *
319 nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
320 {
321 /* Search hash chain */
322 struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
323 for (; bp != NULL; bp = bp->nb_hash.le_next)
324 if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
325 if (!ISSET(bp->nb_flags, NB_INVAL)) {
326 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
327 return (bp);
328 }
329 }
330 return (NULL);
331 }
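/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * A standalone model of the NFSBUFHASH()/nfs_buf_incore() scheme above: the
 * bucket index mixes the node pointer (scaled by the node size) with the
 * logical block number and masks it with a power-of-two-minus-one mask, the
 * kind hashinit() produces above.  All names here are invented for the sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_HASH_SIZE 64
#define SKETCH_HASH_MASK (SKETCH_HASH_SIZE - 1)

struct sketch_node { char pad[256]; };	/* stand-in for the nfsnode */

static unsigned
sketch_bufhash(struct sketch_node *np, int64_t lbn)
{
	return (unsigned)(((uintptr_t)np / sizeof(*np) + (uintptr_t)lbn) & SKETCH_HASH_MASK);
}

int
main(void)
{
	struct sketch_node n1, n2;

	/* the same (node, block) pair always lands in the same bucket */
	printf("n1/blk7 -> bucket %u\n", sketch_bufhash(&n1, 7));
	printf("n1/blk7 -> bucket %u\n", sketch_bufhash(&n1, 7));
	/* a different node usually lands elsewhere */
	printf("n2/blk7 -> bucket %u\n", sketch_bufhash(&n2, 7));
	return 0;
}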
332
333 /*
334 * Check if it's OK to drop a page.
335 *
336 * Called by vnode_pager() on pageout request of non-dirty page.
337 * We need to make sure that it's not part of a delayed write.
338 * If it is, we can't let the VM drop it because we may need it
339 * later when/if we need to write the data (again).
340 */
341 int
342 nfs_buf_page_inval(vnode_t vp, off_t offset)
343 {
344 struct nfsmount *nmp = VTONMP(vp);
345 struct nfsbuf *bp;
346 int error = 0;
347
348 if (!nmp)
349 return (ENXIO);
350
351 lck_mtx_lock(nfs_buf_mutex);
352 bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
353 if (!bp)
354 goto out;
355 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
356 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
357 error = EBUSY;
358 goto out;
359 }
360 /*
361 * If there's a dirty range in the buffer, check to
362 * see if this page intersects with the dirty range.
363 * If it does, we can't let the pager drop the page.
364 */
365 if (bp->nb_dirtyend > 0) {
366 int start = offset - NBOFF(bp);
367 if (bp->nb_dirtyend <= start ||
368 bp->nb_dirtyoff >= (start + PAGE_SIZE))
369 error = 0;
370 else
371 error = EBUSY;
372 }
373 out:
374 lck_mtx_unlock(nfs_buf_mutex);
375 return (error);
376 }
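/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * The interval test in nfs_buf_page_inval() above, read in isolation: a page
 * starting at `start` (relative to the buffer) may be dropped only if it lies
 * entirely before nb_dirtyoff or entirely at/after nb_dirtyend.  A standalone
 * version, assuming 4K pages purely for the example:
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096

/* returns 1 if the page intersects the dirty byte range [dirtyoff, dirtyend) */
static int
page_hits_dirty_range(int start, int dirtyoff, int dirtyend)
{
	if (dirtyend <= 0)
		return 0;			/* no dirty range at all */
	if (dirtyend <= start || dirtyoff >= (start + SKETCH_PAGE_SIZE))
		return 0;			/* page is wholly outside the dirty range */
	return 1;
}

int
main(void)
{
	printf("%d\n", page_hits_dirty_range(0, 100, 200));	/* 1: dirty bytes live in page 0 */
	printf("%d\n", page_hits_dirty_range(4096, 100, 200));	/* 0: dirty range ends before page 1 */
	return 0;
}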
377
378 /*
379 * set up the UPL for a buffer
380 * (must NOT be called with nfs_buf_mutex held)
381 */
382 int
383 nfs_buf_upl_setup(struct nfsbuf *bp)
384 {
385 kern_return_t kret;
386 upl_t upl;
387 int upl_flags;
388
389 if (ISSET(bp->nb_flags, NB_PAGELIST))
390 return (0);
391
392 upl_flags = UPL_PRECIOUS;
393 if (!ISSET(bp->nb_flags, NB_READ)) {
394 /*
395 * We're doing a "write", so we intend to modify
396 * the pages we're gathering.
397 */
398 upl_flags |= UPL_WILL_MODIFY;
399 }
400 kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
401 &upl, NULL, upl_flags);
402 if (kret == KERN_INVALID_ARGUMENT) {
403 /* vm object probably doesn't exist any more */
404 bp->nb_pagelist = NULL;
405 return (EINVAL);
406 }
407 if (kret != KERN_SUCCESS) {
408 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
409 bp->nb_pagelist = NULL;
410 return (EIO);
411 }
412
413 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
414
415 bp->nb_pagelist = upl;
416 SET(bp->nb_flags, NB_PAGELIST);
417 return (0);
418 }
419
420 /*
421 * update buffer's valid/dirty info from UBC
422 * (must NOT be called with nfs_buf_mutex held)
423 */
424 void
425 nfs_buf_upl_check(struct nfsbuf *bp)
426 {
427 upl_page_info_t *pl;
428 off_t filesize, fileoffset;
429 int i, npages;
430
431 if (!ISSET(bp->nb_flags, NB_PAGELIST))
432 return;
433
434 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
435 filesize = ubc_getsize(NFSTOV(bp->nb_np));
436 fileoffset = NBOFF(bp);
437 if (fileoffset < filesize)
438 SET(bp->nb_flags, NB_CACHE);
439 else
440 CLR(bp->nb_flags, NB_CACHE);
441
442 pl = ubc_upl_pageinfo(bp->nb_pagelist);
443 bp->nb_valid = bp->nb_dirty = 0;
444
445 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
446 /* anything beyond the end of the file is not valid or dirty */
447 if (fileoffset >= filesize)
448 break;
449 if (!upl_valid_page(pl, i)) {
450 CLR(bp->nb_flags, NB_CACHE);
451 continue;
452 }
453 NBPGVALID_SET(bp,i);
454 if (upl_dirty_page(pl, i))
455 NBPGDIRTY_SET(bp, i);
456 }
457 fileoffset = NBOFF(bp);
458 if (ISSET(bp->nb_flags, NB_CACHE)) {
459 bp->nb_validoff = 0;
460 bp->nb_validend = bp->nb_bufsize;
461 if (fileoffset + bp->nb_validend > filesize)
462 bp->nb_validend = filesize - fileoffset;
463 } else {
464 bp->nb_validoff = bp->nb_validend = -1;
465 }
466 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
467 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
468 }
469
470 /*
471 * make sure that a buffer is mapped
472 * (must NOT be called with nfs_buf_mutex held)
473 */
474 int
475 nfs_buf_map(struct nfsbuf *bp)
476 {
477 kern_return_t kret;
478
479 if (bp->nb_data)
480 return (0);
481 if (!ISSET(bp->nb_flags, NB_PAGELIST))
482 return (EINVAL);
483
484 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
485 if (kret != KERN_SUCCESS)
486 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
487 if (bp->nb_data == 0)
488 panic("ubc_upl_map mapped 0");
489 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
490 return (0);
491 }
492
493 /*
494 * normalize an nfsbuf's valid range
495 *
496 * the read/write code guarantees that we'll always have a valid
497 * region that is an integral number of pages. If either end
498 * of the valid range isn't page-aligned, it gets corrected
499 * here as we extend the valid range through all of the
500 * contiguous valid pages.
501 */
502 void
503 nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
504 {
505 int pg, npg;
506 /* pull validoff back to start of contiguous valid page range */
507 pg = bp->nb_validoff/PAGE_SIZE;
508 while (pg >= 0 && NBPGVALID(bp,pg))
509 pg--;
510 bp->nb_validoff = (pg+1) * PAGE_SIZE;
511 /* push validend forward to end of contiguous valid page range */
512 npg = bp->nb_bufsize/PAGE_SIZE;
513 pg = bp->nb_validend/PAGE_SIZE;
514 while (pg < npg && NBPGVALID(bp,pg))
515 pg++;
516 bp->nb_validend = pg * PAGE_SIZE;
517 /* clip to EOF */
518 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
519 bp->nb_validend = np->n_size % bp->nb_bufsize;
520 }
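/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * A standalone restatement of the normalization above: validoff is pulled back
 * and validend pushed forward across contiguous valid pages (tracked here as a
 * simple bitmask), then validend is clipped to the file size.  Assumes 4K
 * pages and an 8-page (32K) buffer purely for the example.
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>

#define PG	4096
#define BUFSZ	(8 * PG)
#define PGVALID(mask, p)	(((mask) >> (p)) & 1)

static void
normalize(uint32_t validmask, off_t bufoff, off_t filesize, int *validoff, int *validend)
{
	int pg, npg = BUFSZ / PG;

	/* pull validoff back to the start of the contiguous valid page run */
	pg = *validoff / PG;
	while (pg >= 0 && PGVALID(validmask, pg))
		pg--;
	*validoff = (pg + 1) * PG;

	/* push validend forward to the end of the contiguous valid page run */
	pg = *validend / PG;
	while (pg < npg && PGVALID(validmask, pg))
		pg++;
	*validend = pg * PG;

	/* clip to EOF, as the n_size check above does */
	if (bufoff + *validend > filesize)
		*validend = filesize % BUFSZ;
}

int
main(void)
{
	int off = 2 * PG + 100, end = 3 * PG;	/* unaligned range inside valid pages 1-4 */
	normalize(0x1e /* pages 1-4 valid */, 0, 1024 * 1024, &off, &end);
	printf("validoff=%d validend=%d\n", off, end);	/* 4096 .. 20480 */
	return 0;
}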
521
522 /*
523 * process some entries on the delayed write queue
524 * (must be called with nfs_buf_mutex held)
525 */
526 static void
527 nfs_buf_delwri_service(void)
528 {
529 struct nfsbuf *bp;
530 nfsnode_t np;
531 int error, i = 0;
532
533 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
534 np = bp->nb_np;
535 nfs_buf_remfree(bp);
536 nfs_buf_refget(bp);
537 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
538 nfs_buf_refrele(bp);
539 if (error)
540 break;
541 if (!bp->nb_np) {
542 /* buffer is no longer valid */
543 nfs_buf_drop(bp);
544 continue;
545 }
546 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
547 nfs_buf_check_write_verifier(np, bp);
548 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
549 /* put buffer at end of delwri list */
550 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
551 nfsbufdelwricnt++;
552 nfs_buf_drop(bp);
553 lck_mtx_unlock(nfs_buf_mutex);
554 nfs_flushcommits(np, 1);
555 } else {
556 SET(bp->nb_flags, NB_ASYNC);
557 lck_mtx_unlock(nfs_buf_mutex);
558 nfs_buf_write(bp);
559 }
560 i++;
561 lck_mtx_lock(nfs_buf_mutex);
562 }
563 }
564
565 /*
566 * thread to service the delayed write queue when asked
567 */
568 static void
569 nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
570 {
571 struct timespec ts = { 30, 0 };
572 int error = 0;
573
574 lck_mtx_lock(nfs_buf_mutex);
575 while (!error) {
576 nfs_buf_delwri_service();
577 error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
578 }
579 nfsbufdelwrithd = NULL;
580 lck_mtx_unlock(nfs_buf_mutex);
581 thread_terminate(nfsbufdelwrithd);
582 }
583
584 /*
585 * try to push out some delayed/uncommitted writes
586 * ("locked" indicates whether nfs_buf_mutex is already held)
587 */
588 static void
589 nfs_buf_delwri_push(int locked)
590 {
591 if (TAILQ_EMPTY(&nfsbufdelwri))
592 return;
593 if (!locked)
594 lck_mtx_lock(nfs_buf_mutex);
595 /* wake up the delayed write service thread */
596 if (nfsbufdelwrithd)
597 wakeup(&nfsbufdelwrithd);
598 else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS)
599 thread_deallocate(nfsbufdelwrithd);
600 /* otherwise, try to do some of the work ourselves */
601 if (!nfsbufdelwrithd)
602 nfs_buf_delwri_service();
603 if (!locked)
604 lck_mtx_unlock(nfs_buf_mutex);
605 }
606
607 /*
608 * Get an nfs buffer.
609 *
610 * Returns errno on error, 0 otherwise.
611 * Any buffer is returned in *bpp.
612 *
613 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
614 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
615 *
616 * Check for existence of buffer in cache.
617 * Or attempt to reuse a buffer from one of the free lists.
618 * Or allocate a new buffer if we haven't already hit max allocation.
619 * Or wait for a free buffer.
620 *
621 * If available buffer found, prepare it, and return it.
622 *
623 * If the calling process is interrupted by a signal for
624 * an interruptible mount point, return EINTR.
625 */
626 int
627 nfs_buf_get(
628 nfsnode_t np,
629 daddr64_t blkno,
630 int size,
631 thread_t thd,
632 int flags,
633 struct nfsbuf **bpp)
634 {
635 vnode_t vp = NFSTOV(np);
636 struct nfsmount *nmp = VTONMP(vp);
637 struct nfsbuf *bp;
638 int bufsize;
639 int slpflag = PCATCH;
640 int operation = (flags & NBLK_OPMASK);
641 int error = 0;
642 struct timespec ts;
643
644 FSDBG_TOP(541, np, blkno, size, flags);
645 *bpp = NULL;
646
647 bufsize = size;
648 if (bufsize > NFS_MAXBSIZE)
649 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
650
651 if (!nmp) {
652 FSDBG_BOT(541, np, blkno, 0, ENXIO);
653 return (ENXIO);
654 }
655
656 if (!UBCINFOEXISTS(vp)) {
657 operation = NBLK_META;
658 } else if (bufsize < nmp->nm_biosize) {
659 /* reg files should always have biosize blocks */
660 bufsize = nmp->nm_biosize;
661 }
662
663 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
664 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
665 FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
666
667 /* poke the delwri list */
668 nfs_buf_delwri_push(0);
669
670 /* sleep to let other threads run... */
671 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
672 FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
673 }
674
675 loop:
676 lck_mtx_lock(nfs_buf_mutex);
677
678 /* check for existence of nfsbuf in cache */
679 if ((bp = nfs_buf_incore(np, blkno))) {
680 /* if busy, set wanted and wait */
681 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
682 if (flags & NBLK_NOWAIT) {
683 lck_mtx_unlock(nfs_buf_mutex);
684 FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
685 return (0);
686 }
687 FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
688 SET(bp->nb_lflags, NBL_WANTED);
689
690 ts.tv_sec = 2;
691 ts.tv_nsec = 0;
692 msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
693 "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
694 slpflag = 0;
695 FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
696 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
697 FSDBG_BOT(541, np, blkno, 0, error);
698 return (error);
699 }
700 goto loop;
701 }
702 if (bp->nb_bufsize != bufsize)
703 panic("nfsbuf size mismatch");
704 SET(bp->nb_lflags, NBL_BUSY);
705 SET(bp->nb_flags, NB_CACHE);
706 nfs_buf_remfree(bp);
707 /* additional paranoia: */
708 if (ISSET(bp->nb_flags, NB_PAGELIST))
709 panic("pagelist buffer was not busy");
710 goto buffer_setup;
711 }
712
713 if (flags & NBLK_ONLYVALID) {
714 lck_mtx_unlock(nfs_buf_mutex);
715 FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
716 return (0);
717 }
718
719 /*
720 * where to get a free buffer:
721 * - if meta and maxmeta reached, must reuse meta
722 * - alloc new if we haven't reached min bufs
723 * - if free lists are NOT empty
724 * - if free list is stale, use it
725 * - else if freemeta list is stale, use it
726 * - else if max bufs allocated, use least-time-to-stale
727 * - alloc new if we haven't reached max allowed
728 * - start clearing out delwri list and try again
729 */
730
731 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
732 /* if we've hit max meta buffers, must reuse a meta buffer */
733 bp = TAILQ_FIRST(&nfsbuffreemeta);
734 } else if ((nfsbufcnt > nfsbufmin) &&
735 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
736 /* try to pull an nfsbuf off a free list */
737 struct nfsbuf *lrubp, *metabp;
738 struct timeval now;
739 microuptime(&now);
740
741 /* if the next LRU or META buffer is invalid or stale, use it */
742 lrubp = TAILQ_FIRST(&nfsbuffree);
743 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
744 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
745 bp = lrubp;
746 metabp = TAILQ_FIRST(&nfsbuffreemeta);
747 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
748 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
749 bp = metabp;
750
751 if (!bp && (nfsbufcnt >= nfsbufmax)) {
752 /* we've already allocated all bufs, so */
753 /* choose the buffer that'll go stale first */
754 if (!metabp)
755 bp = lrubp;
756 else if (!lrubp)
757 bp = metabp;
758 else {
759 int32_t lru_stale_time, meta_stale_time;
760 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
761 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
762 if (lru_stale_time <= meta_stale_time)
763 bp = lrubp;
764 else
765 bp = metabp;
766 }
767 }
768 }
769
770 if (bp) {
771 /* we have a buffer to reuse */
772 FSDBG(544, np, blkno, bp, bp->nb_flags);
773 nfs_buf_remfree(bp);
774 if (ISSET(bp->nb_flags, NB_DELWRI))
775 panic("nfs_buf_get: delwri");
776 SET(bp->nb_lflags, NBL_BUSY);
777 /* disassociate buffer from previous nfsnode */
778 if (bp->nb_np) {
779 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
780 LIST_REMOVE(bp, nb_vnbufs);
781 bp->nb_vnbufs.le_next = NFSNOLIST;
782 }
783 bp->nb_np = NULL;
784 }
785 LIST_REMOVE(bp, nb_hash);
786 /* nuke any creds we're holding */
787 if (IS_VALID_CRED(bp->nb_rcred))
788 kauth_cred_unref(&bp->nb_rcred);
789 if (IS_VALID_CRED(bp->nb_wcred))
790 kauth_cred_unref(&bp->nb_wcred);
791 /* if buf will no longer be NB_META, dump old buffer */
792 if (operation == NBLK_META) {
793 if (!ISSET(bp->nb_flags, NB_META))
794 nfsbufmetacnt++;
795 } else if (ISSET(bp->nb_flags, NB_META)) {
796 if (bp->nb_data) {
797 kfree(bp->nb_data, bp->nb_bufsize);
798 bp->nb_data = NULL;
799 }
800 nfsbufmetacnt--;
801 }
802 /* re-init buf fields */
803 bp->nb_error = 0;
804 bp->nb_validoff = bp->nb_validend = -1;
805 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
806 bp->nb_valid = 0;
807 bp->nb_dirty = 0;
808 bp->nb_verf = 0;
809 } else {
810 /* no buffer to reuse */
811 if ((nfsbufcnt < nfsbufmax) &&
812 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
813 /* just alloc a new one */
814 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
815 if (!bp) {
816 lck_mtx_unlock(nfs_buf_mutex);
817 FSDBG_BOT(541, np, blkno, 0, error);
818 return (ENOMEM);
819 }
820 nfsbufcnt++;
821
822 /*
823 * If any excess bufs, make sure the timer
824 * is running to free them up later.
825 */
826 if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
827 nfs_buf_timer_on = 1;
828 nfs_interval_timer_start(nfs_buf_timer_call,
829 NFSBUF_FREE_PERIOD * 1000);
830 }
831
832 if (operation == NBLK_META)
833 nfsbufmetacnt++;
834 NFSBUFCNTCHK();
835 /* init nfsbuf */
836 bzero(bp, sizeof(*bp));
837 bp->nb_free.tqe_next = NFSNOLIST;
838 bp->nb_validoff = bp->nb_validend = -1;
839 FSDBG(545, np, blkno, bp, 0);
840 } else {
841 /* too many bufs... wait for buffers to free up */
842 FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
843
844 /* poke the delwri list */
845 nfs_buf_delwri_push(1);
846
847 nfsneedbuffer = 1;
848 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL);
849 FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
850 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
851 FSDBG_BOT(541, np, blkno, 0, error);
852 return (error);
853 }
854 goto loop;
855 }
856 }
857
858 /* setup nfsbuf */
859 bp->nb_lflags = NBL_BUSY;
860 bp->nb_flags = 0;
861 bp->nb_lblkno = blkno;
862 /* insert buf in hash */
863 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
864 /* associate buffer with new nfsnode */
865 bp->nb_np = np;
866 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
867
868 buffer_setup:
869
870 /* unlock hash */
871 lck_mtx_unlock(nfs_buf_mutex);
872
873 switch (operation) {
874 case NBLK_META:
875 SET(bp->nb_flags, NB_META);
876 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
877 kfree(bp->nb_data, bp->nb_bufsize);
878 bp->nb_data = NULL;
879 bp->nb_validoff = bp->nb_validend = -1;
880 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
881 bp->nb_valid = 0;
882 bp->nb_dirty = 0;
883 CLR(bp->nb_flags, NB_CACHE);
884 }
885 if (!bp->nb_data)
886 bp->nb_data = kalloc(bufsize);
887 if (!bp->nb_data) {
888 /* Ack! couldn't allocate the data buffer! */
889 /* clean up buffer and return error */
890 lck_mtx_lock(nfs_buf_mutex);
891 LIST_REMOVE(bp, nb_vnbufs);
892 bp->nb_vnbufs.le_next = NFSNOLIST;
893 bp->nb_np = NULL;
894 /* invalidate usage timestamp to allow immediate freeing */
895 NBUFSTAMPINVALIDATE(bp);
896 if (bp->nb_free.tqe_next != NFSNOLIST)
897 panic("nfsbuf on freelist");
898 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
899 nfsbuffreecnt++;
900 lck_mtx_unlock(nfs_buf_mutex);
901 FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
902 return (ENOMEM);
903 }
904 bp->nb_bufsize = bufsize;
905 break;
906
907 case NBLK_READ:
908 case NBLK_WRITE:
909 /*
910 * Set or clear NB_READ now to let the UPL subsystem know
911 * if we intend to modify the pages or not.
912 */
913 if (operation == NBLK_READ) {
914 SET(bp->nb_flags, NB_READ);
915 } else {
916 CLR(bp->nb_flags, NB_READ);
917 }
918 if (bufsize < PAGE_SIZE)
919 bufsize = PAGE_SIZE;
920 bp->nb_bufsize = bufsize;
921 bp->nb_validoff = bp->nb_validend = -1;
922
923 if (UBCINFOEXISTS(vp)) {
924 /* set up upl */
925 if (nfs_buf_upl_setup(bp)) {
926 /* unable to create upl */
927 /* vm object must no longer exist */
928 /* clean up buffer and return error */
929 lck_mtx_lock(nfs_buf_mutex);
930 LIST_REMOVE(bp, nb_vnbufs);
931 bp->nb_vnbufs.le_next = NFSNOLIST;
932 bp->nb_np = NULL;
933 /* invalidate usage timestamp to allow immediate freeing */
934 NBUFSTAMPINVALIDATE(bp);
935 if (bp->nb_free.tqe_next != NFSNOLIST)
936 panic("nfsbuf on freelist");
937 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
938 nfsbuffreecnt++;
939 lck_mtx_unlock(nfs_buf_mutex);
940 FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
941 return (EIO);
942 }
943 nfs_buf_upl_check(bp);
944 }
945 break;
946
947 default:
948 panic("nfs_buf_get: %d unknown operation", operation);
949 }
950
951 *bpp = bp;
952
953 FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
954
955 return (0);
956 }
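/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * The free-buffer reuse decision in nfs_buf_get() above boils down to a
 * staleness comparison: a list head is reusable once its timestamp plus the
 * list's stale interval has passed, and when neither head is stale yet but the
 * pool is already at nfsbufmax, the one that will go stale first is taken.
 * A standalone restatement, assuming both list heads exist:
 */
#include <stdio.h>
#include <time.h>

#define SKETCH_LRU_STALE	120	/* mirrors NFSBUF_LRU_STALE */
#define SKETCH_META_STALE	240	/* mirrors NFSBUF_META_STALE */

/* returns 'l' if the LRU head should be reused, 'm' for the META head */
static char
pick_reuse(time_t now, time_t lru_ts, time_t meta_ts)
{
	if (lru_ts + SKETCH_LRU_STALE < now)
		return 'l';			/* LRU head already stale */
	if (meta_ts + SKETCH_META_STALE < now)
		return 'm';			/* META head already stale */
	/* neither is stale: take whichever goes stale first */
	return (lru_ts + SKETCH_LRU_STALE <= meta_ts + SKETCH_META_STALE) ? 'l' : 'm';
}

int
main(void)
{
	time_t now = 1000;
	printf("%c\n", pick_reuse(now, 850, 990));	/* l: LRU head stale (850+120 < 1000) */
	printf("%c\n", pick_reuse(now, 950, 800));	/* m: META stales at 1040, LRU at 1070 */
	return 0;
}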
957
958 void
959 nfs_buf_release(struct nfsbuf *bp, int freeup)
960 {
961 nfsnode_t np = bp->nb_np;
962 vnode_t vp;
963 struct timeval now;
964 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
965
966 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
967 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
968 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
969
970 vp = np ? NFSTOV(np) : NULL;
971 if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
972 int upl_flags;
973 upl_t upl;
974 int i, rv;
975
976 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
977 rv = nfs_buf_upl_setup(bp);
978 if (rv)
979 printf("nfs_buf_release: upl create failed %d\n", rv);
980 else
981 nfs_buf_upl_check(bp);
982 }
983 upl = bp->nb_pagelist;
984 if (!upl)
985 goto pagelist_cleanup_done;
986 if (bp->nb_data) {
987 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
988 panic("ubc_upl_unmap failed");
989 bp->nb_data = NULL;
990 }
991 /*
992 * Abort the pages on error or: if this is an invalid or
993 * non-needcommit nocache buffer AND no pages are dirty.
994 */
995 if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
996 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
997 if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE)))
998 upl_flags = UPL_ABORT_DUMP_PAGES;
999 else
1000 upl_flags = 0;
1001 ubc_upl_abort(upl, upl_flags);
1002 goto pagelist_cleanup_done;
1003 }
1004 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
1005 if (!NBPGVALID(bp,i))
1006 ubc_upl_abort_range(upl,
1007 i*PAGE_SIZE, PAGE_SIZE,
1008 UPL_ABORT_DUMP_PAGES |
1009 UPL_ABORT_FREE_ON_EMPTY);
1010 else {
1011 if (NBPGDIRTY(bp,i))
1012 upl_flags = UPL_COMMIT_SET_DIRTY;
1013 else
1014 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1015 ubc_upl_commit_range(upl,
1016 i*PAGE_SIZE, PAGE_SIZE,
1017 upl_flags |
1018 UPL_COMMIT_INACTIVATE |
1019 UPL_COMMIT_FREE_ON_EMPTY);
1020 }
1021 }
1022 pagelist_cleanup_done:
1023 /* was this the last buffer in the file? */
1024 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
1025 /* if so, invalidate all pages of last buffer past EOF */
1026 off_t start, end;
1027 start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
1028 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1029 if (end > start) {
1030 if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
1031 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1032 }
1033 }
1034 CLR(bp->nb_flags, NB_PAGELIST);
1035 bp->nb_pagelist = NULL;
1036 }
1037
1038 lck_mtx_lock(nfs_buf_mutex);
1039
1040 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1041
1042 /* Wake up any processes waiting for any buffer to become free. */
1043 if (nfsneedbuffer) {
1044 nfsneedbuffer = 0;
1045 wakeup_needbuffer = 1;
1046 }
1047 /* Wake up any processes waiting for _this_ buffer to become free. */
1048 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1049 CLR(bp->nb_lflags, NBL_WANTED);
1050 wakeup_buffer = 1;
1051 }
1052
1053 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1054 if (ISSET(bp->nb_flags, NB_ERROR) ||
1055 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
1056 SET(bp->nb_flags, NB_INVAL);
1057
1058 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1059 /* If it's invalid or empty, dissociate it from its nfsnode */
1060 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1061 LIST_REMOVE(bp, nb_vnbufs);
1062 bp->nb_vnbufs.le_next = NFSNOLIST;
1063 }
1064 bp->nb_np = NULL;
1065 /* if this was a delayed write, wakeup anyone */
1066 /* waiting for delayed writes to complete */
1067 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1068 CLR(bp->nb_flags, NB_DELWRI);
1069 nfs_nbdwrite--;
1070 NFSBUFCNTCHK();
1071 wakeup_nbdwrite = 1;
1072 }
1073 /* invalidate usage timestamp to allow immediate freeing */
1074 NBUFSTAMPINVALIDATE(bp);
1075 /* put buffer at head of free list */
1076 if (bp->nb_free.tqe_next != NFSNOLIST)
1077 panic("nfsbuf on freelist");
1078 SET(bp->nb_flags, NB_INVAL);
1079 if (ISSET(bp->nb_flags, NB_META)) {
1080 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1081 nfsbuffreemetacnt++;
1082 } else {
1083 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1084 nfsbuffreecnt++;
1085 }
1086 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1087 /* put buffer at end of delwri list */
1088 if (bp->nb_free.tqe_next != NFSNOLIST)
1089 panic("nfsbuf on freelist");
1090 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1091 nfsbufdelwricnt++;
1092 freeup = 0;
1093 } else {
1094 /* update usage timestamp */
1095 microuptime(&now);
1096 bp->nb_timestamp = now.tv_sec;
1097 /* put buffer at end of free list */
1098 if (bp->nb_free.tqe_next != NFSNOLIST)
1099 panic("nfsbuf on freelist");
1100 if (ISSET(bp->nb_flags, NB_META)) {
1101 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1102 nfsbuffreemetacnt++;
1103 } else {
1104 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1105 nfsbuffreecnt++;
1106 }
1107 }
1108
1109 NFSBUFCNTCHK();
1110
1111 /* Unlock the buffer. */
1112 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1113 CLR(bp->nb_lflags, NBL_BUSY);
1114
1115 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1116
1117 lck_mtx_unlock(nfs_buf_mutex);
1118
1119 if (wakeup_needbuffer)
1120 wakeup(&nfsneedbuffer);
1121 if (wakeup_buffer)
1122 wakeup(bp);
1123 if (wakeup_nbdwrite)
1124 wakeup(&nfs_nbdwrite);
1125 if (freeup)
1126 NFS_BUF_FREEUP();
1127 }
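/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * The queueing at the end of nfs_buf_release() above follows a simple rule:
 * invalid or empty buffers go to the head of their free list so they are
 * reclaimed first, delayed-write buffers go to the tail of the delwri list,
 * and everything else is timestamped and goes to the tail of its free list in
 * LRU order.  A standalone restatement of just that decision:
 */
#include <stdio.h>

enum placement { FREE_HEAD, FREEMETA_HEAD, DELWRI_TAIL, FREE_TAIL, FREEMETA_TAIL };

static enum placement
release_placement(int invalid_or_empty, int delwri, int meta)
{
	if (invalid_or_empty)
		return meta ? FREEMETA_HEAD : FREE_HEAD;	/* reclaim as soon as possible */
	if (delwri)
		return DELWRI_TAIL;				/* still has data to write */
	return meta ? FREEMETA_TAIL : FREE_TAIL;		/* reusable, kept in LRU order */
}

int
main(void)
{
	printf("%d\n", release_placement(1, 0, 0));	/* 0: FREE_HEAD */
	printf("%d\n", release_placement(0, 1, 0));	/* 2: DELWRI_TAIL */
	printf("%d\n", release_placement(0, 0, 1));	/* 4: FREEMETA_TAIL */
	return 0;
}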
1128
1129 /*
1130 * Wait for operations on the buffer to complete.
1131 * When they do, extract and return the I/O's error value.
1132 */
1133 int
1134 nfs_buf_iowait(struct nfsbuf *bp)
1135 {
1136 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1137
1138 lck_mtx_lock(nfs_buf_mutex);
1139
1140 while (!ISSET(bp->nb_flags, NB_DONE))
1141 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
1142
1143 lck_mtx_unlock(nfs_buf_mutex);
1144
1145 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1146
1147 /* check for interruption of I/O, then errors. */
1148 if (ISSET(bp->nb_flags, NB_EINTR)) {
1149 CLR(bp->nb_flags, NB_EINTR);
1150 return (EINTR);
1151 } else if (ISSET(bp->nb_flags, NB_ERROR))
1152 return (bp->nb_error ? bp->nb_error : EIO);
1153 return (0);
1154 }
1155
1156 /*
1157 * Mark I/O complete on a buffer.
1158 */
1159 void
1160 nfs_buf_iodone(struct nfsbuf *bp)
1161 {
1162
1163 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1164
1165 if (ISSET(bp->nb_flags, NB_DONE))
1166 panic("nfs_buf_iodone already");
1167
1168 if (!ISSET(bp->nb_flags, NB_READ)) {
1169 CLR(bp->nb_flags, NB_WRITEINPROG);
1170 /*
1171 * vnode_writedone() takes care of waking up
1172 * any throttled write operations
1173 */
1174 vnode_writedone(NFSTOV(bp->nb_np));
1175 }
1176 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1177 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1178 nfs_buf_release(bp, 1);
1179 } else { /* or just wakeup the buffer */
1180 lck_mtx_lock(nfs_buf_mutex);
1181 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1182 CLR(bp->nb_lflags, NBL_WANTED);
1183 lck_mtx_unlock(nfs_buf_mutex);
1184 wakeup(bp);
1185 }
1186
1187 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1188 }
1189
1190 void
1191 nfs_buf_write_delayed(struct nfsbuf *bp)
1192 {
1193 nfsnode_t np = bp->nb_np;
1194
1195 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1196 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1197
1198 /*
1199 * If the block hasn't been seen before:
1200 * (1) Mark it as having been seen,
1201 * (2) Make sure it's on its node's correct block list,
1202 */
1203 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1204 SET(bp->nb_flags, NB_DELWRI);
1205 /* move to dirty list */
1206 lck_mtx_lock(nfs_buf_mutex);
1207 nfs_nbdwrite++;
1208 NFSBUFCNTCHK();
1209 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1210 LIST_REMOVE(bp, nb_vnbufs);
1211 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
1212 lck_mtx_unlock(nfs_buf_mutex);
1213 }
1214
1215 /*
1216 * If the vnode has "too many" write operations in progress
1217 * wait for them to finish the IO
1218 */
1219 vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1220
1221 /* the file is in a modified state, so make sure the flag's set */
1222 nfs_lock(np, NFS_NODE_LOCK_FORCE);
1223 np->n_flag |= NMODIFIED;
1224 nfs_unlock(np);
1225
1226 /*
1227 * If we have too many delayed write buffers,
1228 * just fall back to doing the async write.
1229 */
1230 if (nfs_nbdwrite < 0)
1231 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1232 if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
1233 /* issue async write */
1234 SET(bp->nb_flags, NB_ASYNC);
1235 nfs_buf_write(bp);
1236 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1237 return;
1238 }
1239
1240 /* Otherwise, the "write" is done, so mark and release the buffer. */
1241 SET(bp->nb_flags, NB_DONE);
1242 nfs_buf_release(bp, 1);
1243 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1244 return;
1245 }
1246
1247 /*
1248 * Check that a "needcommit" buffer can still be committed.
1249 * If the write verifier has changed, we need to clear the
1250 * needcommit flag.
1251 */
1252 void
1253 nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
1254 {
1255 struct nfsmount *nmp;
1256
1257 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
1258 return;
1259
1260 nmp = NFSTONMP(np);
1261 if (!nmp)
1262 return;
1263 if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
1264 return;
1265
1266 /* write verifier changed, clear commit/wverf flags */
1267 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1268 bp->nb_verf = 0;
1269 nfs_lock(np, NFS_NODE_LOCK_FORCE);
1270 np->n_needcommitcnt--;
1271 CHECK_NEEDCOMMITCNT(np);
1272 nfs_unlock(np);
1273 }
1274
1275 /*
1276 * add a reference to a buffer so it doesn't disappear while being used
1277 * (must be called with nfs_buf_mutex held)
1278 */
1279 void
1280 nfs_buf_refget(struct nfsbuf *bp)
1281 {
1282 bp->nb_refs++;
1283 }
1284 /*
1285 * release a reference on a buffer
1286 * (must be called with nfs_buf_mutex held)
1287 */
1288 void
1289 nfs_buf_refrele(struct nfsbuf *bp)
1290 {
1291 bp->nb_refs--;
1292 }
1293
1294 /*
1295 * mark a particular buffer as BUSY
1296 * (must be called with nfs_buf_mutex held)
1297 */
1298 errno_t
1299 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1300 {
1301 errno_t error;
1302 struct timespec ts;
1303
1304 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1305 /*
1306 * since the mutex_lock may block, the buffer
1307 * may become BUSY, so we need to recheck for
1308 * a NOWAIT request
1309 */
1310 if (flags & NBAC_NOWAIT)
1311 return (EBUSY);
1312 SET(bp->nb_lflags, NBL_WANTED);
1313
1314 ts.tv_sec = (slptimeo/100);
1315 /* the hz value is 100, which gives 10ms per tick */
1316 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1317
1318 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1319 "nfs_buf_acquire", &ts);
1320 if (error)
1321 return (error);
1322 return (EAGAIN);
1323 }
1324 if (flags & NBAC_REMOVE)
1325 nfs_buf_remfree(bp);
1326 SET(bp->nb_lflags, NBL_BUSY);
1327
1328 return (0);
1329 }
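/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * The timeout conversion in nfs_buf_acquire() above treats slptimeo as a count
 * of 100Hz ticks (10ms each): whole seconds go to tv_sec and the leftover
 * ticks become nanoseconds.  A standalone check of that arithmetic:
 */
#include <stdio.h>
#include <time.h>

#define SKETCH_NSEC_PER_USEC 1000LL	/* mirrors NSEC_PER_USEC */

static struct timespec
ticks_to_timespec(int slptimeo)
{
	struct timespec ts;
	ts.tv_sec = slptimeo / 100;
	/* 10ms per tick: 10 * 1000 usec * 1000 nsec/usec */
	ts.tv_nsec = (slptimeo % 100) * 10 * SKETCH_NSEC_PER_USEC * 1000;
	return ts;
}

int
main(void)
{
	struct timespec ts = ticks_to_timespec(250);	/* 250 ticks = 2.5 seconds */
	printf("%ld s + %ld ns\n", (long)ts.tv_sec, (long)ts.tv_nsec);	/* 2 s + 500000000 ns */
	return 0;
}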
1330
1331 /*
1332 * simply drop the BUSY status of a buffer
1333 * (must be called with nfs_buf_mutex held)
1334 */
1335 void
1336 nfs_buf_drop(struct nfsbuf *bp)
1337 {
1338 int need_wakeup = 0;
1339
1340 if (!ISSET(bp->nb_lflags, NBL_BUSY))
1341 panic("nfs_buf_drop: buffer not busy!");
1342 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1343 /* delay the actual wakeup until after we clear NBL_BUSY */
1344 need_wakeup = 1;
1345 }
1346 /* Unlock the buffer. */
1347 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1348
1349 if (need_wakeup)
1350 wakeup(bp);
1351 }
1352
1353 /*
1354 * prepare for iterating over an nfsnode's buffer list
1355 * this lock protects the queue manipulation
1356 * (must be called with nfs_buf_mutex held)
1357 */
1358 int
1359 nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1360 {
1361 struct nfsbuflists *listheadp;
1362
1363 if (flags & NBI_DIRTY)
1364 listheadp = &np->n_dirtyblkhd;
1365 else
1366 listheadp = &np->n_cleanblkhd;
1367
1368 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1369 LIST_INIT(iterheadp);
1370 return(EWOULDBLOCK);
1371 }
1372
1373 while (np->n_bufiterflags & NBI_ITER) {
1374 np->n_bufiterflags |= NBI_ITERWANT;
1375 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
1376 }
1377 if (LIST_EMPTY(listheadp)) {
1378 LIST_INIT(iterheadp);
1379 return(EINVAL);
1380 }
1381 np->n_bufiterflags |= NBI_ITER;
1382
1383 iterheadp->lh_first = listheadp->lh_first;
1384 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1385 LIST_INIT(listheadp);
1386
1387 return(0);
1388 }
1389
1390 /*
1391 * clean up after iterating over an nfsnode's buffer list
1392 * this lock protects the queue manipulation
1393 * (must be called with nfs_buf_mutex held)
1394 */
1395 void
1396 nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
1397 {
1398 struct nfsbuflists * listheadp;
1399 struct nfsbuf *bp;
1400
1401 if (flags & NBI_DIRTY)
1402 listheadp = &np->n_dirtyblkhd;
1403 else
1404 listheadp = &np->n_cleanblkhd;
1405
1406 while (!LIST_EMPTY(iterheadp)) {
1407 bp = LIST_FIRST(iterheadp);
1408 LIST_REMOVE(bp, nb_vnbufs);
1409 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1410 }
1411
1412 np->n_bufiterflags &= ~NBI_ITER;
1413 if (np->n_bufiterflags & NBI_ITERWANT) {
1414 np->n_bufiterflags &= ~NBI_ITERWANT;
1415 wakeup(&np->n_bufiterflags);
1416 }
1417 }
1418
1419
1420 /*
1421 * Read an NFS buffer for a file.
1422 */
1423 int
1424 nfs_buf_read(struct nfsbuf *bp)
1425 {
1426 int error = 0;
1427 nfsnode_t np;
1428 thread_t thd;
1429 kauth_cred_t cred;
1430
1431 np = bp->nb_np;
1432 cred = bp->nb_rcred;
1433 if (IS_VALID_CRED(cred))
1434 kauth_cred_ref(cred);
1435 thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1436
1437 /* sanity checks */
1438 if (!ISSET(bp->nb_flags, NB_READ))
1439 panic("nfs_buf_read: !NB_READ");
1440 if (ISSET(bp->nb_flags, NB_DONE))
1441 CLR(bp->nb_flags, NB_DONE);
1442
1443 NFS_BUF_MAP(bp);
1444
1445 OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios);
1446
1447 error = nfs_buf_read_rpc(bp, thd, cred);
1448 /*
1449 * For async I/O, the callbacks will finish up the
1450 * read. Otherwise, the read has already been finished.
1451 */
1452
1453 if (IS_VALID_CRED(cred))
1454 kauth_cred_unref(&cred);
1455 return (error);
1456 }
1457
1458 /*
1459 * finish the reading of a buffer
1460 */
1461 void
1462 nfs_buf_read_finish(struct nfsbuf *bp)
1463 {
1464 nfsnode_t np = bp->nb_np;
1465 struct nfsmount *nmp;
1466
1467 if (!ISSET(bp->nb_flags, NB_ERROR)) {
1468 /* update valid range */
1469 bp->nb_validoff = 0;
1470 bp->nb_validend = bp->nb_endio;
1471 if (bp->nb_endio < bp->nb_bufsize) {
1472 /*
1473 * The read may be short because we have unflushed writes
1474 * that are extending the file size and the reads hit the
1475 * (old) EOF on the server. So, just make sure nb_validend
1476 * correctly tracks EOF.
1477 * Note that the missing data should have already been zeroed
1478 * in nfs_buf_read_rpc_finish().
1479 */
1480 off_t boff = NBOFF(bp);
1481 if ((off_t)np->n_size >= (boff + bp->nb_bufsize))
1482 bp->nb_validend = bp->nb_bufsize;
1483 else if ((off_t)np->n_size >= boff)
1484 bp->nb_validend = np->n_size - boff;
1485 else
1486 bp->nb_validend = 0;
1487 }
1488 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
1489 ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL))
1490 bp->nb_validend = 0x100000000LL - NBOFF(bp);
1491 bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
1492 if (bp->nb_validend & PAGE_MASK) {
1493 /* zero-fill remainder of last page */
1494 bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend);
1495 }
1496 }
1497 nfs_buf_iodone(bp);
1498 }
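/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * The nb_valid update in nfs_buf_read_finish() above turns a byte count into a
 * per-page bitmap: validend is rounded up to a page boundary and
 * (1 << npages) - 1 sets one bit per valid page.  A standalone version,
 * assuming 4K pages and a buffer small enough for a 32-bit mask, as nb_valid
 * appears to be here:
 */
#include <stdio.h>
#include <stdint.h>

#define PG 4096

static uint32_t
validend_to_pagemask(int validend)
{
	int npages = (validend + PG - 1) / PG;	/* round_page(validend) / PAGE_SIZE */
	return (uint32_t)(1u << npages) - 1;
}

int
main(void)
{
	printf("0x%x\n", validend_to_pagemask(4096));	/* 0x1: exactly one page valid */
	printf("0x%x\n", validend_to_pagemask(10000));	/* 0x7: bytes spill into a third page */
	return 0;
}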
1499
1500 /*
1501 * initiate the NFS READ RPC(s) for a buffer
1502 */
1503 int
1504 nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1505 {
1506 struct nfsmount *nmp;
1507 nfsnode_t np = bp->nb_np;
1508 int error = 0, nfsvers, async;
1509 int offset, length, nmrsize, nrpcs, len;
1510 off_t boff;
1511 struct nfsreq *req;
1512 struct nfsreq_cbinfo cb;
1513
1514 nmp = NFSTONMP(np);
1515 if (!nmp) {
1516 bp->nb_error = error = ENXIO;
1517 SET(bp->nb_flags, NB_ERROR);
1518 nfs_buf_iodone(bp);
1519 return (error);
1520 }
1521 nfsvers = nmp->nm_vers;
1522 nmrsize = nmp->nm_rsize;
1523
1524 boff = NBOFF(bp);
1525 offset = 0;
1526 length = bp->nb_bufsize;
1527
1528 if (nfsvers == NFS_VER2) {
1529 if (boff > 0xffffffffLL) {
1530 bp->nb_error = error = EFBIG;
1531 SET(bp->nb_flags, NB_ERROR);
1532 nfs_buf_iodone(bp);
1533 return (error);
1534 }
1535 if ((boff + length - 1) > 0xffffffffLL)
1536 length = 0x100000000LL - boff;
1537 }
1538
1539 /* Note: Can only do async I/O if nfsiods are configured. */
1540 async = (bp->nb_flags & NB_ASYNC);
1541 cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1542 cb.rcb_bp = bp;
1543
1544 bp->nb_offio = bp->nb_endio = 0;
1545 bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1546 if (async && (nrpcs > 1)) {
1547 SET(bp->nb_flags, NB_MULTASYNCRPC);
1548 } else {
1549 CLR(bp->nb_flags, NB_MULTASYNCRPC);
1550 }
1551
1552 while (length > 0) {
1553 if (ISSET(bp->nb_flags, NB_ERROR)) {
1554 error = bp->nb_error;
1555 break;
1556 }
1557 len = (length > nmrsize) ? nmrsize : length;
1558 cb.rcb_args[0] = offset;
1559 cb.rcb_args[1] = len;
1560 req = NULL;
1561 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
1562 if (error)
1563 break;
1564 offset += len;
1565 length -= len;
1566 if (async)
1567 continue;
1568 nfs_buf_read_rpc_finish(req);
1569 if (ISSET(bp->nb_flags, NB_ERROR)) {
1570 error = bp->nb_error;
1571 break;
1572 }
1573 }
1574
1575 if (length > 0) {
1576 /*
1577 * Something bad happened while trying to send the RPC(s).
1578 * Wait for any outstanding requests to complete.
1579 */
1580 bp->nb_error = error;
1581 SET(bp->nb_flags, NB_ERROR);
1582 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
1583 nrpcs = (length + nmrsize - 1) / nmrsize;
1584 lck_mtx_lock(nfs_buf_mutex);
1585 bp->nb_rpcs -= nrpcs;
1586 if (bp->nb_rpcs == 0) {
1587 /* No RPCs left, so the buffer's done */
1588 lck_mtx_unlock(nfs_buf_mutex);
1589 nfs_buf_iodone(bp);
1590 } else {
1591 /* wait for the last RPC to mark it done */
1592 while (bp->nb_rpcs > 0)
1593 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
1594 "nfs_buf_read_rpc_cancel", NULL);
1595 lck_mtx_unlock(nfs_buf_mutex);
1596 }
1597 } else {
1598 nfs_buf_iodone(bp);
1599 }
1600 }
1601
1602 return (error);
1603 }
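/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * nfs_buf_read_rpc() above chops one buffer into at most nm_rsize-sized READ
 * RPCs.  A standalone sketch of just the chunking arithmetic, printing the
 * (file offset, length) of each RPC that would be issued:
 */
#include <stdio.h>

static int
split_read(long long boff, int length, int nmrsize)
{
	int offset = 0, nrpcs = (length + nmrsize - 1) / nmrsize;

	while (length > 0) {
		int len = (length > nmrsize) ? nmrsize : length;
		printf("READ offset=%lld len=%d\n", boff + offset, len);
		offset += len;
		length -= len;
	}
	return nrpcs;
}

int
main(void)
{
	/* a 32K buffer at file offset 65536 with an 8K server rsize: 4 RPCs */
	printf("nrpcs=%d\n", split_read(65536, 32768, 8192));
	return 0;
}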
1604
1605 /*
1606 * finish up an NFS READ RPC on a buffer
1607 */
1608 void
1609 nfs_buf_read_rpc_finish(struct nfsreq *req)
1610 {
1611 struct nfsmount *nmp;
1612 size_t rlen;
1613 struct nfsreq_cbinfo cb;
1614 struct nfsbuf *bp;
1615 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1616 void *wakeme = NULL;
1617 struct nfsreq *rreq = NULL;
1618 nfsnode_t np;
1619 thread_t thd;
1620 kauth_cred_t cred;
1621 struct uio uio;
1622 struct iovec_32 io;
1623
1624 finish:
1625 np = req->r_np;
1626 thd = req->r_thread;
1627 cred = req->r_cred;
1628 if (IS_VALID_CRED(cred))
1629 kauth_cred_ref(cred);
1630 cb = req->r_callback;
1631 bp = cb.rcb_bp;
1632
1633 nmp = NFSTONMP(np);
1634 if (!nmp) {
1635 SET(bp->nb_flags, NB_ERROR);
1636 bp->nb_error = error = ENXIO;
1637 }
1638 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1639 /* just drop it */
1640 nfs_request_async_cancel(req);
1641 goto out;
1642 }
1643
1644 nfsvers = nmp->nm_vers;
1645 offset = cb.rcb_args[0];
1646 rlen = length = cb.rcb_args[1];
1647
1648 uio.uio_iovs.iov32p = &io;
1649 uio.uio_iovcnt = 1;
1650 uio.uio_rw = UIO_READ;
1651 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
1652 uio.uio_segflg = UIO_SYSSPACE;
1653 #else
1654 uio.uio_segflg = UIO_SYSSPACE32;
1655 #endif
1656 io.iov_len = length;
1657 uio_uio_resid_set(&uio, io.iov_len);
1658 uio.uio_offset = NBOFF(bp) + offset;
1659 io.iov_base = (uintptr_t) bp->nb_data + offset;
1660
1661 /* finish the RPC */
1662 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof);
1663 if ((error == EINPROGRESS) && cb.rcb_func) {
1664 /* async request restarted */
1665 if (IS_VALID_CRED(cred))
1666 kauth_cred_unref(&cred);
1667 return;
1668 }
1669
1670 if (error) {
1671 SET(bp->nb_flags, NB_ERROR);
1672 bp->nb_error = error;
1673 goto out;
1674 }
1675
1676 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen)))
1677 bp->nb_endio = offset + rlen;
1678
1679 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1680 /* zero out the remaining data (up to EOF) */
1681 off_t rpcrem, eofrem, rem;
1682 rpcrem = (length - rlen);
1683 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1684 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
1685 if (rem > 0)
1686 bzero(bp->nb_data + offset + rlen, rem);
1687 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1688 /*
1689 * short read
1690 *
1691 * We haven't hit EOF and we didn't get all the data
1692 * requested, so we need to issue another read for the rest.
1693 * (Don't bother if the buffer already hit an error.)
1694 */
1695 offset += rlen;
1696 length -= rlen;
1697 cb.rcb_args[0] = offset;
1698 cb.rcb_args[1] = length;
1699 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
1700 if (!error) {
1701 if (IS_VALID_CRED(cred))
1702 kauth_cred_unref(&cred);
1703 if (!cb.rcb_func) {
1704 /* if !async we'll need to wait for this RPC to finish */
1705 req = rreq;
1706 goto finish;
1707 }
1708 /*
1709 * We're done here.
1710 * Outstanding RPC count is unchanged.
1711 * Callback will be called when RPC is done.
1712 */
1713 return;
1714 }
1715 SET(bp->nb_flags, NB_ERROR);
1716 bp->nb_error = error;
1717 }
1718
1719 out:
1720 if (IS_VALID_CRED(cred))
1721 kauth_cred_unref(&cred);
1722
1723 /*
1724 * Decrement outstanding RPC count on buffer
1725 * and call nfs_buf_read_finish on last RPC.
1726 *
1727 * (Note: when there are multiple async RPCs issued for a
1728 * buffer we need nfs_buffer_mutex to avoid problems when
1729 * aborting a partially-initiated set of RPCs)
1730 */
1731
1732 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
1733 if (multasyncrpc)
1734 lck_mtx_lock(nfs_buf_mutex);
1735
1736 bp->nb_rpcs--;
1737 finished = (bp->nb_rpcs == 0);
1738
1739 if (multasyncrpc)
1740 lck_mtx_unlock(nfs_buf_mutex);
1741
1742 if (finished) {
1743 if (multasyncrpc)
1744 wakeme = &bp->nb_rpcs;
1745 nfs_buf_read_finish(bp);
1746 if (wakeme)
1747 wakeup(wakeme);
1748 }
1749 }
1750
1751 /*
1752 * Do buffer readahead.
1753 * Initiate async I/O to read buffers not in cache.
1754 */
1755 static int
1756 nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1757 {
1758 struct nfsmount *nmp = NFSTONMP(np);
1759 struct nfsbuf *bp;
1760 int error = 0, nra;
1761
1762 if (!nmp)
1763 return (ENXIO);
1764 if (nmp->nm_readahead <= 0)
1765 return (0);
1766 if (*rabnp > lastrabn)
1767 return (0);
1768
1769 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1770 /* check if block exists and is valid. */
1771 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
1772 if (error)
1773 break;
1774 if (!bp)
1775 continue;
1776 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1777 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) {
1778 CLR(bp->nb_flags, NB_CACHE);
1779 bp->nb_valid = 0;
1780 bp->nb_validoff = bp->nb_validend = -1;
1781 }
1782 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1783 !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1784 SET(bp->nb_flags, (NB_READ|NB_ASYNC));
1785 if (ioflag & IO_NOCACHE)
1786 SET(bp->nb_flags, NB_NCRDAHEAD);
1787 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
1788 kauth_cred_ref(cred);
1789 bp->nb_rcred = cred;
1790 }
1791 if ((error = nfs_buf_read(bp)))
1792 break;
1793 continue;
1794 }
1795 nfs_buf_release(bp, 1);
1796 }
1797 return (error);
1798 }
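/*
 * [Illustrative sketch added by the editor -- not part of the original file.]
 * The readahead loop above walks block numbers *rabnp..lastrabn, issuing at
 * most nm_readahead asynchronous reads per call and leaving *rabnp where it
 * stopped so the next call resumes there.  A standalone model of that
 * cursor/window behaviour:
 */
#include <stdio.h>

static void
readahead_window(long long *rabnp, long long lastrabn, int max_per_call)
{
	int nra;
	for (nra = 0; (nra < max_per_call) && (*rabnp <= lastrabn); nra++, (*rabnp)++)
		printf("  issue async read for block %lld\n", *rabnp);
}

int
main(void)
{
	long long rabn = 10;
	/* two passes with nm_readahead == 4 over blocks 10..16 */
	readahead_window(&rabn, 16, 4);			/* blocks 10-13 */
	readahead_window(&rabn, 16, 4);			/* blocks 14-16 */
	printf("next readahead block: %lld\n", rabn);	/* 17 */
	return 0;
}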
1799
1800 /*
1801 * NFS buffer I/O for reading files/directories.
1802 */
1803 int
1804 nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx)
1805 {
1806 vnode_t vp = NFSTOV(np);
1807 struct nfsbuf *bp = NULL;
1808 struct nfs_vattr nvattr;
1809 struct nfsmount *nmp = VTONMP(vp);
1810 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn;
1811 off_t diff;
1812 int error = 0, n = 0, on = 0;
1813 int nfsvers, biosize;
1814 caddr_t dp;
1815 struct dirent *direntp = NULL;
1816 enum vtype vtype;
1817 thread_t thd;
1818 kauth_cred_t cred;
1819
1820 FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag);
1821
1822 if (uio_uio_resid(uio) == 0) {
1823 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
1824 return (0);
1825 }
1826 if (uio->uio_offset < 0) {
1827 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
1828 return (EINVAL);
1829 }
1830
1831 nfsvers = nmp->nm_vers;
1832 biosize = nmp->nm_biosize;
1833 thd = vfs_context_thread(ctx);
1834 cred = vfs_context_ucred(ctx);
1835
1836 vtype = vnode_vtype(vp);
1837 if ((vtype != VREG) && (vtype != VDIR)) {
1838 printf("nfs_bioread: type %x unexpected\n", vtype);
1839 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
1840 return (EINVAL);
1841 }
1842
1843 /*
1844 * For nfs, cache consistency can only be maintained approximately.
1845 * Although RFC1094 does not specify the criteria, the following is
1846 * believed to be compatible with the reference port.
1847 * For nfs:
1848 * If the file's modify time on the server has changed since the
1849 * last read rpc or you have written to the file,
1850 * you may have lost data cache consistency with the
1851 * server, so flush all of the file's data out of the cache.
1852 * Then force a getattr rpc to ensure that you have up to date
1853 * attributes.
1854 * NB: This implies that cache data can be read when up to
1855 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1856 * current attributes this could be forced by calling
1857 * NATTRINVALIDATE() before the nfs_getattr() call.
1858 */
1859
1860 if (ISSET(np->n_flag, NUPDATESIZE))
1861 nfs_data_update_size(np, 0);
1862
1863 if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
1864 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
1865 return (error);
1866 }
1867
1868 if (np->n_flag & NNEEDINVALIDATE) {
1869 np->n_flag &= ~NNEEDINVALIDATE;
1870 nfs_unlock(np);
1871 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
1872 if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
1873 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
1874 return (error);
1875 }
1876 }
1877
1878 if (np->n_flag & NMODIFIED) {
1879 if (vtype == VDIR) {
1880 nfs_invaldir(np);
1881 nfs_unlock(np);
1882 error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
1883 if (!error)
1884 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
1885 if (error) {
1886 FSDBG_BOT(514, np, 0xd1e0003, 0, error);
1887 return (error);
1888 }
1889 }
1890 NATTRINVALIDATE(np);
1891 error = nfs_getattr(np, &nvattr, ctx, 1);
1892 if (error) {
1893 nfs_unlock(np);
1894 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
1895 return (error);
1896 }
1897 if (vtype == VDIR) {
1898 /* if directory changed, purge any name cache entries */
1899 if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
1900 cache_purge(vp);
1901 NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
1902 }
1903 NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
1904 } else {
1905 error = nfs_getattr(np, &nvattr, ctx, 1);
1906 if (error) {
1907 nfs_unlock(np);
1908 FSDBG_BOT(514, np, 0xd1e0005, 0, error);
1909 return (error);
1910 }
1911 if (NFS_CHANGED(nfsvers, np, &nvattr)) {
1912 if (vtype == VDIR) {
1913 nfs_invaldir(np);
1914 /* purge name cache entries */
1915 if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
1916 cache_purge(vp);
1917 }
1918 nfs_unlock(np);
1919 error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
1920 if (!error)
1921 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
1922 if (error) {
1923 FSDBG_BOT(514, np, 0xd1e0006, 0, error);
1924 return (error);
1925 }
1926 if (vtype == VDIR)
1927 NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
1928 NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
1929 }
1930 }
1931
1932 nfs_unlock(np);
1933
1934 if (vtype == VREG) {
1935 if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) {
1936 /* We have only a block or so to read, just do the rpc directly. */
1937 error = nfs_read_rpc(np, uio, ctx);
1938 FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
1939 return (error);
1940 }
1941 /*
1942 * set up readahead - which may be limited by:
1943 * + current request length (for IO_NOCACHE)
1944 * + readahead setting
1945 * + file size
1946 */
1947 if (nmp->nm_readahead > 0) {
1948 off_t end = uio->uio_offset + uio_uio_resid(uio);
1949 if (end > (off_t)np->n_size)
1950 end = np->n_size;
1951 rabn = uio->uio_offset / biosize;
1952 maxrabn = (end - 1) / biosize;
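/* only extend readahead past this request when caching is enabled and */
/* the access looks sequential (reading block 0, or at/just after the last block read) */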
1953 if (!(ioflag & IO_NOCACHE) &&
1954 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
1955 maxrabn += nmp->nm_readahead;
1956 if ((maxrabn * biosize) >= (off_t)np->n_size)
1957 maxrabn = ((off_t)np->n_size - 1)/biosize;
1958 }
1959 } else {
1960 rabn = maxrabn = 0;
1961 }
1962 }
1963
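/* loop copying out one logical block per pass: biosize blocks for files, */
/* NFS_DIRBLKSIZ blocks for directories */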
1964 do {
1965
1966 if (vtype == VREG) {
1967 nfs_data_lock(np, NFS_NODE_LOCK_SHARED);
1968 lbn = uio->uio_offset / biosize;
1969
1970 /*
1971 * Copy directly from any cached pages without grabbing the bufs.
1972 *
1973 * Note: for "nocache" reads, we don't copy directly from UBC
1974 * because any cached pages will be for readahead buffers that
1975 * need to be invalidated anyway before we finish this request.
1976 */
1977 if (!(ioflag & IO_NOCACHE) &&
1978 (uio->uio_segflg == UIO_USERSPACE32 ||
1979 uio->uio_segflg == UIO_USERSPACE64 ||
1980 uio->uio_segflg == UIO_USERSPACE)) {
1981 // LP64todo - fix this!
1982 int io_resid = uio_uio_resid(uio);
1983 diff = np->n_size - uio->uio_offset;
1984 if (diff < io_resid)
1985 io_resid = diff;
1986 if (io_resid > 0) {
1987 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1988 if (error) {
1989 nfs_data_unlock(np);
1990 FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error);
1991 return (error);
1992 }
1993 }
1994 /* count any biocache reads that we just copied directly */
1995 if (lbn != (uio->uio_offset / biosize)) {
1996 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
1997 FSDBG(514, np, 0xcacefeed, uio->uio_offset, error);
1998 }
1999 }
2000
2001 lbn = uio->uio_offset / biosize;
2002 on = uio->uio_offset % biosize;
2003 np->n_lastread = (uio->uio_offset - 1) / biosize;
2004
2005 /* adjust readahead block number, if necessary */
2006 if (rabn < lbn)
2007 rabn = lbn;
2008 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2009 if (rabn <= lastrabn) { /* start readaheads */
2010 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2011 if (error) {
2012 nfs_data_unlock(np);
2013 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2014 return (error);
2015 }
2016 }
2017
2018 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
2019 nfs_data_unlock(np);
2020 FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
2021 return (0);
2022 }
2023
2024 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
2025
2026 /*
2027 * If the block is in the cache and has the required data
2028 * in a valid region, just copy it out.
2029 * Otherwise, get the block and write back/read in,
2030 * as required.
2031 */
2032 again:
2033 // LP64todo - fix this!
2034 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
2035 diff = np->n_size - uio->uio_offset;
2036 if (diff < n)
2037 n = diff;
2038
2039 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
2040 if (error) {
2041 nfs_data_unlock(np);
2042 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2043 return (error);
2044 }
2045
2046 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2047 /*
2048 * IO_NOCACHE found a cached buffer.
2049 * Flush the buffer if it's dirty.
2050 * Invalidate the data if it wasn't just read
2051 * in as part of a "nocache readahead".
2052 */
2053 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2054 /* so write the buffer out and try again */
2055 SET(bp->nb_flags, NB_NOCACHE);
2056 goto flushbuffer;
2057 }
2058 if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2059 CLR(bp->nb_flags, NB_CACHE);
2060 bp->nb_valid = 0;
2061 } else {
2062 CLR(bp->nb_flags, NB_NCRDAHEAD);
2063 }
2064 }
2065
2066 /* if any pages are valid... */
2067 if (bp->nb_valid) {
2068 /* ...check for any invalid pages in the read range */
2069 int pg, firstpg, lastpg, dirtypg;
2070 dirtypg = firstpg = lastpg = -1;
2071 pg = on/PAGE_SIZE;
2072 while (pg <= (on + n - 1)/PAGE_SIZE) {
2073 if (!NBPGVALID(bp,pg)) {
2074 if (firstpg < 0)
2075 firstpg = pg;
2076 lastpg = pg;
2077 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
2078 dirtypg = pg;
2079 pg++;
2080 }
2081
2082 /* if there are no invalid pages, we're all set */
2083 if (firstpg < 0) {
2084 if (bp->nb_validoff < 0) {
2085 /* valid range isn't set up, so */
2086 /* set it to what we know is valid */
2087 bp->nb_validoff = trunc_page(on);
2088 bp->nb_validend = round_page(on+n);
2089 nfs_buf_normalize_valid_range(np, bp);
2090 }
2091 goto buffer_ready;
2092 }
2093
2094 /* there are invalid pages in the read range */
2095 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2096 (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) {
2097 /* there are also dirty page(s) (or range) in the read range, */
2098 /* so write the buffer out and try again */
2099 flushbuffer:
2100 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2101 SET(bp->nb_flags, NB_ASYNC);
2102 if (!IS_VALID_CRED(bp->nb_wcred)) {
2103 kauth_cred_ref(cred);
2104 bp->nb_wcred = cred;
2105 }
2106 error = nfs_buf_write(bp);
2107 if (error) {
2108 nfs_data_unlock(np);
2109 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
2110 return (error);
2111 }
2112 goto again;
2113 }
2114 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2115 (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) {
2116 /* we need to read in more than half the buffer and the */
2117 /* buffer's not dirty, so just fetch the whole buffer */
2118 bp->nb_valid = 0;
2119 } else {
2120 /* read the page range in */
2121 uio_t auio;
2122 char uio_buf[ UIO_SIZEOF(1) ];
2123
2124 NFS_BUF_MAP(bp);
2125 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
2126 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2127 if (!auio) {
2128 error = ENOMEM;
2129 } else {
2130 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
2131 ((lastpg - firstpg + 1) * PAGE_SIZE));
2132 error = nfs_read_rpc(np, auio, ctx);
2133 }
2134 if (error) {
2135 if (ioflag & IO_NOCACHE)
2136 SET(bp->nb_flags, NB_NOCACHE);
2137 nfs_buf_release(bp, 1);
2138 nfs_data_unlock(np);
2139 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
2140 return (error);
2141 }
2142 /* Make sure that the valid range is set to cover this read. */
2143 bp->nb_validoff = trunc_page_32(on);
2144 bp->nb_validend = round_page_32(on+n);
2145 nfs_buf_normalize_valid_range(np, bp);
2146 if (uio_resid(auio) > 0) {
2147 /* if short read, must have hit EOF, */
2148 /* so zero the rest of the range */
2149 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
2150 }
2151 /* mark the pages (successfully read) as valid */
2152 for (pg=firstpg; pg <= lastpg; pg++)
2153 NBPGVALID_SET(bp,pg);
2154 }
2155 }
2156 /* if no pages are valid, read the whole block */
2157 if (!bp->nb_valid) {
2158 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2159 kauth_cred_ref(cred);
2160 bp->nb_rcred = cred;
2161 }
2162 SET(bp->nb_flags, NB_READ);
2163 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2164 error = nfs_buf_read(bp);
2165 if (error) {
2166 nfs_data_unlock(np);
2167 nfs_buf_release(bp, 1);
2168 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
2169 return (error);
2170 }
2171 }
2172 buffer_ready:
2173 /* validate read range against valid range and clip */
2174 if (bp->nb_validend > 0) {
2175 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2176 if (diff < n)
2177 n = diff;
2178 }
2179 if (n > 0)
2180 NFS_BUF_MAP(bp);
2181 } else if (vtype == VDIR) {
2182 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
2183 error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
2184 if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) {
2185 if (!error)
2186 nfs_unlock(np);
2187 if (eofflag)
2188 *eofflag = 1;
2189 FSDBG_BOT(514, np, 0xde0f0001, 0, 0);
2190 return (0);
2191 }
2192 nfs_unlock(np);
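/* directory block number and byte offset within that NFS_DIRBLKSIZ block */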
2193 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
2194 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
2195 error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
2196 if (error) {
2197 FSDBG_BOT(514, np, 0xd1e0012, 0, error);
2198 return (error);
2199 }
2200 if (!ISSET(bp->nb_flags, NB_CACHE)) {
2201 SET(bp->nb_flags, NB_READ);
2202 error = nfs_buf_readdir(bp, ctx);
2203 if (error)
2204 nfs_buf_release(bp, 1);
2205 while (error == NFSERR_BAD_COOKIE) {
2206 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
2207 if (!error) {
2208 nfs_invaldir(np);
2209 nfs_unlock(np);
2210 }
2211 error = nfs_vinvalbuf(vp, 0, ctx, 1);
2212 /*
2213 * Yuck! The directory has been modified on the
2214 * server. The only way to get the block is by
2215 * reading from the beginning to get all the
2216 * offset cookies.
2217 */
2218 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
2219 if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED)))
2220 break;
2221 if (np->n_direofoffset
2222 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
2223 nfs_unlock(np);
2224 if (eofflag)
2225 *eofflag = 1;
2226 FSDBG_BOT(514, np, 0xde0f0002, 0, 0);
2227 return (0);
2228 }
2229 nfs_unlock(np);
2230 error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
2231 if (error) {
2232 FSDBG_BOT(514, np, 0xd1e0013, 0, error);
2233 return (error);
2234 }
2235 if (!ISSET(bp->nb_flags, NB_CACHE)) {
2236 SET(bp->nb_flags, NB_READ);
2237 error = nfs_buf_readdir(bp, ctx);
2238 /*
2239 * no error + NB_INVAL == directory EOF,
2240 * use the block.
2241 */
2242 if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) {
2243 if (eofflag)
2244 *eofflag = 1;
2245 break;
2246 }
2247 }
2248 /*
2249 * An error will throw away the block and the
2250 * for loop will break out. If no error and this
2251 * is not the block we want, we throw away the
2252 * block and go for the next one via the for loop.
2253 */
2254 if (error || (tlbn < lbn))
2255 nfs_buf_release(bp, 1);
2256 }
2257 }
2258 /*
2259 * The above while is repeated if we hit another cookie
2260 * error. If we hit an error and it wasn't a cookie error,
2261 * we give up.
2262 */
2263 if (error) {
2264 FSDBG_BOT(514, np, 0xd1e0014, 0, error);
2265 return (error);
2266 }
2267 }
2268 /*
2269 * Make sure we use a signed variant of min() since
2270 * the second term may be negative.
2271 */
2272 // LP64todo - fix this!
2273 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
2274 /*
2275 * We keep track of the directory eof in
2276 * np->n_direofoffset and chop it off as an
2277 * extra step right here.
2278 */
2279 if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) {
2280 FSDBG_BOT(514, np, 0xd1e0115, 0, error);
2281 return (error);
2282 }
2283 if (np->n_direofoffset &&
2284 n > np->n_direofoffset - uio->uio_offset)
2285 n = np->n_direofoffset - uio->uio_offset;
2286 nfs_unlock(np);
2287 /*
2288 * Make sure that we return an integral number of entries so
2289 * that any subsequent calls will start copying from the start
2290 * of the next entry.
2291 *
2292 * If the current value of n has the last entry cut short,
2293 * set n to copy everything up to the last entry instead.
2294 */
2295 if (n > 0) {
2296 dp = bp->nb_data + on;
2297 while (dp < (bp->nb_data + on + n)) {
2298 direntp = (struct dirent *)dp;
2299 dp += direntp->d_reclen;
2300 }
2301 if (dp > (bp->nb_data + on + n))
2302 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
2303 }
2304 }
2305
2306 if (n > 0)
2307 error = uiomove(bp->nb_data + on, (int)n, uio);
2308
2309 if (vtype == VREG) {
2310 if (ioflag & IO_NOCACHE)
2311 SET(bp->nb_flags, NB_NOCACHE);
2312 nfs_buf_release(bp, 1);
2313 nfs_data_unlock(np);
2314 np->n_lastread = (uio->uio_offset - 1) / biosize;
2315 } else {
2316 nfs_buf_release(bp, 1);
2317 }
2318 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
2319 FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
2320 return (error);
2321 }
2322
2323 /*
2324 * limit the number of outstanding async I/O writes
2325 */
2326 static int
2327 nfs_async_write_start(struct nfsmount *nmp)
2328 {
2329 int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
2330 struct timespec ts = {1, 0};
2331
2332 if (nfs_max_async_writes <= 0)
2333 return (0);
2334 lck_mtx_lock(&nmp->nm_lock);
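/* wait for a slot, rechecking for signals on each one-second wakeup */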
2335 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2336 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
2337 break;
2338 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts);
2339 slpflag = 0;
2340 }
2341 if (!error)
2342 nmp->nm_asyncwrites++;
2343 lck_mtx_unlock(&nmp->nm_lock);
2344 return (error);
2345 }
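/*
 * drop one outstanding async write and, if we were at the limit,
 * wake any writers waiting in nfs_async_write_start()
 */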
2346 static void
2347 nfs_async_write_done(struct nfsmount *nmp)
2348 {
2349 if (nmp->nm_asyncwrites <= 0)
2350 return;
2351 lck_mtx_lock(&nmp->nm_lock);
2352 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes)
2353 wakeup(&nmp->nm_asyncwrites);
2354 lck_mtx_unlock(&nmp->nm_lock);
2355 }
2356
2357 /*
2358 * write (or commit) the given NFS buffer
2359 *
2360 * Commit the buffer if we can.
2361 * Write out any dirty range.
2362 * If any dirty pages remain, write them out.
2363 * Mark buffer done.
2364 *
2365 * For async requests, all the work beyond sending the initial
2366 * write RPC is handled in the RPC callback(s).
2367 */
2368 int
2369 nfs_buf_write(struct nfsbuf *bp)
2370 {
2371 int error = 0, oldflags, async;
2372 nfsnode_t np;
2373 thread_t thd;
2374 kauth_cred_t cred;
2375 proc_t p = current_proc();
2376 int iomode, doff, dend, firstpg, lastpg;
2377 uint32_t pagemask;
2378
2379 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
2380
2381 if (!ISSET(bp->nb_lflags, NBL_BUSY))
2382 panic("nfs_buf_write: buffer is not busy???");
2383
2384 np = bp->nb_np;
2385 async = ISSET(bp->nb_flags, NB_ASYNC);
2386 oldflags = bp->nb_flags;
2387
2388 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
2389 if (ISSET(oldflags, NB_DELWRI)) {
2390 lck_mtx_lock(nfs_buf_mutex);
2391 nfs_nbdwrite--;
2392 NFSBUFCNTCHK();
2393 lck_mtx_unlock(nfs_buf_mutex);
2394 wakeup(&nfs_nbdwrite);
2395 }
2396
2397 /* move to clean list */
2398 if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) {
2399 lck_mtx_lock(nfs_buf_mutex);
2400 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2401 LIST_REMOVE(bp, nb_vnbufs);
2402 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2403 lck_mtx_unlock(nfs_buf_mutex);
2404 }
2405 vnode_startwrite(NFSTOV(np));
2406
2407 if (p && p->p_stats)
2408 OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock);
2409
2410 cred = bp->nb_wcred;
2411 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
2412 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2413 if (IS_VALID_CRED(cred))
2414 kauth_cred_ref(cred);
2415 thd = async ? NULL : current_thread();
2416
2417 /* We need to make sure the pages are locked before doing I/O. */
2418 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
2419 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2420 error = nfs_buf_upl_setup(bp);
2421 if (error) {
2422 printf("nfs_buf_write: upl create failed %d\n", error);
2423 SET(bp->nb_flags, NB_ERROR);
2424 bp->nb_error = error = EIO;
2425 nfs_buf_iodone(bp);
2426 goto out;
2427 }
2428 nfs_buf_upl_check(bp);
2429 }
2430 }
2431
2432 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2433 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2434 nfs_buf_check_write_verifier(np, bp);
2435 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2436 struct nfsmount *nmp = NFSTONMP(np);
2437 if (!nmp) {
2438 SET(bp->nb_flags, NB_ERROR);
2439 bp->nb_error = error = EIO;
2440 nfs_buf_iodone(bp);
2441 goto out;
2442 }
2443 SET(bp->nb_flags, NB_WRITEINPROG);
2444 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2445 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
2446 CLR(bp->nb_flags, NB_WRITEINPROG);
2447 if (error) {
2448 if (error != NFSERR_STALEWRITEVERF) {
2449 SET(bp->nb_flags, NB_ERROR);
2450 bp->nb_error = error;
2451 }
2452 nfs_buf_iodone(bp);
2453 goto out;
2454 }
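/* the commit succeeded, so clear the buffer's dirty range and needcommit state */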
2455 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2456 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2457 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2458 np->n_needcommitcnt--;
2459 CHECK_NEEDCOMMITCNT(np);
2460 nfs_unlock(np);
2461 }
2462 if (!error && (bp->nb_dirtyend > 0)) {
2463 /* sanity check the dirty range */
2464 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2465 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2466 if (bp->nb_dirtyoff >= bp->nb_dirtyend)
2467 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2468 }
2469 }
2470 if (!error && (bp->nb_dirtyend > 0)) {
2471 /* there's a dirty range that needs to be written out */
2472 NFS_BUF_MAP(bp);
2473
2474 doff = bp->nb_dirtyoff;
2475 dend = bp->nb_dirtyend;
2476
2477 /* if doff page is dirty, move doff to start of page */
2478 if (NBPGDIRTY(bp, doff / PAGE_SIZE))
2479 doff -= doff & PAGE_MASK;
2480 /* try to expand write range to include preceding dirty pages */
2481 if (!(doff & PAGE_MASK))
2482 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
2483 doff -= PAGE_SIZE;
2484 /* if dend page is dirty, move dend to start of next page */
2485 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2486 dend = round_page_32(dend);
2487 /* try to expand write range to include trailing dirty pages */
2488 if (!(dend & PAGE_MASK))
2489 while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2490 dend += PAGE_SIZE;
2491 /* make sure to keep dend clipped to EOF */
2492 if ((NBOFF(bp) + dend) > (off_t) np->n_size)
2493 dend = np->n_size - NBOFF(bp);
2494 /* calculate range of complete pages being written */
2495 firstpg = round_page_32(doff) / PAGE_SIZE;
2496 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2497 /* calculate mask for that page range */
2498 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
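/* e.g. firstpg 1, lastpg 3 yields pagemask 0x0e (bits set for pages 1-3) */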
2499
2500 /*
2501 * compare page mask to nb_dirty; if there are other dirty pages
2502 * then write FILESYNC; otherwise, write UNSTABLE if async and
2503 * not needcommit/stable; otherwise write FILESYNC
2504 */
2505 if (bp->nb_dirty & ~pagemask)
2506 iomode = NFS_WRITE_FILESYNC;
2507 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
2508 iomode = NFS_WRITE_UNSTABLE;
2509 else
2510 iomode = NFS_WRITE_FILESYNC;
2511
2512 /* write the whole contiguous dirty range */
2513 bp->nb_offio = doff;
2514 bp->nb_endio = dend;
2515
2516 OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
2517
2518 SET(bp->nb_flags, NB_WRITEINPROG);
2519 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
2520 /*
2521 * For async I/O, the callbacks will finish up the
2522 * write and push out any dirty pages. Otherwise,
2523 * the write has already been finished and any dirty
2524 * pages pushed out.
2525 */
2526 } else {
2527 if (!error && bp->nb_dirty) /* write out any dirty pages */
2528 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2529 nfs_buf_iodone(bp);
2530 }
2531 /* note: bp is still valid only for !async case */
2532 out:
2533 if (!async) {
2534 error = nfs_buf_iowait(bp);
2535 /* move to clean list */
2536 if (oldflags & NB_DELWRI) {
2537 lck_mtx_lock(nfs_buf_mutex);
2538 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2539 LIST_REMOVE(bp, nb_vnbufs);
2540 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2541 lck_mtx_unlock(nfs_buf_mutex);
2542 }
2543 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2544 nfs_buf_release(bp, 1);
2545 /* check if we need to invalidate (and we can) */
2546 if ((np->n_flag & NNEEDINVALIDATE) &&
2547 !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
2548 int invalidate = 0;
2549 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2550 if (np->n_flag & NNEEDINVALIDATE) {
2551 invalidate = 1;
2552 np->n_flag &= ~NNEEDINVALIDATE;
2553 }
2554 nfs_unlock(np);
2555 if (invalidate) {
2556 /*
2557 * There was a write error and we need to
2558 * invalidate attrs and flush buffers in
2559 * order to sync up with the server.
2560 * (if this write was extending the file,
2561 * we may no longer know the correct size)
2562 *
2563 * But we couldn't call vinvalbuf while holding
2564 * the buffer busy. So we call vinvalbuf() after
2565 * releasing the buffer.
2566 */
2567 nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);
2568 }
2569 }
2570 }
2571
2572 if (IS_VALID_CRED(cred))
2573 kauth_cred_unref(&cred);
2574 return (error);
2575 }
2576
2577 /*
2578 * finish the writing of a buffer
2579 */
2580 void
2581 nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2582 {
2583 nfsnode_t np = bp->nb_np;
2584 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2585 int firstpg, lastpg;
2586 uint32_t pagemask;
2587
2588 if ((error == EINTR) || (error == ERESTART)) {
2589 CLR(bp->nb_flags, NB_ERROR);
2590 SET(bp->nb_flags, NB_EINTR);
2591 }
2592
2593 if (!error) {
2594 /* calculate range of complete pages being written */
2595 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2596 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2597 /* calculate mask for that page range written */
2598 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2599 /* clear dirty bits for pages we've written */
2600 bp->nb_dirty &= ~pagemask;
2601 }
2602
2603 /* manage needcommit state */
2604 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2605 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2606 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2607 np->n_needcommitcnt++;
2608 nfs_unlock(np);
2609 SET(bp->nb_flags, NB_NEEDCOMMIT);
2610 }
2611 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2612 bp->nb_dirtyoff = bp->nb_offio;
2613 bp->nb_dirtyend = bp->nb_endio;
2614 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2615 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2616 np->n_needcommitcnt--;
2617 CHECK_NEEDCOMMITCNT(np);
2618 nfs_unlock(np);
2619 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2620 }
2621
2622 CLR(bp->nb_flags, NB_WRITEINPROG);
2623
2624 /*
2625 * For an unstable write, the buffer is still treated as dirty until
2626 * a commit (or stable (re)write) is performed. Buffers needing only
2627 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2628 *
2629 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2630 * because that would cause the buffer to be dropped. The buffer is
2631 * still valid and simply needs to be written again.
2632 */
2633 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2634 CLR(bp->nb_flags, NB_INVAL);
2635 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2636 SET(bp->nb_flags, NB_DELWRI);
2637 lck_mtx_lock(nfs_buf_mutex);
2638 nfs_nbdwrite++;
2639 NFSBUFCNTCHK();
2640 lck_mtx_unlock(nfs_buf_mutex);
2641 }
2642 /*
2643 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2644 * clean list, we have to reassign it back to the dirty one. Ugh.
2645 */
2646 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2647 /* move to dirty list */
2648 lck_mtx_lock(nfs_buf_mutex);
2649 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2650 LIST_REMOVE(bp, nb_vnbufs);
2651 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2652 lck_mtx_unlock(nfs_buf_mutex);
2653 }
2654 } else {
2655 /* either there's an error or we don't need to commit */
2656 if (error) {
2657 /*
2658 * There was a write error and we need to invalidate
2659 * attrs and flush buffers in order to sync up with the
2660 * server. (if this write was extending the file, we
2661 * may no longer know the correct size)
2662 *
2663 * But we can't call vinvalbuf while holding this
2664 * buffer busy. Set a flag to do it after releasing
2665 * the buffer.
2666 */
2667 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2668 np->n_error = error;
2669 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2670 NATTRINVALIDATE(np);
2671 nfs_unlock(np);
2672 }
2673 /* clear the dirty range */
2674 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2675 }
2676
2677 if (!error && bp->nb_dirty)
2678 nfs_buf_write_dirty_pages(bp, thd, cred);
2679 nfs_buf_iodone(bp);
2680 }
2681
2682 /*
2683 * write out any pages marked dirty in a buffer
2684 *
2685 * We do use unstable writes and follow up with a commit.
2686 * If we catch the write verifier changing we'll restart
2687 * do the writes filesync.
2688 */
2689 int
2690 nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2691 {
2692 nfsnode_t np = bp->nb_np;
2693 struct nfsmount *nmp = NFSTONMP(np);
2694 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2695 uint32_t dirty = bp->nb_dirty;
2696 uint64_t wverf;
2697 struct uio uio;
2698 struct iovec_32 io;
2699
2700 if (!bp->nb_dirty)
2701 return (0);
2702
2703 /* there are pages marked dirty that need to be written out */
2704 OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
2705 NFS_BUF_MAP(bp);
2706 SET(bp->nb_flags, NB_WRITEINPROG);
2707 npages = bp->nb_bufsize / PAGE_SIZE;
2708 iomode = NFS_WRITE_UNSTABLE;
2709
2710 uio.uio_iovs.iov32p = &io;
2711 uio.uio_iovcnt = 1;
2712 uio.uio_rw = UIO_WRITE;
2713 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2714 uio.uio_segflg = UIO_SYSSPACE;
2715 #else
2716 uio.uio_segflg = UIO_SYSSPACE32;
2717 #endif
2718
2719 again:
2720 dirty = bp->nb_dirty;
2721 wverf = bp->nb_verf;
2722 commit = NFS_WRITE_FILESYNC;
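/* coalesce each run of contiguous dirty pages into a single write */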
2723 for (pg = 0; pg < npages; pg++) {
2724 if (!NBPGDIRTY(bp, pg))
2725 continue;
2726 count = 1;
2727 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
2728 count++;
2729 /* write count pages starting with page pg */
2730 off = pg * PAGE_SIZE;
2731 len = count * PAGE_SIZE;
2732 /* clip writes to EOF */
2733 if (NBOFF(bp) + off + len > (off_t) np->n_size)
2734 len -= (NBOFF(bp) + off + len) - np->n_size;
2735 if (len > 0) {
2736 iomode2 = iomode;
2737 io.iov_len = len;
2738 uio_uio_resid_set(&uio, io.iov_len);
2739 uio.uio_offset = NBOFF(bp) + off;
2740 io.iov_base = (uintptr_t) bp->nb_data + off;
2741 error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf);
2742 if (error)
2743 break;
2744 if (iomode2 < commit) /* Retain the lowest commitment level returned. */
2745 commit = iomode2;
2746 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2747 /* verifier changed, redo all the writes filesync */
2748 iomode = NFS_WRITE_FILESYNC;
2749 goto again;
2750 }
2751 }
2752 /* clear dirty bits */
2753 while (count--) {
2754 dirty &= ~(1 << pg);
2755 if (count) /* leave pg on last page */
2756 pg++;
2757 }
2758 }
2759 CLR(bp->nb_flags, NB_WRITEINPROG);
2760
2761 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2762 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
2763 if (error == NFSERR_STALEWRITEVERF) {
2764 /* verifier changed, so we need to restart all the writes */
2765 iomode = NFS_WRITE_FILESYNC;
2766 goto again;
2767 }
2768 }
2769 if (!error) {
2770 bp->nb_dirty = dirty;
2771 } else {
2772 SET(bp->nb_flags, NB_ERROR);
2773 bp->nb_error = error;
2774 }
2775 return (error);
2776 }
2777
2778 /*
2779 * initiate the NFS WRITE RPC(s) for a buffer
2780 */
2781 int
2782 nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2783 {
2784 struct nfsmount *nmp;
2785 nfsnode_t np = bp->nb_np;
2786 int error = 0, nfsvers, async;
2787 int offset, length, nmwsize, nrpcs, len;
2788 struct nfsreq *req;
2789 struct nfsreq_cbinfo cb;
2790 struct uio uio;
2791 struct iovec_32 io;
2792
2793 nmp = NFSTONMP(np);
2794 if (!nmp) {
2795 bp->nb_error = error = ENXIO;
2796 SET(bp->nb_flags, NB_ERROR);
2797 nfs_buf_iodone(bp);
2798 return (error);
2799 }
2800 nfsvers = nmp->nm_vers;
2801 nmwsize = nmp->nm_wsize;
2802
2803 offset = bp->nb_offio;
2804 length = bp->nb_endio - bp->nb_offio;
2805
2806 /* Note: Can only do async I/O if nfsiods are configured. */
2807 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2808 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2809 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2810 cb.rcb_bp = bp;
2811
2812 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2813 bp->nb_error = error = EFBIG;
2814 SET(bp->nb_flags, NB_ERROR);
2815 nfs_buf_iodone(bp);
2816 return (error);
2817 }
2818
2819 uio.uio_iovs.iov32p = &io;
2820 uio.uio_iovcnt = 1;
2821 uio.uio_rw = UIO_WRITE;
2822 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2823 uio.uio_segflg = UIO_SYSSPACE;
2824 #else
2825 uio.uio_segflg = UIO_SYSSPACE32;
2826 #endif
2827 io.iov_len = length;
2828 uio_uio_resid_set(&uio, io.iov_len);
2829 uio.uio_offset = NBOFF(bp) + offset;
2830 io.iov_base = (uintptr_t) bp->nb_data + offset;
2831
2832 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
2833 if (async && (nrpcs > 1)) {
2834 SET(bp->nb_flags, NB_MULTASYNCRPC);
2835 } else {
2836 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2837 }
2838
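/* issue the write RPC(s), splitting the range into chunks of at most nm_wsize bytes */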
2839 while (length > 0) {
2840 if (ISSET(bp->nb_flags, NB_ERROR)) {
2841 error = bp->nb_error;
2842 break;
2843 }
2844 len = (length > nmwsize) ? nmwsize : length;
2845 cb.rcb_args[0] = offset;
2846 cb.rcb_args[1] = len;
2847 if (async && ((error = nfs_async_write_start(nmp))))
2848 break;
2849 req = NULL;
2850 error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred,
2851 iomode, &cb, &req);
2852 if (error) {
2853 if (async)
2854 nfs_async_write_done(nmp);
2855 break;
2856 }
2857 offset += len;
2858 length -= len;
2859 if (async)
2860 continue;
2861 nfs_buf_write_rpc_finish(req);
2862 }
2863
2864 if (length > 0) {
2865 /*
2866 * Something bad happened while trying to send the RPCs.
2867 * Wait for any outstanding requests to complete.
2868 */
2869 bp->nb_error = error;
2870 SET(bp->nb_flags, NB_ERROR);
2871 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2872 nrpcs = (length + nmwsize - 1) / nmwsize;
2873 lck_mtx_lock(nfs_buf_mutex);
2874 bp->nb_rpcs -= nrpcs;
2875 if (bp->nb_rpcs == 0) {
2876 /* No RPCs left, so the buffer's done */
2877 lck_mtx_unlock(nfs_buf_mutex);
2878 nfs_buf_write_finish(bp, thd, cred);
2879 } else {
2880 /* wait for the last RPC to mark it done */
2881 while (bp->nb_rpcs > 0)
2882 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2883 "nfs_buf_write_rpc_cancel", NULL);
2884 lck_mtx_unlock(nfs_buf_mutex);
2885 }
2886 } else {
2887 nfs_buf_write_finish(bp, thd, cred);
2888 }
2889 }
2890
2891 return (error);
2892 }
2893
2894 /*
2895 * finish up an NFS WRITE RPC on a buffer
2896 */
2897 void
2898 nfs_buf_write_rpc_finish(struct nfsreq *req)
2899 {
2900 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2901 int committed = NFS_WRITE_FILESYNC;
2902 uint64_t wverf = 0;
2903 size_t rlen;
2904 void *wakeme = NULL;
2905 struct nfsreq_cbinfo cb;
2906 struct nfsreq *wreq = NULL;
2907 struct nfsbuf *bp;
2908 struct nfsmount *nmp;
2909 nfsnode_t np;
2910 thread_t thd;
2911 kauth_cred_t cred;
2912 struct uio uio;
2913 struct iovec_32 io;
2914
2915 finish:
2916 np = req->r_np;
2917 thd = req->r_thread;
2918 cred = req->r_cred;
2919 if (IS_VALID_CRED(cred))
2920 kauth_cred_ref(cred);
2921 cb = req->r_callback;
2922 bp = cb.rcb_bp;
2923
2924 nmp = NFSTONMP(np);
2925 if (!nmp) {
2926 SET(bp->nb_flags, NB_ERROR);
2927 bp->nb_error = error = ENXIO;
2928 }
2929 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
2930 /* just drop it */
2931 nfs_request_async_cancel(req);
2932 goto out;
2933 }
2934 nfsvers = nmp->nm_vers;
2935
2936 offset = cb.rcb_args[0];
2937 rlen = length = cb.rcb_args[1];
2938
2939 /* finish the RPC */
2940 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
2941 if ((error == EINPROGRESS) && cb.rcb_func) {
2942 /* async request restarted */
2943 if (IS_VALID_CRED(cred))
2944 kauth_cred_unref(&cred);
2945 return;
2946 }
2947
2948 if (error) {
2949 SET(bp->nb_flags, NB_ERROR);
2950 bp->nb_error = error;
2951 }
2952 if (error || (nfsvers == NFS_VER2))
2953 goto out;
2954 if (rlen <= 0) {
2955 SET(bp->nb_flags, NB_ERROR);
2956 bp->nb_error = error = EIO;
2957 goto out;
2958 }
2959
2960 /* save lowest commit level returned */
2961 if (committed < bp->nb_commitlevel)
2962 bp->nb_commitlevel = committed;
2963
2964 /* check the write verifier */
2965 if (!bp->nb_verf) {
2966 bp->nb_verf = wverf;
2967 } else if (bp->nb_verf != wverf) {
2968 /* verifier changed, so buffer will need to be rewritten */
2969 bp->nb_flags |= NB_STALEWVERF;
2970 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
2971 bp->nb_verf = wverf;
2972 }
2973
2974 /*
2975 * check for a short write
2976 *
2977 * If the server didn't write all the data, then we
2978 * need to issue another write for the rest of it.
2979 * (Don't bother if the buffer hit an error or stale wverf.)
2980 */
2981 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
2982 offset += rlen;
2983 length -= rlen;
2984
2985 uio.uio_iovs.iov32p = &io;
2986 uio.uio_iovcnt = 1;
2987 uio.uio_rw = UIO_WRITE;
2988 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2989 uio.uio_segflg = UIO_SYSSPACE;
2990 #else
2991 uio.uio_segflg = UIO_SYSSPACE32;
2992 #endif
2993 io.iov_len = length;
2994 uio_uio_resid_set(&uio, io.iov_len);
2995 uio.uio_offset = NBOFF(bp) + offset;
2996 io.iov_base = (uintptr_t) bp->nb_data + offset;
2997
2998 cb.rcb_args[0] = offset;
2999 cb.rcb_args[1] = length;
3000
3001 error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred,
3002 NFS_WRITE_FILESYNC, &cb, &wreq);
3003 if (!error) {
3004 if (IS_VALID_CRED(cred))
3005 kauth_cred_unref(&cred);
3006 if (!cb.rcb_func) {
3007 /* if !async we'll need to wait for this RPC to finish */
3008 req = wreq;
3009 goto finish;
3010 }
3011 /*
3012 * We're done here.
3013 * Outstanding RPC count is unchanged.
3014 * Callback will be called when RPC is done.
3015 */
3016 return;
3017 }
3018 SET(bp->nb_flags, NB_ERROR);
3019 bp->nb_error = error;
3020 }
3021
3022 out:
3023 if (cb.rcb_func)
3024 nfs_async_write_done(nmp);
3025 /*
3026 * Decrement outstanding RPC count on buffer
3027 * and call nfs_buf_write_finish on last RPC.
3028 *
3029 * (Note: when there are multiple async RPCs issued for a
3030 * buffer we need nfs_buf_mutex to avoid problems when
3031 * aborting a partially-initiated set of RPCs)
3032 */
3033 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3034 if (multasyncrpc)
3035 lck_mtx_lock(nfs_buf_mutex);
3036
3037 bp->nb_rpcs--;
3038 finished = (bp->nb_rpcs == 0);
3039
3040 if (multasyncrpc)
3041 lck_mtx_unlock(nfs_buf_mutex);
3042
3043 if (finished) {
3044 if (multasyncrpc)
3045 wakeme = &bp->nb_rpcs;
3046 nfs_buf_write_finish(bp, thd, cred);
3047 if (wakeme)
3048 wakeup(wakeme);
3049 }
3050
3051 if (IS_VALID_CRED(cred))
3052 kauth_cred_unref(&cred);
3053 }
3054
3055 /*
3056 * Send commit(s) for the given node's "needcommit" buffers
3057 */
3058 int
3059 nfs_flushcommits(nfsnode_t np, int nowait)
3060 {
3061 struct nfsmount *nmp;
3062 struct nfsbuf *bp;
3063 struct nfsbuflists blist, commitlist;
3064 int error = 0, retv, wcred_set, flags, dirty;
3065 u_quad_t off, endoff, toff;
3066 u_int32_t count;
3067 kauth_cred_t wcred = NULL;
3068
3069 FSDBG_TOP(557, np, 0, 0, 0);
3070
3071 /*
3072 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3073 * server, but has not been committed to stable storage on the server
3074 * yet. The byte range is worked out for as many nfsbufs as we can handle
3075 * and the commit rpc is done.
3076 */
3077 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3078 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
3079 if (error)
3080 goto done;
3081 np->n_flag |= NMODIFIED;
3082 nfs_unlock(np);
3083 }
3084
3085 off = (u_quad_t)-1;
3086 endoff = 0;
3087 wcred_set = 0;
3088 LIST_INIT(&commitlist);
3089
3090 nmp = NFSTONMP(np);
3091 if (!nmp) {
3092 error = ENXIO;
3093 goto done;
3094 }
3095 if (nmp->nm_vers == NFS_VER2) {
3096 error = EINVAL;
3097 goto done;
3098 }
3099
3100 flags = NBI_DIRTY;
3101 if (nowait)
3102 flags |= NBI_NOWAIT;
3103 lck_mtx_lock(nfs_buf_mutex);
3104 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3105 while ((bp = LIST_FIRST(&blist))) {
3106 LIST_REMOVE(bp, nb_vnbufs);
3107 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3108 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3109 if (error)
3110 continue;
3111 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3112 nfs_buf_check_write_verifier(np, bp);
3113 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT))
3114 != (NB_DELWRI | NB_NEEDCOMMIT))) {
3115 nfs_buf_drop(bp);
3116 continue;
3117 }
3118 nfs_buf_remfree(bp);
3119 lck_mtx_unlock(nfs_buf_mutex);
3120 /*
3121 * we need an upl to see if the page has been
3122 * dirtied (think mmap) since the unstable write, and
3123 * also to prevent vm from paging it during our commit rpc
3124 */
3125 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3126 retv = nfs_buf_upl_setup(bp);
3127 if (retv) {
3128 /* unable to create upl */
3129 /* vm object must no longer exist */
3130 /* this could be fatal if we need */
3131 /* to write the data again, we'll see... */
3132 printf("nfs_flushcommits: upl create failed %d\n", retv);
3133 bp->nb_valid = bp->nb_dirty = 0;
3134 }
3135 }
3136 nfs_buf_upl_check(bp);
3137 lck_mtx_lock(nfs_buf_mutex);
3138
3139 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3140 FSDBG(557, bp->nb_validoff, bp->nb_validend,
3141 bp->nb_dirtyoff, bp->nb_dirtyend);
3142
3143 /*
3144 * We used to check for dirty pages here; if there were any
3145 * we'd abort the commit and force the entire buffer to be
3146 * written again.
3147 *
3148 * Instead of doing that, we now go ahead and commit the dirty
3149 * range, and then leave the buffer around with dirty pages
3150 * that will be written out later.
3151 */
3152
3153 /*
3154 * Work out if all buffers are using the same cred
3155 * so we can deal with them all with one commit.
3156 *
3157 * Note: creds in bp's must be obtained by kauth_cred_ref
3158 * on the same original cred in order for them to be equal.
3159 */
3160 if (wcred_set == 0) {
3161 wcred = bp->nb_wcred;
3162 if (!IS_VALID_CRED(wcred))
3163 panic("nfs: needcommit w/out wcred");
3164 wcred_set = 1;
3165 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3166 wcred_set = -1;
3167 }
3168 SET(bp->nb_flags, NB_WRITEINPROG);
3169
3170 /*
3171 * A list of these buffers is kept so that the
3172 * second loop knows which buffers have actually
3173 * been committed. This is necessary, since there
3174 * may be a race between the commit rpc and new
3175 * uncommitted writes on the file.
3176 */
3177 LIST_REMOVE(bp, nb_vnbufs);
3178 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3179 toff = NBOFF(bp) + bp->nb_dirtyoff;
3180 if (toff < off)
3181 off = toff;
3182 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3183 if (toff > endoff)
3184 endoff = toff;
3185 }
3186 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3187 }
3188 lck_mtx_unlock(nfs_buf_mutex);
3189
3190 if (LIST_EMPTY(&commitlist)) {
3191 error = ENOBUFS;
3192 goto done;
3193 }
3194
3195 /*
3196 * Commit data on the server, as required.
3197 * If all bufs are using the same wcred, then use that with
3198 * one call for all of them, otherwise commit each one
3199 * separately.
3200 */
3201 if (wcred_set == 1) {
3202 /*
3203 * Note, it's possible the commit range could be >2^32-1.
3204 * If it is, we'll send one commit that covers the whole file.
3205 */
3206 if ((endoff - off) > 0xffffffff)
3207 count = 0;
3208 else
3209 count = (endoff - off);
3210 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred);
3211 } else {
3212 retv = 0;
3213 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3214 toff = NBOFF(bp) + bp->nb_dirtyoff;
3215 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3216 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred);
3217 if (retv)
3218 break;
3219 }
3220 }
3221
3222 /*
3223 * Now, either mark the blocks I/O done or mark the
3224 * blocks dirty, depending on whether the commit
3225 * succeeded.
3226 */
3227 while ((bp = LIST_FIRST(&commitlist))) {
3228 LIST_REMOVE(bp, nb_vnbufs);
3229 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3230 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3231 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3232 np->n_needcommitcnt--;
3233 CHECK_NEEDCOMMITCNT(np);
3234 nfs_unlock(np);
3235
3236 if (retv) {
3237 /* move back to dirty list */
3238 lck_mtx_lock(nfs_buf_mutex);
3239 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3240 lck_mtx_unlock(nfs_buf_mutex);
3241 nfs_buf_release(bp, 1);
3242 continue;
3243 }
3244
3245 vnode_startwrite(NFSTOV(np));
3246 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3247 lck_mtx_lock(nfs_buf_mutex);
3248 nfs_nbdwrite--;
3249 NFSBUFCNTCHK();
3250 lck_mtx_unlock(nfs_buf_mutex);
3251 wakeup(&nfs_nbdwrite);
3252 }
3253 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
3254 /* if block still has dirty pages, we don't want it to */
3255 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3256 if (!(dirty = bp->nb_dirty))
3257 SET(bp->nb_flags, NB_ASYNC);
3258 else
3259 CLR(bp->nb_flags, NB_ASYNC);
3260
3261 /* move to clean list */
3262 lck_mtx_lock(nfs_buf_mutex);
3263 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3264 lck_mtx_unlock(nfs_buf_mutex);
3265
3266 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3267
3268 nfs_buf_iodone(bp);
3269 if (dirty) {
3270 /* throw it back in as a delayed write buffer */
3271 CLR(bp->nb_flags, NB_DONE);
3272 nfs_buf_write_delayed(bp);
3273 }
3274 }
3275
3276 done:
3277 FSDBG_BOT(557, np, 0, 0, error);
3278 return (error);
3279 }
3280
3281 /*
3282 * Flush all the blocks associated with a vnode.
3283 * Walk through the buffer pool and push any dirty pages
3284 * associated with the vnode.
3285 */
3286 int
3287 nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3288 {
3289 struct nfsbuf *bp;
3290 struct nfsbuflists blist;
3291 struct nfsmount *nmp = NFSTONMP(np);
3292 int error = 0, error2, slptimeo = 0, slpflag = 0;
3293 int nfsvers, flags, passone = 1;
3294
3295 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3296
3297 if (!nmp) {
3298 error = ENXIO;
3299 goto out;
3300 }
3301 nfsvers = nmp->nm_vers;
3302 if (nmp->nm_flag & NFSMNT_INT)
3303 slpflag = PCATCH;
3304
3305 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3306 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3307 np->n_flag |= NMODIFIED;
3308 nfs_unlock(np);
3309 }
3310
3311 lck_mtx_lock(nfs_buf_mutex);
3312 while (np->n_bflag & NBFLUSHINPROG) {
3313 np->n_bflag |= NBFLUSHWANT;
3314 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3315 if (error) {
3316 lck_mtx_unlock(nfs_buf_mutex);
3317 goto out;
3318 }
3319 }
3320 np->n_bflag |= NBFLUSHINPROG;
3321
3322 /*
3323 * On the first pass, start async/unstable writes on all
3324 * delayed write buffers. Then wait for all writes to complete
3325 * and call nfs_flushcommits() to commit any uncommitted buffers.
3326 * On all subsequent passes, start STABLE writes on any remaining
3327 * dirty buffers. Then wait for all writes to complete.
3328 */
3329 again:
3330 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3331 if (!NFSTONMP(np)) {
3332 lck_mtx_unlock(nfs_buf_mutex);
3333 error = ENXIO;
3334 goto done;
3335 }
3336
3337 /* Start/do any write(s) that are required. */
3338 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3339 while ((bp = LIST_FIRST(&blist))) {
3340 LIST_REMOVE(bp, nb_vnbufs);
3341 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3342 flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0;
3343 if (flags != NBAC_NOWAIT)
3344 nfs_buf_refget(bp);
3345 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3346 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3347 if (error == EBUSY)
3348 break;
3349 if (error) {
3350 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3351 if (error2) {
3352 if (flags != NBAC_NOWAIT)
3353 nfs_buf_refrele(bp);
3354 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3355 lck_mtx_unlock(nfs_buf_mutex);
3356 error = error2;
3357 goto done;
3358 }
3359 if (slpflag == PCATCH) {
3360 slpflag = 0;
3361 slptimeo = 2 * hz;
3362 }
3363 }
3364 }
3365 if (flags != NBAC_NOWAIT)
3366 nfs_buf_refrele(bp);
3367 if (error == EBUSY)
3368 continue;
3369 if (!bp->nb_np) {
3370 /* buffer is no longer valid */
3371 nfs_buf_drop(bp);
3372 continue;
3373 }
3374 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3375 nfs_buf_check_write_verifier(np, bp);
3376 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3377 /* buffer is no longer dirty */
3378 nfs_buf_drop(bp);
3379 continue;
3380 }
3381 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3382 if ((passone || (waitfor != MNT_WAIT)) &&
3383 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3384 nfs_buf_drop(bp);
3385 continue;
3386 }
3387 nfs_buf_remfree(bp);
3388 lck_mtx_unlock(nfs_buf_mutex);
3389 if (ISSET(bp->nb_flags, NB_ERROR)) {
3390 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3391 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3392 np->n_flag |= NWRITEERR;
3393 nfs_unlock(np);
3394 nfs_buf_release(bp, 1);
3395 lck_mtx_lock(nfs_buf_mutex);
3396 continue;
3397 }
3398 SET(bp->nb_flags, NB_ASYNC);
3399 if (!passone) {
3400 /* NB_STABLE forces this to be written FILESYNC */
3401 SET(bp->nb_flags, NB_STABLE);
3402 }
3403 nfs_buf_write(bp);
3404 lck_mtx_lock(nfs_buf_mutex);
3405 }
3406 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3407 }
3408 lck_mtx_unlock(nfs_buf_mutex);
3409
3410 if (waitfor == MNT_WAIT) {
3411 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3412 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3413 if (error2) {
3414 error = error2;
3415 goto done;
3416 }
3417 if (slpflag == PCATCH) {
3418 slpflag = 0;
3419 slptimeo = 2 * hz;
3420 }
3421 }
3422 }
3423
3424 if (nfsvers != NFS_VER2) {
3425 /* loop while it looks like there are still buffers to be */
3426 /* committed and nfs_flushcommits() seems to be handling them. */
3427 while (np->n_needcommitcnt)
3428 if (nfs_flushcommits(np, 0))
3429 break;
3430 }
3431
3432 if (passone) {
3433 passone = 0;
3434 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3435 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3436 np->n_flag |= NMODIFIED;
3437 nfs_unlock(np);
3438 }
3439 lck_mtx_lock(nfs_buf_mutex);
3440 goto again;
3441 }
3442
3443 if (waitfor == MNT_WAIT) {
3444 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3445 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3446 np->n_flag |= NMODIFIED;
3447 nfs_unlock(np);
3448 }
3449 lck_mtx_lock(nfs_buf_mutex);
3450 if (!LIST_EMPTY(&np->n_dirtyblkhd))
3451 goto again;
3452 lck_mtx_unlock(nfs_buf_mutex);
3453 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3454 /* if we have no dirty blocks, we can clear the modified flag */
3455 if (!np->n_wrbusy)
3456 np->n_flag &= ~NMODIFIED;
3457 } else {
3458 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3459 }
3460
3461 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3462 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3463 error = np->n_error;
3464 np->n_flag &= ~NWRITEERR;
3465 }
3466 nfs_unlock(np);
3467 done:
3468 lck_mtx_lock(nfs_buf_mutex);
3469 flags = np->n_bflag;
3470 np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT);
3471 lck_mtx_unlock(nfs_buf_mutex);
3472 if (flags & NBFLUSHWANT)
3473 wakeup(&np->n_bflag);
3474 out:
3475 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
3476 return (error);
3477 }
3478
3479 /*
3480 * Flush out and invalidate all buffers associated with a vnode.
3481 * Called with the underlying object locked.
3482 */
3483 static int
3484 nfs_vinvalbuf_internal(
3485 nfsnode_t np,
3486 int flags,
3487 thread_t thd,
3488 kauth_cred_t cred,
3489 int slpflag,
3490 int slptimeo)
3491 {
3492 struct nfsbuf *bp;
3493 struct nfsbuflists blist;
3494 int list, error = 0;
3495
3496 if (flags & V_SAVE) {
3497 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR))))
3498 return (error);
3499 }
3500
3501 lck_mtx_lock(nfs_buf_mutex);
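/* process the clean list first, then the dirty list; loop until both are empty */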
3502 for (;;) {
3503 list = NBI_CLEAN;
3504 if (nfs_buf_iterprepare(np, &blist, list)) {
3505 list = NBI_DIRTY;
3506 if (nfs_buf_iterprepare(np, &blist, list))
3507 break;
3508 }
3509 while ((bp = LIST_FIRST(&blist))) {
3510 LIST_REMOVE(bp, nb_vnbufs);
3511 if (list == NBI_CLEAN)
3512 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3513 else
3514 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3515 nfs_buf_refget(bp);
3516 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
3517 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
3518 if (error != EAGAIN) {
3519 FSDBG(554, np, bp, -1, error);
3520 nfs_buf_refrele(bp);
3521 nfs_buf_itercomplete(np, &blist, list);
3522 lck_mtx_unlock(nfs_buf_mutex);
3523 return (error);
3524 }
3525 }
3526 nfs_buf_refrele(bp);
3527 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
3528 lck_mtx_unlock(nfs_buf_mutex);
3529 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
3530 (NBOFF(bp) < (off_t)np->n_size)) {
3531 /* extra paranoia: make sure we're not */
3532 /* somehow leaving any dirty data around */
3533 int mustwrite = 0;
3534 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3535 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
3536 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3537 error = nfs_buf_upl_setup(bp);
3538 if (error == EINVAL) {
3539 /* vm object must no longer exist */
3540 /* hopefully we don't need to do */
3541 /* anything for this buffer */
3542 } else if (error)
3543 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
3544 bp->nb_valid = bp->nb_dirty = 0;
3545 }
3546 nfs_buf_upl_check(bp);
3547 /* check for any dirty data before the EOF */
3548 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
3549 /* clip dirty range to EOF */
3550 if (bp->nb_dirtyend > end) {
3551 bp->nb_dirtyend = end;
3552 if (bp->nb_dirtyoff >= bp->nb_dirtyend)
3553 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3554 }
3555 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end))
3556 mustwrite++;
3557 }
3558 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
3559 if (bp->nb_dirty)
3560 mustwrite++;
3561 /* also make sure we'll have a credential to do the write */
3562 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
3563 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3564 mustwrite = 0;
3565 }
3566 if (mustwrite) {
3567 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
3568 if (!ISSET(bp->nb_flags, NB_PAGELIST))
3569 panic("nfs_vinvalbuf: dirty buffer without upl");
3570 /* gotta write out dirty data before invalidating */
3571 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3572 /* (NB_NOCACHE indicates buffer should be discarded) */
3573 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3574 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
3575 if (!IS_VALID_CRED(bp->nb_wcred)) {
3576 kauth_cred_ref(cred);
3577 bp->nb_wcred = cred;
3578 }
3579 error = nfs_buf_write(bp);
3580 // Note: bp has been released
3581 if (error) {
3582 FSDBG(554, bp, 0xd00dee, 0xbad, error);
3583 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3584 np->n_error = error;
3585 np->n_flag |= NWRITEERR;
3586 nfs_unlock(np);
3587 /*
3588 * There was a write error and we need to
3589 * invalidate attrs to sync with server.
3590 * (if this write was extending the file,
3591 * we may no longer know the correct size)
3592 */
3593 NATTRINVALIDATE(np);
3594 error = 0;
3595 }
3596 lck_mtx_lock(nfs_buf_mutex);
3597 continue;
3598 }
3599 }
3600 SET(bp->nb_flags, NB_INVAL);
3601 // hold off on FREEUPs until we're done here
3602 nfs_buf_release(bp, 0);
3603 lck_mtx_lock(nfs_buf_mutex);
3604 }
3605 nfs_buf_itercomplete(np, &blist, list);
3606 }
3607 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
3608 panic("nfs_vinvalbuf: flush/inval failed");
3609 lck_mtx_unlock(nfs_buf_mutex);
3610 if (!(flags & V_SAVE)) {
3611 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3612 np->n_flag &= ~NMODIFIED;
3613 nfs_unlock(np);
3614 }
3615 NFS_BUF_FREEUP();
3616 return (0);
3617 }
3618
3619
3620 /*
3621 * Flush and invalidate all dirty buffers. If another process is already
3622 * doing the flush, just wait for completion.
3623 */
3624 int
3625 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3626 {
3627 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3628 }
3629
3630 int
3631 nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
3632 {
3633 nfsnode_t np = VTONFS(vp);
3634 struct nfsmount *nmp = VTONMP(vp);
3635 int error, rv, slpflag, slptimeo, nflags;
3636 off_t size;
3637
3638 FSDBG_TOP(554, np, flags, intrflg, 0);
3639
3640 if (nmp && !(nmp->nm_flag & NFSMNT_INT))
3641 intrflg = 0;
3642 if (intrflg) {
3643 slpflag = PCATCH;
3644 slptimeo = 2 * hz;
3645 } else {
3646 slpflag = 0;
3647 slptimeo = 0;
3648 }
3649
3650 /* First wait for any other process doing a flush to complete. */
3651 lck_mtx_lock(nfs_buf_mutex);
3652 while (np->n_bflag & NBINVALINPROG) {
3653 np->n_bflag |= NBINVALWANT;
3654 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
3655 if (error) {
3656 lck_mtx_unlock(nfs_buf_mutex);
3657 return (error);
3658 }
3659 }
3660 np->n_bflag |= NBINVALINPROG;
3661 lck_mtx_unlock(nfs_buf_mutex);
3662
3663 /* Now, flush as required. */
3664 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3665 while (error) {
3666 FSDBG(554, np, 0, 0, error);
3667 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
3668 goto done;
3669 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
3670 }
3671
3672 /* get the pages out of vm also */
3673 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
3674 if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
3675 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
3676 done:
3677 lck_mtx_lock(nfs_buf_mutex);
3678 nflags = np->n_bflag;
3679 np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
3680 lck_mtx_unlock(nfs_buf_mutex);
3681 if (nflags & NBINVALWANT)
3682 wakeup(&np->n_bflag);
3683
3684 FSDBG_BOT(554, np, flags, intrflg, error);
3685 return (error);
3686 }
3687
3688 /*
3689 * Add an async I/O request to the mount's async I/O queue and make
3690 * sure that an nfsiod will service it.
3691 */
3692 void
3693 nfs_asyncio_finish(struct nfsreq *req)
3694 {
3695 struct nfsmount *nmp;
3696 struct nfsiod *niod;
3697 int started = 0;
3698
3699 FSDBG_TOP(552, req->r_nmp, 0, 0, 0);
3700 again:
3701 if (((nmp = req->r_nmp)) == NULL)
3702 return;
3703 lck_mtx_lock(nfsiod_mutex);
3704 niod = nmp->nm_niod;
3705
3706 /* grab an nfsiod if we don't have one already */
3707 if (!niod) {
3708 niod = TAILQ_FIRST(&nfsiodfree);
3709 if (niod) {
3710 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
3711 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
3712 niod->niod_nmp = nmp;
3713 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
3714 /*
3715 * Try starting a new thread.
3716 * We may try a couple times if other callers
3717 * get the new threads before we do.
3718 */
3719 lck_mtx_unlock(nfsiod_mutex);
3720 started++;
3721 if (!nfsiod_start())
3722 goto again;
3723 lck_mtx_lock(nfsiod_mutex);
3724 }
3725 }
3726
3727 if (req->r_achain.tqe_next == NFSREQNOLIST)
3728 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
3729
3730 /* If this mount doesn't already have an nfsiod working on it... */
3731 if (!nmp->nm_niod) {
3732 if (niod) { /* give it the nfsiod we just grabbed */
3733 nmp->nm_niod = niod;
3734 lck_mtx_unlock(nfsiod_mutex);
3735 wakeup(niod);
3736 } else if (nfsiod_thread_count > 0) {
3737 /* just queue it up on nfsiod mounts queue */
3738 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
3739 lck_mtx_unlock(nfsiod_mutex);
3740 } else {
3741 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
3742 lck_mtx_unlock(nfsiod_mutex);
3743 /* we have no other option but to be persistent */
3744 started = 0;
3745 goto again;
3746 }
3747 } else {
3748 lck_mtx_unlock(nfsiod_mutex);
3749 }
3750
3751 FSDBG_BOT(552, nmp, 0, 0, 0);
3752 }
3753
3754 /*
3755 * queue up async I/O request for resend
3756 */
3757 void
3758 nfs_asyncio_resend(struct nfsreq *req)
3759 {
3760 struct nfsmount *nmp = req->r_nmp;
3761
3762 if (!nmp)
3763 return;
3764 nfs_gss_clnt_rpcdone(req);
3765 lck_mtx_lock(&nmp->nm_lock);
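/* only queue the request for resend if it isn't already on the resend queue */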
3766 if (req->r_rchain.tqe_next == NFSREQNOLIST) {
3767 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
3768 req->r_flags |= R_RESENDQ;
3769 }
3770 nfs_mount_sock_thread_wake(nmp);
3771 lck_mtx_unlock(&nmp->nm_lock);
3772 }
3773
3774 /*
3775 * Read an NFS buffer for a directory.
3776 */
3777 int
3778 nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
3779 {
3780 nfsnode_t np;
3781 vnode_t vp;
3782 struct nfsmount *nmp;
3783 int error = 0, nfsvers;
3784 struct uio uio;
3785 struct iovec_32 io;
3786
3787 np = bp->nb_np;
3788 vp = NFSTOV(np);
3789 nmp = VTONMP(vp);
3790 nfsvers = nmp->nm_vers;
3791 uio.uio_iovs.iov32p = &io;
3792 uio.uio_iovcnt = 1;
3793 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
3794 uio.uio_segflg = UIO_SYSSPACE;
3795 #else
3796 uio.uio_segflg = UIO_SYSSPACE32;
3797 #endif
3798
3799 /* sanity check */
3800 if (ISSET(bp->nb_flags, NB_DONE))
3801 CLR(bp->nb_flags, NB_DONE);
3802
3803 uio.uio_rw = UIO_READ;
3804 io.iov_len = bp->nb_bufsize;
3805 uio_uio_resid_set(&uio, io.iov_len);
3806 io.iov_base = (uintptr_t) bp->nb_data;
3807 uio.uio_offset = NBOFF(bp);
3808
3809 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
3810 if (nfsvers < NFS_VER4) {
3811 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
3812 error = nfs3_readdirplus_rpc(np, &uio, ctx);
3813 if (error == NFSERR_NOTSUPP) {
3814 lck_mtx_lock(&nmp->nm_lock);
3815 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
3816 lck_mtx_unlock(&nmp->nm_lock);
3817 }
3818 }
3819 if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
3820 error = nfs3_readdir_rpc(np, &uio, ctx);
3821 } else {
3822 error = nfs4_readdir_rpc(np, &uio, ctx);
3823 }
3824 if (error) {
3825 SET(bp->nb_flags, NB_ERROR);
3826 bp->nb_error = error;
3827 } else {
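/* record how much directory data was read and mark those pages valid */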
3828 bp->nb_validoff = 0;
3829 bp->nb_validend = uio.uio_offset - NBOFF(bp);
3830 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
3831 }
3832
3833 nfs_buf_iodone(bp);
3834 return (error);
3835 }