/*
 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 *	FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_gss.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>
#include <libkern/OSAtomic.h>

kern_return_t thread_terminate(thread_t); /* XXX */

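/*
 * Global nfsbuf bookkeeping: buffers are hashed by (nfsnode, logical block
 * number) via NFSBUFHASH, and idle buffers sit on the LRU (nfsbuffree),
 * metadata (nfsbuffreemeta), or delayed-write (nfsbufdelwri) free lists.
 * The counters below track totals and limits for each class.
 */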
#define NFSBUFHASH(np, lbn)	\
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
int nfs_buf_timer_on = 0;
thread_t nfsbufdelwrithd = NULL;

lck_grp_t *nfs_buf_lck_grp;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUF_FREE_PERIOD	30	/* seconds */
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP			6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP			3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP			(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC		4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC		2

#define NFS_BUF_FREEUP() \
	do { \
		/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)

/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);

	nfsbufcnt = nfsbufmetacnt =
	nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmin = 128;
	/* size nfsbufmax to cover at most half sane_size (w/default buf size) */
	nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
	nfsbufmetamax = nfsbufmax / 4;
	nfsneedbuffer = 0;
	nfs_nbdwrite = 0;

	nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);

}

/*
 * Check periodically for stale/unused nfs bufs
 */
void
nfs_buf_timer(__unused void *param0, __unused void *param1)
{
	nfs_buf_freeup(1);

	lck_mtx_lock(nfs_buf_mutex);
	if (nfsbufcnt <= nfsbufmin) {
		nfs_buf_timer_on = 0;
		lck_mtx_unlock(nfs_buf_mutex);
		return;
	}
	lck_mtx_unlock(nfs_buf_mutex);

	nfs_interval_timer_start(nfs_buf_timer_call,
		NFSBUF_FREE_PERIOD * 1000);
}

/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp)
			break;
		if (fbp->nb_refs)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp)
			break;
		if (fbp->nb_refs)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any nfsnode */
		if (fbp->nb_np) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_np = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
	NFSBUFCNTCHK();

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (IS_VALID_CRED(fbp->nb_rcred))
			kauth_cred_unref(&fbp->nb_rcred);
		if (IS_VALID_CRED(fbp->nb_wcred))
			kauth_cred_unref(&fbp->nb_wcred);
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
			kfree(fbp->nb_data, fbp->nb_bufsize);
		FREE(fbp, M_TEMP);
	}

}

/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST)
		panic("nfsbuf not on free list");
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
	NFSBUFCNTCHK();
}

/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(np, blkno))
		rv = TRUE;
	else
		rv = FALSE;
	lck_mtx_unlock(nfs_buf_mutex);
	return (rv);
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next)
		if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
				return (bp);
			}
		}
	return (NULL);
}

/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int error = 0;

	if (!nmp)
		return (ENXIO);

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
	if (!bp)
		goto out;
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if (bp->nb_dirtyend <= start ||
		    bp->nb_dirtyoff >= (start + PAGE_SIZE))
			error = 0;
		else
			error = EBUSY;
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return (error);
}

/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);

	upl_flags = UPL_PRECIOUS;
	if (!ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
				&upl, NULL, upl_flags);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return (EINVAL);
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return (EIO);
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return (0);
}

/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(NFSTOV(bp->nb_np));
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize)
		SET(bp->nb_flags, NB_CACHE);
	else
		CLR(bp->nb_flags, NB_CACHE);

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

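	/*
	 * Walk the UPL's pages, recording each valid page in nb_valid and
	 * each dirty page in nb_dirty; any hole clears NB_CACHE since the
	 * buffer is no longer fully cached.
	 */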
	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize)
			break;
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp,i);
		if (upl_dirty_page(pl, i))
			NBPGDIRTY_SET(bp, i);
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize)
			bp->nb_validend = filesize - fileoffset;
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}

/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data)
		return (0);
	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (EINVAL);

	kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS)
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	if (bp->nb_data == 0)
		panic("ubc_upl_map mapped 0");
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return (0);
}

/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
void
nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
{
	int pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff/PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp,pg))
		pg--;
	bp->nb_validoff = (pg+1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize/PAGE_SIZE;
	pg = bp->nb_validend/PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp,pg))
		pg++;
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip to EOF */
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
		bp->nb_validend = np->n_size % bp->nb_bufsize;
}

/*
 * process some entries on the delayed write queue
 * (must be called with nfs_buf_mutex held)
 */
static void
nfs_buf_delwri_service(void)
{
	struct nfsbuf *bp;
	nfsnode_t np;
	int error, i = 0;

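	/*
	 * Service up to 8 delayed-write buffers per call: buffers that only
	 * need a commit are requeued and flushed via nfs_flushcommits(),
	 * everything else is pushed out as an async write.
	 */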
	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		np = bp->nb_np;
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
		nfs_buf_refrele(bp);
		if (error)
			break;
		if (!bp->nb_np) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
			nfs_buf_check_write_verifier(np, bp);
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
}

/*
 * thread to service the delayed write queue when asked
 */
static void
nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
{
	struct timespec ts = { 30, 0 };
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	while (!error) {
		nfs_buf_delwri_service();
		error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
	}
	nfsbufdelwrithd = NULL;
	lck_mtx_unlock(nfs_buf_mutex);
	thread_terminate(nfsbufdelwrithd);
}

/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
static void
nfs_buf_delwri_push(int locked)
{
	if (TAILQ_EMPTY(&nfsbufdelwri))
		return;
	if (!locked)
		lck_mtx_lock(nfs_buf_mutex);
	/* wake up the delayed write service thread */
	if (nfsbufdelwrithd)
		wakeup(&nfsbufdelwrithd);
	else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS)
		thread_deallocate(nfsbufdelwrithd);
	/* otherwise, try to do some of the work ourselves */
	if (!nfsbufdelwrithd)
		nfs_buf_delwri_service();
	if (!locked)
		lck_mtx_unlock(nfs_buf_mutex);
}

/*
 * Get an nfs buffer.
 *
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
int
nfs_buf_get(
	nfsnode_t np,
	daddr64_t blkno,
	int size,
	thread_t thd,
	int flags,
	struct nfsbuf **bpp)
{
	vnode_t vp = NFSTOV(np);
	struct nfsmount *nmp = VTONMP(vp);
	struct nfsbuf *bp;
	int bufsize;
	int slpflag = PCATCH;
	int operation = (flags & NBLK_OPMASK);
	int error = 0;
	struct timespec ts;

	FSDBG_TOP(541, np, blkno, size, flags);
	*bpp = NULL;

	bufsize = size;
	if (bufsize > NFS_MAXBSIZE)
		panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");

	if (!nmp) {
		FSDBG_BOT(541, np, blkno, 0, ENXIO);
		return (ENXIO);
	}

	if (!UBCINFOEXISTS(vp)) {
		operation = NBLK_META;
	} else if (bufsize < nmp->nm_biosize) {
		/* reg files should always have biosize blocks */
		bufsize = nmp->nm_biosize;
	}

	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
		FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);

		/* poke the delwri list */
		nfs_buf_delwri_push(0);

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
	}

loop:
	lck_mtx_lock(nfs_buf_mutex);

	/* check for existence of nfsbuf in cache */
	if ((bp = nfs_buf_incore(np, blkno))) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
			if (flags & NBLK_NOWAIT) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
				return (0);
			}
			FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
			SET(bp->nb_lflags, NBL_WANTED);

			ts.tv_sec = 2;
			ts.tv_nsec = 0;
			msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
				"nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
			slpflag = 0;
			FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize)
			panic("nfsbuf size mismatch");
		SET(bp->nb_lflags, NBL_BUSY);
		SET(bp->nb_flags, NB_CACHE);
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST))
			panic("pagelist buffer was not busy");
		goto buffer_setup;
	}

	if (flags & NBLK_ONLYVALID) {
		lck_mtx_unlock(nfs_buf_mutex);
		FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
		return (0);
	}

	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */

	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
		/* if we've hit max meta buffers, must reuse a meta buffer */
		bp = TAILQ_FIRST(&nfsbuffreemeta);
	} else if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;
		struct timeval now;
		microuptime(&now);

		/* if the next LRU or META buffer is invalid or stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
			bp = lrubp;
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
			bp = metabp;

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!metabp)
				bp = lrubp;
			else if (!lrubp)
				bp = metabp;
			else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time)
					bp = lrubp;
				else
					bp = metabp;
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, np, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI))
			panic("nfs_buf_get: delwri");
		SET(bp->nb_lflags, NBL_BUSY);
		/* disassociate buffer from previous nfsnode */
		if (bp->nb_np) {
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
			}
			bp->nb_np = NULL;
		}
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		if (IS_VALID_CRED(bp->nb_rcred))
			kauth_cred_unref(&bp->nb_rcred);
		if (IS_VALID_CRED(bp->nb_wcred))
			kauth_cred_unref(&bp->nb_wcred);
		/* if buf will no longer be NB_META, dump old buffer */
		if (operation == NBLK_META) {
			if (!ISSET(bp->nb_flags, NB_META))
				nfsbufmetacnt++;
		} else if (ISSET(bp->nb_flags, NB_META)) {
			if (bp->nb_data) {
				kfree(bp->nb_data, bp->nb_bufsize);
				bp->nb_data = NULL;
			}
			nfsbufmetacnt--;
		}
		/* re-init buf fields */
		bp->nb_error = 0;
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		bp->nb_valid = 0;
		bp->nb_dirty = 0;
		bp->nb_verf = 0;
	} else {
		/* no buffer to reuse */
		if ((nfsbufcnt < nfsbufmax) &&
		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
			/* just alloc a new one */
			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
			if (!bp) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0, error);
				return (ENOMEM);
			}
			nfsbufcnt++;

			/*
			 * If any excess bufs, make sure the timer
			 * is running to free them up later.
			 */
			if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
				nfs_buf_timer_on = 1;
				nfs_interval_timer_start(nfs_buf_timer_call,
					NFSBUF_FREE_PERIOD * 1000);
			}

			if (operation == NBLK_META)
				nfsbufmetacnt++;
			NFSBUFCNTCHK();
			/* init nfsbuf */
			bzero(bp, sizeof(*bp));
			bp->nb_free.tqe_next = NFSNOLIST;
			bp->nb_validoff = bp->nb_validend = -1;
			FSDBG(545, np, blkno, bp, 0);
		} else {
			/* too many bufs... wait for buffers to free up */
			FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);

			/* poke the delwri list */
			nfs_buf_delwri_push(1);

			nfsneedbuffer = 1;
			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP, "nfsbufget", NULL);
			FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
			if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
				FSDBG_BOT(541, np, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
	}

	/* setup nfsbuf */
	bp->nb_lflags = NBL_BUSY;
	bp->nb_flags = 0;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
	/* associate buffer with new nfsnode */
	bp->nb_np = np;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	/* unlock hash */
	lck_mtx_unlock(nfs_buf_mutex);

	switch (operation) {
	case NBLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			bp->nb_valid = 0;
			bp->nb_dirty = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data)
			bp->nb_data = kalloc(bufsize);
		if (!bp->nb_data) {
			/* Ack! couldn't allocate the data buffer! */
			/* clean up buffer and return error */
			lck_mtx_lock(nfs_buf_mutex);
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
			bp->nb_np = NULL;
			/* invalidate usage timestamp to allow immediate freeing */
			NBUFSTAMPINVALIDATE(bp);
			if (bp->nb_free.tqe_next != NFSNOLIST)
				panic("nfsbuf on freelist");
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
			return (ENOMEM);
		}
		bp->nb_bufsize = bufsize;
		break;

	case NBLK_READ:
	case NBLK_WRITE:
		/*
		 * Set or clear NB_READ now to let the UPL subsystem know
		 * if we intend to modify the pages or not.
		 */
		if (operation == NBLK_READ) {
			SET(bp->nb_flags, NB_READ);
		} else {
			CLR(bp->nb_flags, NB_READ);
		}
		if (bufsize < PAGE_SIZE)
			bufsize = PAGE_SIZE;
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCINFOEXISTS(vp)) {
			/* set up upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* clean up buffer and return error */
				lck_mtx_lock(nfs_buf_mutex);
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_np = NULL;
				/* invalidate usage timestamp to allow immediate freeing */
				NBUFSTAMPINVALIDATE(bp);
				if (bp->nb_free.tqe_next != NFSNOLIST)
					panic("nfsbuf on freelist");
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
				return (EIO);
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	*bpp = bp;

	FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);

	return (0);
}

void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	nfsnode_t np = bp->nb_np;
	vnode_t vp;
	struct timeval now;
	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	vp = np ? NFSTOV(np) : NULL;
	if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		int upl_flags;
		upl_t upl;
		int i, rv;

		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv)
				printf("nfs_buf_release: upl create failed %d\n", rv);
			else
				nfs_buf_upl_check(bp);
		}
		upl = bp->nb_pagelist;
		if (!upl)
			goto pagelist_cleanup_done;
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS)
				panic("ubc_upl_unmap failed");
			bp->nb_data = NULL;
		}
		/*
		 * Abort the pages on error or: if this is an invalid or
		 * non-needcommit nocache buffer AND no pages are dirty.
		 */
		if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
		    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
			if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE)))
				upl_flags = UPL_ABORT_DUMP_PAGES;
			else
				upl_flags = 0;
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
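		/*
		 * Otherwise commit the pages back to the VM system one at a
		 * time: invalid pages are dumped, valid pages are committed
		 * with their dirty state preserved and marked inactive.
		 */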
		for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
			if (!NBPGVALID(bp,i))
				ubc_upl_abort_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					UPL_ABORT_DUMP_PAGES |
					UPL_ABORT_FREE_ON_EMPTY);
			else {
				if (NBPGDIRTY(bp,i))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				ubc_upl_commit_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					upl_flags |
					UPL_COMMIT_INACTIVATE |
					UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* was this the last buffer in the file? */
		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
			/* if so, invalidate all pages of last buffer past EOF */
			off_t start, end;
			start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
			end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
			if (end > start) {
				if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
					printf("nfs_buf_release(): ubc_sync_range failed!\n");
			}
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	lck_mtx_lock(nfs_buf_mutex);

	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup_needbuffer = 1;
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		CLR(bp->nb_lflags, NBL_WANTED);
		wakeup_buffer = 1;
	}

	/* If it's non-needcommit nocache, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, NB_ERROR) ||
	    (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
		SET(bp->nb_flags, NB_INVAL);

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its nfsnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_np = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			nfs_nbdwrite--;
			NFSBUFCNTCHK();
			wakeup_nbdwrite = 1;
		}
		/* invalidate usage timestamp to allow immediate freeing */
		NBUFSTAMPINVALIDATE(bp);
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		SET(bp->nb_flags, NB_INVAL);
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
		freeup = 0;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	}

	NFSBUFCNTCHK();

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
	CLR(bp->nb_lflags, NBL_BUSY);

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);

	lck_mtx_unlock(nfs_buf_mutex);

	if (wakeup_needbuffer)
		wakeup(&nfsneedbuffer);
	if (wakeup_buffer)
		wakeup(bp);
	if (wakeup_nbdwrite)
		wakeup(&nfs_nbdwrite);
	if (freeup)
		NFS_BUF_FREEUP();
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE))
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return (EINTR);
	} else if (ISSET(bp->nb_flags, NB_ERROR))
		return (bp->nb_error ? bp->nb_error : EIO);
	return (0);
}

/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{

	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE))
		panic("nfs_buf_iodone already");

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(NFSTOV(bp->nb_np));
	}
	if (ISSET(bp->nb_flags, NB_ASYNC)) {	/* if async, release it */
		SET(bp->nb_flags, NB_DONE);	/* note that it's done */
		nfs_buf_release(bp, 1);
	} else {				/* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);	/* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}

void
nfs_buf_write_delayed(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Make sure it's on its node's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		nfs_nbdwrite++;
		NFSBUFCNTCHK();
		if (bp->nb_vnbufs.le_next != NFSNOLIST)
			LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/* the file is in a modified state, so make sure the flag's set */
	nfs_lock(np, NFS_NODE_LOCK_FORCE);
	np->n_flag |= NMODIFIED;
	nfs_unlock(np);

	/*
	 * If we have too many delayed write buffers,
	 * just fall back to doing the async write.
	 */
	if (nfs_nbdwrite < 0)
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
	if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}

/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
		return;

	nmp = NFSTONMP(np);
	if (!nmp)
		return;
	if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf))
		return;

	/* write verifier changed, clear commit/wverf flags */
	CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
	bp->nb_verf = 0;
	nfs_lock(np, NFS_NODE_LOCK_FORCE);
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
	nfs_unlock(np);
}

/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	bp->nb_refs++;
}
/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	bp->nb_refs--;
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the mutex_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT)
			return (EBUSY);
		SET(bp->nb_lflags, NBL_WANTED);

		/* slptimeo is in hz ticks; with hz at 100, each tick is 10ms */
		ts.tv_sec = (slptimeo/100);
		ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
			"nfs_buf_acquire", &ts);
		if (error)
			return (error);
		return (EAGAIN);
	}
	if (flags & NBAC_REMOVE)
		nfs_buf_remfree(bp);
	SET(bp->nb_lflags, NBL_BUSY);

	return (0);
}

/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY))
		panic("nfs_buf_drop: buffer not busy!");
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/* delay the actual wakeup until after we clear NBL_BUSY */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup)
		wakeup(bp);
}

/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return(EWOULDBLOCK);
	}

	while (np->n_bufiterflags & NBI_ITER) {
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return(EINVAL);
	}
	np->n_bufiterflags |= NBI_ITER;

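	/*
	 * Hand the entire buffer list over to the caller's iterator head and
	 * leave the node's own list empty; buffers added while the iteration
	 * is in progress go onto the (now empty) node list instead.
	 */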
	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return(0);
}

/*
 * clean up after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}

/*
 * Read an NFS buffer for a file.
 */
int
nfs_buf_read(struct nfsbuf *bp)
{
	int error = 0;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;

	np = bp->nb_np;
	cred = bp->nb_rcred;
	if (IS_VALID_CRED(cred))
		kauth_cred_ref(cred);
	thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();

	/* sanity checks */
	if (!ISSET(bp->nb_flags, NB_READ))
		panic("nfs_buf_read: !NB_READ");
	if (ISSET(bp->nb_flags, NB_DONE))
		CLR(bp->nb_flags, NB_DONE);

	NFS_BUF_MAP(bp);

	OSAddAtomic(1, (SInt32 *)&nfsstats.read_bios);

	error = nfs_buf_read_rpc(bp, thd, cred);
	/*
	 * For async I/O, the callbacks will finish up the
	 * read.  Otherwise, the read has already been finished.
	 */

	if (IS_VALID_CRED(cred))
		kauth_cred_unref(&cred);
	return (error);
}

/*
 * finish the reading of a buffer
 */
void
nfs_buf_read_finish(struct nfsbuf *bp)
{
	nfsnode_t np = bp->nb_np;
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_ERROR)) {
		/* update valid range */
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_endio;
		if (bp->nb_endio < bp->nb_bufsize) {
			/*
			 * The read may be short because we have unflushed writes
			 * that are extending the file size and the reads hit the
			 * (old) EOF on the server.  So, just make sure nb_validend
			 * correctly tracks EOF.
			 * Note that the missing data should have already been zeroed
			 * in nfs_buf_read_rpc_finish().
			 */
			off_t boff = NBOFF(bp);
			if ((off_t)np->n_size >= (boff + bp->nb_bufsize))
				bp->nb_validend = bp->nb_bufsize;
			else if ((off_t)np->n_size >= boff)
				bp->nb_validend = np->n_size - boff;
			else
				bp->nb_validend = 0;
		}
		if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
		    ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL))
			bp->nb_validend = 0x100000000LL - NBOFF(bp);
		bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
		if (bp->nb_validend & PAGE_MASK) {
			/* zero-fill remainder of last page */
			bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend);
		}
	}
	nfs_buf_iodone(bp);
}

/*
 * initiate the NFS READ RPC(s) for a buffer
 */
int
nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
{
	struct nfsmount *nmp;
	nfsnode_t np = bp->nb_np;
	int error = 0, nfsvers, async;
	int offset, length, nmrsize, nrpcs, len;
	off_t boff;
	struct nfsreq *req;
	struct nfsreq_cbinfo cb;

	nmp = NFSTONMP(np);
	if (!nmp) {
		bp->nb_error = error = ENXIO;
		SET(bp->nb_flags, NB_ERROR);
		nfs_buf_iodone(bp);
		return (error);
	}
	nfsvers = nmp->nm_vers;
	nmrsize = nmp->nm_rsize;

	boff = NBOFF(bp);
	offset = 0;
	length = bp->nb_bufsize;

	if (nfsvers == NFS_VER2) {
		if (boff > 0xffffffffLL) {
			bp->nb_error = error = EFBIG;
			SET(bp->nb_flags, NB_ERROR);
			nfs_buf_iodone(bp);
			return (error);
		}
		if ((boff + length - 1) > 0xffffffffLL)
			length = 0x100000000LL - boff;
	}

	/* Note: Can only do async I/O if nfsiods are configured. */
	async = (bp->nb_flags & NB_ASYNC);
	cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
	cb.rcb_bp = bp;

	bp->nb_offio = bp->nb_endio = 0;
	bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
	if (async && (nrpcs > 1)) {
		SET(bp->nb_flags, NB_MULTASYNCRPC);
	} else {
		CLR(bp->nb_flags, NB_MULTASYNCRPC);
	}

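	/*
	 * Issue the read as a series of RPCs of at most nm_rsize bytes each.
	 * For async buffers the RPCs are fired off back to back and the
	 * callback finishes them; otherwise each RPC is completed in line.
	 */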
	while (length > 0) {
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
		len = (length > nmrsize) ? nmrsize : length;
		cb.rcb_args[0] = offset;
		cb.rcb_args[1] = len;
		req = NULL;
		error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
		if (error)
			break;
		offset += len;
		length -= len;
		if (async)
			continue;
		nfs_buf_read_rpc_finish(req);
		if (ISSET(bp->nb_flags, NB_ERROR)) {
			error = bp->nb_error;
			break;
		}
	}

	if (length > 0) {
		/*
		 * Something bad happened while trying to send the RPC(s).
		 * Wait for any outstanding requests to complete.
		 */
		bp->nb_error = error;
		SET(bp->nb_flags, NB_ERROR);
		if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
			nrpcs = (length + nmrsize - 1) / nmrsize;
			lck_mtx_lock(nfs_buf_mutex);
			bp->nb_rpcs -= nrpcs;
			if (bp->nb_rpcs == 0) {
				/* No RPCs left, so the buffer's done */
				lck_mtx_unlock(nfs_buf_mutex);
				nfs_buf_iodone(bp);
			} else {
				/* wait for the last RPC to mark it done */
				while (bp->nb_rpcs > 0)
					msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
						"nfs_buf_read_rpc_cancel", NULL);
				lck_mtx_unlock(nfs_buf_mutex);
			}
		} else {
			nfs_buf_iodone(bp);
		}
	}

	return (error);
}

/*
 * finish up an NFS READ RPC on a buffer
 */
void
nfs_buf_read_rpc_finish(struct nfsreq *req)
{
	struct nfsmount *nmp;
	size_t rlen;
	struct nfsreq_cbinfo cb;
	struct nfsbuf *bp;
	int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
	void *wakeme = NULL;
	struct nfsreq *rreq = NULL;
	nfsnode_t np;
	thread_t thd;
	kauth_cred_t cred;
	struct uio uio;
	struct iovec_32 io;

finish:
	np = req->r_np;
	thd = req->r_thread;
	cred = req->r_cred;
	if (IS_VALID_CRED(cred))
		kauth_cred_ref(cred);
	cb = req->r_callback;
	bp = cb.rcb_bp;

	nmp = NFSTONMP(np);
	if (!nmp) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error = ENXIO;
	}
	if (error || ISSET(bp->nb_flags, NB_ERROR)) {
		/* just drop it */
		nfs_request_async_cancel(req);
		goto out;
	}

	nfsvers = nmp->nm_vers;
	offset = cb.rcb_args[0];
	rlen = length = cb.rcb_args[1];

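	/* set up a uio describing this RPC's portion of the buffer's data */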
	uio.uio_iovs.iov32p = &io;
	uio.uio_iovcnt = 1;
	uio.uio_rw = UIO_READ;
#if 1	/* LP64todo - can't use new segment flags until the drivers are ready */
	uio.uio_segflg = UIO_SYSSPACE;
#else
	uio.uio_segflg = UIO_SYSSPACE32;
#endif
	io.iov_len = length;
	uio_uio_resid_set(&uio, io.iov_len);
	uio.uio_offset = NBOFF(bp) + offset;
	io.iov_base = (uintptr_t) bp->nb_data + offset;

	/* finish the RPC */
	error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, &uio, &rlen, &eof);
	if ((error == EINPROGRESS) && cb.rcb_func) {
		/* async request restarted */
		if (IS_VALID_CRED(cred))
			kauth_cred_unref(&cred);
		return;
	}

	if (error) {
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
		goto out;
	}

	if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen)))
		bp->nb_endio = offset + rlen;

	if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
		/* zero out the remaining data (up to EOF) */
		off_t rpcrem, eofrem, rem;
		rpcrem = (length - rlen);
		eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
		rem = (rpcrem < eofrem) ? rpcrem : eofrem;
		if (rem > 0)
			bzero(bp->nb_data + offset + rlen, rem);
	} else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
		/*
		 * short read
		 *
		 * We haven't hit EOF and we didn't get all the data
		 * requested, so we need to issue another read for the rest.
		 * (Don't bother if the buffer already hit an error.)
		 */
		offset += rlen;
		length -= rlen;
		cb.rcb_args[0] = offset;
		cb.rcb_args[1] = length;
		error = nmp->nm_funcs->nf_read_rpc_async(np, offset, length, thd, cred, &cb, &rreq);
		if (!error) {
			if (IS_VALID_CRED(cred))
				kauth_cred_unref(&cred);
			if (!cb.rcb_func) {
				/* if !async we'll need to wait for this RPC to finish */
				req = rreq;
				goto finish;
			}
			/*
			 * We're done here.
			 * Outstanding RPC count is unchanged.
			 * Callback will be called when RPC is done.
			 */
			return;
		}
		SET(bp->nb_flags, NB_ERROR);
		bp->nb_error = error;
	}

out:
	if (IS_VALID_CRED(cred))
		kauth_cred_unref(&cred);

	/*
	 * Decrement outstanding RPC count on buffer
	 * and call nfs_buf_read_finish on last RPC.
	 *
	 * (Note: when there are multiple async RPCs issued for a
	 * buffer we need nfs_buffer_mutex to avoid problems when
	 * aborting a partially-initiated set of RPCs)
	 */

	multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
	if (multasyncrpc)
		lck_mtx_lock(nfs_buf_mutex);

	bp->nb_rpcs--;
	finished = (bp->nb_rpcs == 0);

	if (multasyncrpc)
		lck_mtx_unlock(nfs_buf_mutex);

	if (finished) {
		if (multasyncrpc)
			wakeme = &bp->nb_rpcs;
		nfs_buf_read_finish(bp);
		if (wakeme)
			wakeup(wakeme);
	}
}

1751/*
1752 * Do buffer readahead.
1753 * Initiate async I/O to read buffers not in cache.
1754 */
1755static int
1756nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1757{
1758 struct nfsmount *nmp = NFSTONMP(np);
1759 struct nfsbuf *bp;
1760 int error = 0, nra;
1761
1762 if (!nmp)
1763 return (ENXIO);
1764 if (nmp->nm_readahead <= 0)
1765 return (0);
1766 if (*rabnp > lastrabn)
1767 return (0);
1768
1769 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1770 /* check if block exists and is valid. */
1771 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp);
1772 if (error)
1773 break;
1774 if (!bp)
1775 continue;
1776 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
1777 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI|NB_NCRDAHEAD))) {
1778 CLR(bp->nb_flags, NB_CACHE);
1779 bp->nb_valid = 0;
1780 bp->nb_validoff = bp->nb_validend = -1;
1781 }
1782 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
1783 !ISSET(bp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1784 SET(bp->nb_flags, (NB_READ|NB_ASYNC));
1785 if (ioflag & IO_NOCACHE)
1786 SET(bp->nb_flags, NB_NCRDAHEAD);
1787 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
1788 kauth_cred_ref(cred);
1789 bp->nb_rcred = cred;
1790 }
1791 if ((error = nfs_buf_read(bp)))
1792 break;
1793 continue;
1794 }
1795 nfs_buf_release(bp, 1);
1796 }
1797 return (error);
1798}
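/*
 * (nfs_bioread() below is the caller: it computes the readahead window
 * [rabn, lastrabn] for each request and calls nfs_buf_readahead() to
 * kick off up to nm_readahead async block reads ahead of the current block.)
 */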
1799
1800/*
1801 * NFS buffer I/O for reading files/directories.
1802 */
1803int
1804nfs_bioread(nfsnode_t np, struct uio *uio, int ioflag, int *eofflag, vfs_context_t ctx)
1805{
1806 vnode_t vp = NFSTOV(np);
1807 struct nfsbuf *bp = NULL;
1808 struct nfs_vattr nvattr;
1809 struct nfsmount *nmp = VTONMP(vp);
1810 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1, tlbn;
1811 off_t diff;
1812 int error = 0, n = 0, on = 0;
1813 int nfsvers, biosize;
1814 caddr_t dp;
1815 struct dirent *direntp = NULL;
1816 enum vtype vtype;
1817 thread_t thd;
1818 kauth_cred_t cred;
1819
1820 FSDBG_TOP(514, np, uio->uio_offset, uio_uio_resid(uio), ioflag);
1821
1822 if (uio_uio_resid(uio) == 0) {
1823 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
1824 return (0);
1825 }
1826 if (uio->uio_offset < 0) {
1827 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
1828 return (EINVAL);
1829 }
1830
1831 nfsvers = nmp->nm_vers;
1832 biosize = nmp->nm_biosize;
1833 thd = vfs_context_thread(ctx);
1834 cred = vfs_context_ucred(ctx);
1835
1836 vtype = vnode_vtype(vp);
1837 if ((vtype != VREG) && (vtype != VDIR)) {
1838 printf("nfs_bioread: type %x unexpected\n", vtype);
1839 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
1840 return (EINVAL);
1841 }
1842
1843 /*
1844 * For nfs, cache consistency can only be maintained approximately.
1845 * Although RFC1094 does not specify the criteria, the following is
1846 * believed to be compatible with the reference port.
1847 * For nfs:
1848 * If the file's modify time on the server has changed since the
1849 * last read rpc or you have written to the file,
1850 * you may have lost data cache consistency with the
1851 * server, so flush all of the file's data out of the cache.
1852 * Then force a getattr rpc to ensure that you have up to date
1853 * attributes.
1854 * NB: This implies that cache data can be read when up to
1855 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
 1856 * current attributes, this could be forced by calling
1857 * NATTRINVALIDATE() before the nfs_getattr() call.
1858 */
1859
1860 if (ISSET(np->n_flag, NUPDATESIZE))
1861 nfs_data_update_size(np, 0);
1862
1863 if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
1864 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
1865 return (error);
1866 }
1867
1868 if (np->n_flag & NNEEDINVALIDATE) {
1869 np->n_flag &= ~NNEEDINVALIDATE;
1870 nfs_unlock(np);
1871 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1);
1872 if ((error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE))) {
1873 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
1874 return (error);
1875 }
1876 }
1877
1878 if (np->n_flag & NMODIFIED) {
1879 if (vtype == VDIR) {
1880 nfs_invaldir(np);
1881 nfs_unlock(np);
1882 error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
1883 if (!error)
1884 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
1885 if (error) {
1886 FSDBG_BOT(514, np, 0xd1e0003, 0, error);
1887 return (error);
1888 }
1889 }
1890 NATTRINVALIDATE(np);
1891 error = nfs_getattr(np, &nvattr, ctx, 1);
1892 if (error) {
1893 nfs_unlock(np);
1894 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
1895 return (error);
1896 }
1897 if (vtype == VDIR) {
1898 /* if directory changed, purge any name cache entries */
1899 if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
1900 cache_purge(vp);
1901 NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
1902 }
1903 NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
1904 } else {
1905 error = nfs_getattr(np, &nvattr, ctx, 1);
1906 if (error) {
1907 nfs_unlock(np);
1908 FSDBG_BOT(514, np, 0xd1e0005, 0, error);
1909 return (error);
1910 }
1911 if (NFS_CHANGED(nfsvers, np, &nvattr)) {
1912 if (vtype == VDIR) {
1913 nfs_invaldir(np);
1914 /* purge name cache entries */
1915 if (NFS_CHANGED_NC(nfsvers, np, &nvattr))
1916 cache_purge(vp);
1917 }
1918 nfs_unlock(np);
1919 error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
1920 if (!error)
1921 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
1922 if (error) {
1923 FSDBG_BOT(514, np, 0xd1e0006, 0, error);
1924 return (error);
1925 }
1926 if (vtype == VDIR)
1927 NFS_CHANGED_UPDATE_NC(nfsvers, np, &nvattr);
1928 NFS_CHANGED_UPDATE(nfsvers, np, &nvattr);
1929 }
1930 }
1931
1932 nfs_unlock(np);
1933
1934 if (vtype == VREG) {
1935 if ((ioflag & IO_NOCACHE) && (uio_uio_resid(uio) < (2*biosize))) {
 1936 /* We have only a block or so to read, so just do the rpc directly. */
1937 error = nfs_read_rpc(np, uio, ctx);
1938 FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
1939 return (error);
1940 }
1941 /*
1942 * set up readahead - which may be limited by:
1943 * + current request length (for IO_NOCACHE)
1944 * + readahead setting
1945 * + file size
1946 */
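		/* e.g. biosize 32K, offset 0, resid 64K on a 1MB file: rabn = 0,
		 * maxrabn = 1; since this looks sequential, maxrabn is extended by
		 * nm_readahead (and clipped at the file's last block) */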
1947 if (nmp->nm_readahead > 0) {
1948 off_t end = uio->uio_offset + uio_uio_resid(uio);
1949 if (end > (off_t)np->n_size)
1950 end = np->n_size;
1951 rabn = uio->uio_offset / biosize;
1952 maxrabn = (end - 1) / biosize;
1953 if (!(ioflag & IO_NOCACHE) &&
1954 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread+1)))) {
1955 maxrabn += nmp->nm_readahead;
1956 if ((maxrabn * biosize) >= (off_t)np->n_size)
1957 maxrabn = ((off_t)np->n_size - 1)/biosize;
1958 }
1959 } else {
1960 rabn = maxrabn = 0;
1961 }
1962 }
1963
1964 do {
1965
1966 if (vtype == VREG) {
1967 nfs_data_lock(np, NFS_NODE_LOCK_SHARED);
1968 lbn = uio->uio_offset / biosize;
1969
1970 /*
1971 * Copy directly from any cached pages without grabbing the bufs.
1972 *
1973 * Note: for "nocache" reads, we don't copy directly from UBC
1974 * because any cached pages will be for readahead buffers that
1975 * need to be invalidated anyway before we finish this request.
1976 */
1977 if (!(ioflag & IO_NOCACHE) &&
1978 (uio->uio_segflg == UIO_USERSPACE32 ||
1979 uio->uio_segflg == UIO_USERSPACE64 ||
1980 uio->uio_segflg == UIO_USERSPACE)) {
1981 // LP64todo - fix this!
1982 int io_resid = uio_uio_resid(uio);
1983 diff = np->n_size - uio->uio_offset;
1984 if (diff < io_resid)
1985 io_resid = diff;
1986 if (io_resid > 0) {
1987 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
91447636 1988 if (error) {
2d21ac55
A
1989 nfs_data_unlock(np);
1990 FSDBG_BOT(514, np, uio->uio_offset, 0xcacefeed, error);
91447636
A
1991 return (error);
1992 }
2d21ac55
A
1993 }
1994 /* count any biocache reads that we just copied directly */
1995 if (lbn != (uio->uio_offset / biosize)) {
1996 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
1997 FSDBG(514, np, 0xcacefeed, uio->uio_offset, error);
1998 }
1999 }
2000
2001 lbn = uio->uio_offset / biosize;
2002 on = uio->uio_offset % biosize;
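			/* e.g. biosize 32K, uio_offset 40960: lbn = 1, on = 8192 */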
2003 np->n_lastread = (uio->uio_offset - 1) / biosize;
2004
2005 /* adjust readahead block number, if necessary */
2006 if (rabn < lbn)
2007 rabn = lbn;
2008 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2009 if (rabn <= lastrabn) { /* start readaheads */
2010 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2011 if (error) {
2012 nfs_data_unlock(np);
2013 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
2014 return (error);
55e303ae 2015 }
1c79356b
A
2016 }
2017
91447636 2018 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
2d21ac55
A
2019 nfs_data_unlock(np);
2020 FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
55e303ae
A
2021 return (0);
2022 }
2023
91447636 2024 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
55e303ae 2025
1c79356b
A
2026 /*
2027 * If the block is in the cache and has the required data
2028 * in a valid region, just copy it out.
2029 * Otherwise, get the block and write back/read in,
2030 * as required.
2031 */
2032again:
91447636 2033 // LP64todo - fix this!
2d21ac55 2034 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
1c79356b
A
2035 diff = np->n_size - uio->uio_offset;
2036 if (diff < n)
2037 n = diff;
55e303ae 2038
2d21ac55 2039 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
91447636 2040 if (error) {
2d21ac55
A
2041 nfs_data_unlock(np);
2042 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
2043 return (error);
2044 }
2045
2046 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2047 /*
2048 * IO_NOCACHE found a cached buffer.
2049 * Flush the buffer if it's dirty.
2050 * Invalidate the data if it wasn't just read
2051 * in as part of a "nocache readahead".
2052 */
2053 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2054 /* so write the buffer out and try again */
2055 SET(bp->nb_flags, NB_NOCACHE);
2056 goto flushbuffer;
2057 }
2058 if (!ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2059 CLR(bp->nb_flags, NB_CACHE);
2060 bp->nb_valid = 0;
2061 } else {
2062 CLR(bp->nb_flags, NB_NCRDAHEAD);
2063 }
55e303ae
A
2064 }
2065
2066 /* if any pages are valid... */
2067 if (bp->nb_valid) {
2068 /* ...check for any invalid pages in the read range */
2069 int pg, firstpg, lastpg, dirtypg;
2070 dirtypg = firstpg = lastpg = -1;
2071 pg = on/PAGE_SIZE;
2072 while (pg <= (on + n - 1)/PAGE_SIZE) {
2073 if (!NBPGVALID(bp,pg)) {
2074 if (firstpg < 0)
2075 firstpg = pg;
2076 lastpg = pg;
2077 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
2078 dirtypg = pg;
2079 pg++;
2080 }
2081
2082 /* if there are no invalid pages, we're all set */
2083 if (firstpg < 0) {
2084 if (bp->nb_validoff < 0) {
2085 /* valid range isn't set up, so */
2086 /* set it to what we know is valid */
91447636
A
2087 bp->nb_validoff = trunc_page(on);
2088 bp->nb_validend = round_page(on+n);
55e303ae
A
2089 nfs_buf_normalize_valid_range(np, bp);
2090 }
2091 goto buffer_ready;
2092 }
2093
2094 /* there are invalid pages in the read range */
2d21ac55
A
2095 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
2096 (((firstpg*PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg+1)*PAGE_SIZE) > bp->nb_dirtyoff))) {
2097 /* there are also dirty page(s) (or range) in the read range, */
55e303ae 2098 /* so write the buffer out and try again */
2d21ac55 2099flushbuffer:
55e303ae
A
2100 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2101 SET(bp->nb_flags, NB_ASYNC);
2d21ac55 2102 if (!IS_VALID_CRED(bp->nb_wcred)) {
91447636
A
2103 kauth_cred_ref(cred);
2104 bp->nb_wcred = cred;
2105 }
55e303ae
A
2106 error = nfs_buf_write(bp);
2107 if (error) {
2d21ac55
A
2108 nfs_data_unlock(np);
2109 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
55e303ae
A
2110 return (error);
2111 }
1c79356b
A
2112 goto again;
2113 }
55e303ae 2114 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
2d21ac55 2115 (lastpg - firstpg + 1) > (biosize/PAGE_SIZE)/2) {
55e303ae
A
2116 /* we need to read in more than half the buffer and the */
2117 /* buffer's not dirty, so just fetch the whole buffer */
2118 bp->nb_valid = 0;
2119 } else {
2120 /* read the page range in */
91447636
A
2121 uio_t auio;
2122 char uio_buf[ UIO_SIZEOF(1) ];
2123
55e303ae 2124 NFS_BUF_MAP(bp);
2d21ac55 2125 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
91447636
A
2126 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
2127 if (!auio) {
2128 error = ENOMEM;
2129 } else {
2d21ac55 2130 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
91447636 2131 ((lastpg - firstpg + 1) * PAGE_SIZE));
2d21ac55 2132 error = nfs_read_rpc(np, auio, ctx);
91447636 2133 }
55e303ae 2134 if (error) {
2d21ac55 2135 if (ioflag & IO_NOCACHE)
91447636 2136 SET(bp->nb_flags, NB_NOCACHE);
483a1d10 2137 nfs_buf_release(bp, 1);
2d21ac55
A
2138 nfs_data_unlock(np);
2139 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
55e303ae
A
2140 return (error);
2141 }
2142 /* Make sure that the valid range is set to cover this read. */
2143 bp->nb_validoff = trunc_page_32(on);
2144 bp->nb_validend = round_page_32(on+n);
2145 nfs_buf_normalize_valid_range(np, bp);
91447636 2146 if (uio_resid(auio) > 0) {
55e303ae
A
2147 /* if short read, must have hit EOF, */
2148 /* so zero the rest of the range */
91447636 2149 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
55e303ae
A
2150 }
2151 /* mark the pages (successfully read) as valid */
2152 for (pg=firstpg; pg <= lastpg; pg++)
2153 NBPGVALID_SET(bp,pg);
2154 }
1c79356b 2155 }
55e303ae
A
2156 /* if no pages are valid, read the whole block */
2157 if (!bp->nb_valid) {
2d21ac55
A
2158 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2159 kauth_cred_ref(cred);
2160 bp->nb_rcred = cred;
2161 }
55e303ae
A
2162 SET(bp->nb_flags, NB_READ);
2163 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2d21ac55 2164 error = nfs_buf_read(bp);
55e303ae 2165 if (error) {
2d21ac55 2166 nfs_data_unlock(np);
483a1d10 2167 nfs_buf_release(bp, 1);
2d21ac55 2168 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
55e303ae
A
2169 return (error);
2170 }
2171 }
2172buffer_ready:
55e303ae
A
2173 /* validate read range against valid range and clip */
2174 if (bp->nb_validend > 0) {
2175 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
2176 if (diff < n)
2177 n = diff;
2178 }
2179 if (n > 0)
2180 NFS_BUF_MAP(bp);
2d21ac55 2181 } else if (vtype == VDIR) {
91447636 2182 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
2d21ac55
A
2183 error = nfs_lock(np, NFS_NODE_LOCK_SHARED);
2184 if (error || (np->n_direofoffset && (uio->uio_offset >= np->n_direofoffset))) {
2185 if (!error)
2186 nfs_unlock(np);
2187 if (eofflag)
2188 *eofflag = 1;
2189 FSDBG_BOT(514, np, 0xde0f0001, 0, 0);
55e303ae 2190 return (0);
1c79356b 2191 }
2d21ac55 2192 nfs_unlock(np);
1c79356b
A
2193 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
2194 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
2d21ac55 2195 error = nfs_buf_get(np, lbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
91447636 2196 if (error) {
2d21ac55 2197 FSDBG_BOT(514, np, 0xd1e0012, 0, error);
91447636 2198 return (error);
55e303ae
A
2199 }
2200 if (!ISSET(bp->nb_flags, NB_CACHE)) {
2201 SET(bp->nb_flags, NB_READ);
2d21ac55
A
2202 error = nfs_buf_readdir(bp, ctx);
2203 if (error)
483a1d10 2204 nfs_buf_release(bp, 1);
fa4905b1 2205 while (error == NFSERR_BAD_COOKIE) {
2d21ac55
A
2206 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
2207 if (!error) {
2208 nfs_invaldir(np);
2209 nfs_unlock(np);
2210 }
2211 error = nfs_vinvalbuf(vp, 0, ctx, 1);
fa4905b1
A
2212 /*
2213 * Yuck! The directory has been modified on the
2214 * server. The only way to get the block is by
2215 * reading from the beginning to get all the
2216 * offset cookies.
2217 */
91447636 2218 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
2d21ac55
A
2219 if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED)))
2220 break;
fa4905b1 2221 if (np->n_direofoffset
91447636 2222 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
2d21ac55
A
2223 nfs_unlock(np);
2224 if (eofflag)
2225 *eofflag = 1;
2226 FSDBG_BOT(514, np, 0xde0f0002, 0, 0);
fa4905b1 2227 return (0);
55e303ae 2228 }
2d21ac55
A
2229 nfs_unlock(np);
2230 error = nfs_buf_get(np, tlbn, NFS_DIRBLKSIZ, thd, NBLK_READ, &bp);
91447636 2231 if (error) {
2d21ac55 2232 FSDBG_BOT(514, np, 0xd1e0013, 0, error);
91447636 2233 return (error);
55e303ae
A
2234 }
2235 if (!ISSET(bp->nb_flags, NB_CACHE)) {
2236 SET(bp->nb_flags, NB_READ);
2d21ac55 2237 error = nfs_buf_readdir(bp, ctx);
fa4905b1 2238 /*
55e303ae 2239 * no error + NB_INVAL == directory EOF,
fa4905b1
A
2240 * use the block.
2241 */
2d21ac55
A
2242 if (error == 0 && ISSET(bp->nb_flags, NB_INVAL)) {
2243 if (eofflag)
2244 *eofflag = 1;
fa4905b1 2245 break;
2d21ac55 2246 }
fa4905b1
A
2247 }
2248 /*
2249 * An error will throw away the block and the
2250 * for loop will break out. If no error and this
2251 * is not the block we want, we throw away the
2252 * block and go for the next one via the for loop.
2253 */
2d21ac55 2254 if (error || (tlbn < lbn))
483a1d10 2255 nfs_buf_release(bp, 1);
fa4905b1
A
2256 }
2257 }
2258 /*
2259 * The above while is repeated if we hit another cookie
2260 * error. If we hit an error and it wasn't a cookie error,
2261 * we give up.
2262 */
55e303ae 2263 if (error) {
2d21ac55 2264 FSDBG_BOT(514, np, 0xd1e0014, 0, error);
fa4905b1 2265 return (error);
55e303ae 2266 }
1c79356b 2267 }
1c79356b
A
2268 /*
2269 * Make sure we use a signed variant of min() since
2270 * the second term may be negative.
2271 */
91447636
A
2272 // LP64todo - fix this!
2273 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
fa4905b1 2274 /*
55e303ae
A
2275 * We keep track of the directory eof in
2276 * np->n_direofoffset and chop it off as an
2277 * extra step right here.
fa4905b1 2278 */
2d21ac55
A
2279 if ((error = nfs_lock(np, NFS_NODE_LOCK_SHARED))) {
2280 FSDBG_BOT(514, np, 0xd1e0115, 0, error);
2281 return (error);
2282 }
fa4905b1
A
2283 if (np->n_direofoffset &&
2284 n > np->n_direofoffset - uio->uio_offset)
2285 n = np->n_direofoffset - uio->uio_offset;
2d21ac55 2286 nfs_unlock(np);
55e303ae
A
2287 /*
2288 * Make sure that we return an integral number of entries so
2289 * that any subsequent calls will start copying from the start
2290 * of the next entry.
2291 *
2292 * If the current value of n has the last entry cut short,
2293 * set n to copy everything up to the last entry instead.
2294 */
2295 if (n > 0) {
2296 dp = bp->nb_data + on;
2297 while (dp < (bp->nb_data + on + n)) {
2298 direntp = (struct dirent *)dp;
2299 dp += direntp->d_reclen;
2300 }
2301 if (dp > (bp->nb_data + on + n))
2302 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
2303 }
2d21ac55 2304 }
1c79356b 2305
2d21ac55 2306 if (n > 0)
55e303ae 2307 error = uiomove(bp->nb_data + on, (int)n, uio);
2d21ac55
A
2308
2309 if (vtype == VREG) {
2310 if (ioflag & IO_NOCACHE)
91447636 2311 SET(bp->nb_flags, NB_NOCACHE);
2d21ac55
A
2312 nfs_buf_release(bp, 1);
2313 nfs_data_unlock(np);
2314 np->n_lastread = (uio->uio_offset - 1) / biosize;
2315 } else {
2316 nfs_buf_release(bp, 1);
1c79356b 2317 }
91447636 2318 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
2d21ac55 2319 FSDBG_BOT(514, np, uio->uio_offset, uio_uio_resid(uio), error);
1c79356b
A
2320 return (error);
2321}
2322
2d21ac55
A
2323/*
2324 * limit the number of outstanding async I/O writes
2325 */
2326static int
2327nfs_async_write_start(struct nfsmount *nmp)
2328{
2329 int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0;
2330 struct timespec ts = {1, 0};
2331
2332 if (nfs_max_async_writes <= 0)
2333 return (0);
2334 lck_mtx_lock(&nmp->nm_lock);
36401178 2335 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
2d21ac55
A
2336 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))
2337 break;
36401178
A
2338 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag|(PZERO-1), "nfsasyncwrites", &ts);
2339 slpflag = 0;
2d21ac55
A
2340 }
2341 if (!error)
2342 nmp->nm_asyncwrites++;
2343 lck_mtx_unlock(&nmp->nm_lock);
2344 return (error);
2345}
2346static void
2347nfs_async_write_done(struct nfsmount *nmp)
2348{
2349 if (nmp->nm_asyncwrites <= 0)
2350 return;
2351 lck_mtx_lock(&nmp->nm_lock);
2352 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes)
2353 wakeup(&nmp->nm_asyncwrites);
2354 lck_mtx_unlock(&nmp->nm_lock);
2355}
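/*
 * (Callers bracket each async write RPC with these: nfs_buf_write_rpc()
 * calls nfs_async_write_start() before sending an async write, and
 * nfs_async_write_done() runs when the send fails or when the RPC
 * completes in nfs_buf_write_rpc_finish().)
 */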
fa4905b1 2356
1c79356b 2357/*
2d21ac55
A
2358 * write (or commit) the given NFS buffer
2359 *
2360 * Commit the buffer if we can.
2361 * Write out any dirty range.
2362 * If any dirty pages remain, write them out.
2363 * Mark buffer done.
2364 *
2365 * For async requests, all the work beyond sending the initial
2366 * write RPC is handled in the RPC callback(s).
1c79356b
A
2367 */
2368int
2d21ac55 2369nfs_buf_write(struct nfsbuf *bp)
1c79356b 2370{
2d21ac55
A
2371 int error = 0, oldflags, async;
2372 nfsnode_t np;
2373 thread_t thd;
91447636 2374 kauth_cred_t cred;
2d21ac55
A
2375 proc_t p = current_proc();
2376 int iomode, doff, dend, firstpg, lastpg;
2377 uint32_t pagemask;
91447636 2378
2d21ac55 2379 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
91447636 2380
2d21ac55
A
2381 if (!ISSET(bp->nb_lflags, NBL_BUSY))
2382 panic("nfs_buf_write: buffer is not busy???");
91447636 2383
2d21ac55
A
2384 np = bp->nb_np;
2385 async = ISSET(bp->nb_flags, NB_ASYNC);
2386 oldflags = bp->nb_flags;
91447636 2387
2d21ac55
A
2388 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
2389 if (ISSET(oldflags, NB_DELWRI)) {
2390 lck_mtx_lock(nfs_buf_mutex);
2391 nfs_nbdwrite--;
2392 NFSBUFCNTCHK();
2393 lck_mtx_unlock(nfs_buf_mutex);
2394 wakeup(&nfs_nbdwrite);
91447636 2395 }
2d21ac55
A
2396
2397 /* move to clean list */
2398 if (ISSET(oldflags, (NB_ASYNC|NB_DELWRI))) {
2399 lck_mtx_lock(nfs_buf_mutex);
2400 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2401 LIST_REMOVE(bp, nb_vnbufs);
2402 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2403 lck_mtx_unlock(nfs_buf_mutex);
1c79356b 2404 }
2d21ac55 2405 vnode_startwrite(NFSTOV(np));
0c530ab8 2406
2d21ac55
A
2407 if (p && p->p_stats)
2408 OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock);
0c530ab8 2409
2d21ac55
A
2410 cred = bp->nb_wcred;
2411 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ))
2412 cred = bp->nb_rcred; /* shouldn't really happen, but... */
2413 if (IS_VALID_CRED(cred))
2414 kauth_cred_ref(cred);
2415 thd = async ? NULL : current_thread();
2416
2417 /* We need to make sure the pages are locked before doing I/O. */
2418 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(NFSTOV(np))) {
2419 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2420 error = nfs_buf_upl_setup(bp);
55e303ae 2421 if (error) {
2d21ac55
A
2422 printf("nfs_buf_write: upl create failed %d\n", error);
2423 SET(bp->nb_flags, NB_ERROR);
2424 bp->nb_error = error = EIO;
2425 nfs_buf_iodone(bp);
2426 goto out;
55e303ae 2427 }
2d21ac55 2428 nfs_buf_upl_check(bp);
1c79356b
A
2429 }
2430 }
55e303ae 2431
2d21ac55
A
2432 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
2433 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2434 nfs_buf_check_write_verifier(np, bp);
2435 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2436 struct nfsmount *nmp = NFSTONMP(np);
2437 if (!nmp) {
2438 SET(bp->nb_flags, NB_ERROR);
2439 bp->nb_error = error = EIO;
2440 nfs_buf_iodone(bp);
2441 goto out;
2442 }
2443 SET(bp->nb_flags, NB_WRITEINPROG);
2444 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
2445 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred);
2446 CLR(bp->nb_flags, NB_WRITEINPROG);
2447 if (error) {
2448 if (error != NFSERR_STALEWRITEVERF) {
2449 SET(bp->nb_flags, NB_ERROR);
2450 bp->nb_error = error;
55e303ae 2451 }
2d21ac55
A
2452 nfs_buf_iodone(bp);
2453 goto out;
2454 }
2455 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2456 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2457 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2458 np->n_needcommitcnt--;
2459 CHECK_NEEDCOMMITCNT(np);
2460 nfs_unlock(np);
2461 }
2462 if (!error && (bp->nb_dirtyend > 0)) {
2463 /* sanity check the dirty range */
2464 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2465 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2466 if (bp->nb_dirtyoff >= bp->nb_dirtyend)
2467 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
55e303ae 2468 }
91447636 2469 }
2d21ac55
A
2470 if (!error && (bp->nb_dirtyend > 0)) {
2471 /* there's a dirty range that needs to be written out */
2472 NFS_BUF_MAP(bp);
2473
2474 doff = bp->nb_dirtyoff;
2475 dend = bp->nb_dirtyend;
2476
2477 /* if doff page is dirty, move doff to start of page */
2478 if (NBPGDIRTY(bp, doff / PAGE_SIZE))
2479 doff -= doff & PAGE_MASK;
2480 /* try to expand write range to include preceding dirty pages */
2481 if (!(doff & PAGE_MASK))
2482 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE))
2483 doff -= PAGE_SIZE;
2484 /* if dend page is dirty, move dend to start of next page */
2485 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2486 dend = round_page_32(dend);
2487 /* try to expand write range to include trailing dirty pages */
2488 if (!(dend & PAGE_MASK))
2489 while ((dend < bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE))
2490 dend += PAGE_SIZE;
2491 /* make sure to keep dend clipped to EOF */
2492 if ((NBOFF(bp) + dend) > (off_t) np->n_size)
2493 dend = np->n_size - NBOFF(bp);
2494 /* calculate range of complete pages being written */
2495 firstpg = round_page_32(doff) / PAGE_SIZE;
2496 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2497 /* calculate mask for that page range */
2498 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
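		/* e.g. firstpg = 1, lastpg = 3:
		 * ((1 << 4) - 1) & ~((1 << 1) - 1) = 0x0f & ~0x01 = 0x0e (pages 1-3) */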
91447636 2499
fa4905b1 2500 /*
2d21ac55
A
2501 * compare page mask to nb_dirty; if there are other dirty pages
2502 * then write FILESYNC; otherwise, write UNSTABLE if async and
2503 * not needcommit/stable; otherwise write FILESYNC
fa4905b1 2504 */
2d21ac55
A
2505 if (bp->nb_dirty & ~pagemask)
2506 iomode = NFS_WRITE_FILESYNC;
2507 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
2508 iomode = NFS_WRITE_UNSTABLE;
2509 else
2510 iomode = NFS_WRITE_FILESYNC;
55e303ae 2511
2d21ac55
A
2512 /* write the whole contiguous dirty range */
2513 bp->nb_offio = doff;
2514 bp->nb_endio = dend;
55e303ae 2515
2d21ac55 2516 OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
55e303ae 2517
2d21ac55
A
2518 SET(bp->nb_flags, NB_WRITEINPROG);
2519 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
55e303ae 2520 /*
2d21ac55
A
2521 * For async I/O, the callbacks will finish up the
2522 * write and push out any dirty pages. Otherwise,
2523 * the write has already been finished and any dirty
2524 * pages pushed out.
55e303ae 2525 */
2d21ac55
A
2526 } else {
2527 if (!error && bp->nb_dirty) /* write out any dirty pages */
2528 error = nfs_buf_write_dirty_pages(bp, thd, cred);
2529 nfs_buf_iodone(bp);
2530 }
2531 /* note: bp is still valid only for !async case */
2532out:
2533 if (!async) {
2534 error = nfs_buf_iowait(bp);
2535 /* move to clean list */
2536 if (oldflags & NB_DELWRI) {
2537 lck_mtx_lock(nfs_buf_mutex);
2538 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2539 LIST_REMOVE(bp, nb_vnbufs);
2540 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2541 lck_mtx_unlock(nfs_buf_mutex);
2542 }
2543 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2544 nfs_buf_release(bp, 1);
2545 /* check if we need to invalidate (and we can) */
2546 if ((np->n_flag & NNEEDINVALIDATE) &&
2547 !(np->n_bflag & (NBINVALINPROG|NBFLUSHINPROG))) {
2548 int invalidate = 0;
2549 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2550 if (np->n_flag & NNEEDINVALIDATE) {
2551 invalidate = 1;
2552 np->n_flag &= ~NNEEDINVALIDATE;
55e303ae 2553 }
2d21ac55
A
2554 nfs_unlock(np);
2555 if (invalidate) {
2556 /*
2557 * There was a write error and we need to
2558 * invalidate attrs and flush buffers in
2559 * order to sync up with the server.
2560 * (if this write was extending the file,
2561 * we may no longer know the correct size)
2562 *
2563 * But we couldn't call vinvalbuf while holding
2564 * the buffer busy. So we call vinvalbuf() after
2565 * releasing the buffer.
2566 */
2567 nfs_vinvalbuf2(NFSTOV(np), V_SAVE|V_IGNORE_WRITEERR, thd, cred, 1);
55e303ae 2568 }
55e303ae 2569 }
2d21ac55
A
2570 }
2571
2572 if (IS_VALID_CRED(cred))
2573 kauth_cred_unref(&cred);
2574 return (error);
2575}
55e303ae 2576
2d21ac55
A
2577/*
2578 * finish the writing of a buffer
2579 */
2580void
2581nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2582{
2583 nfsnode_t np = bp->nb_np;
2584 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2585 int firstpg, lastpg;
2586 uint32_t pagemask;
2587
2588 if ((error == EINTR) || (error == ERESTART)) {
2589 CLR(bp->nb_flags, NB_ERROR);
2590 SET(bp->nb_flags, NB_EINTR);
2591 }
2592
2593 if (!error) {
2594 /* calculate range of complete pages being written */
2595 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2596 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2597 /* calculate mask for that page range written */
2598 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2599 /* clear dirty bits for pages we've written */
2600 bp->nb_dirty &= ~pagemask;
2601 }
2602
2603 /* manage needcommit state */
2604 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2605 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2606 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2607 np->n_needcommitcnt++;
2608 nfs_unlock(np);
2609 SET(bp->nb_flags, NB_NEEDCOMMIT);
2610 }
2611 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2612 bp->nb_dirtyoff = bp->nb_offio;
2613 bp->nb_dirtyend = bp->nb_endio;
2614 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2615 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2616 np->n_needcommitcnt--;
2617 CHECK_NEEDCOMMITCNT(np);
2618 nfs_unlock(np);
2619 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2620 }
2621
2622 CLR(bp->nb_flags, NB_WRITEINPROG);
2623
2624 /*
2625 * For an unstable write, the buffer is still treated as dirty until
2626 * a commit (or stable (re)write) is performed. Buffers needing only
2627 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2628 *
2629 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2630 * because that would cause the buffer to be dropped. The buffer is
2631 * still valid and simply needs to be written again.
2632 */
2633 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2634 CLR(bp->nb_flags, NB_INVAL);
2635 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2636 SET(bp->nb_flags, NB_DELWRI);
2637 lck_mtx_lock(nfs_buf_mutex);
2638 nfs_nbdwrite++;
2639 NFSBUFCNTCHK();
2640 lck_mtx_unlock(nfs_buf_mutex);
2641 }
fa4905b1 2642 /*
2d21ac55
A
2643 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2644 * clean list, we have to reassign it back to the dirty one. Ugh.
fa4905b1 2645 */
2d21ac55
A
2646 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2647 /* move to dirty list */
2648 lck_mtx_lock(nfs_buf_mutex);
2649 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2650 LIST_REMOVE(bp, nb_vnbufs);
2651 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2652 lck_mtx_unlock(nfs_buf_mutex);
2653 }
2654 } else {
2655 /* either there's an error or we don't need to commit */
2656 if (error) {
2657 /*
2658 * There was a write error and we need to invalidate
2659 * attrs and flush buffers in order to sync up with the
2660 * server. (if this write was extending the file, we
2661 * may no longer know the correct size)
2662 *
2663 * But we can't call vinvalbuf while holding this
2664 * buffer busy. Set a flag to do it after releasing
2665 * the buffer.
2666 */
2667 nfs_lock(np, NFS_NODE_LOCK_FORCE);
2668 np->n_error = error;
2669 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2670 NATTRINVALIDATE(np);
2671 nfs_unlock(np);
2672 }
2673 /* clear the dirty range */
2674 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2675 }
55e303ae 2676
2d21ac55
A
2677 if (!error && bp->nb_dirty)
2678 nfs_buf_write_dirty_pages(bp, thd, cred);
2679 nfs_buf_iodone(bp);
2680}
fa4905b1 2681
2d21ac55
A
2682/*
2683 * write out any pages marked dirty in a buffer
2684 *
2685 * We do use unstable writes and follow up with a commit.
 2686 * If we catch the write verifier changing we'll restart and
2687 * do the writes filesync.
2688 */
2689int
2690nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2691{
2692 nfsnode_t np = bp->nb_np;
2693 struct nfsmount *nmp = NFSTONMP(np);
2694 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2695 uint32_t dirty = bp->nb_dirty;
2696 uint64_t wverf;
2697 struct uio uio;
2698 struct iovec_32 io;
55e303ae 2699
2d21ac55
A
2700 if (!bp->nb_dirty)
2701 return (0);
2702
2703 /* there are pages marked dirty that need to be written out */
2704 OSAddAtomic(1, (SInt32 *)&nfsstats.write_bios);
2705 NFS_BUF_MAP(bp);
2706 SET(bp->nb_flags, NB_WRITEINPROG);
2707 npages = bp->nb_bufsize / PAGE_SIZE;
2708 iomode = NFS_WRITE_UNSTABLE;
2709
2710 uio.uio_iovs.iov32p = &io;
2711 uio.uio_iovcnt = 1;
2712 uio.uio_rw = UIO_WRITE;
2713#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2714 uio.uio_segflg = UIO_SYSSPACE;
2715#else
2716 uio.uio_segflg = UIO_SYSSPACE32;
2717#endif
2718
2719again:
2720 dirty = bp->nb_dirty;
2721 wverf = bp->nb_verf;
2722 commit = NFS_WRITE_FILESYNC;
2723 for (pg = 0; pg < npages; pg++) {
2724 if (!NBPGDIRTY(bp, pg))
2725 continue;
2726 count = 1;
2727 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count))
2728 count++;
2729 /* write count pages starting with page pg */
2730 off = pg * PAGE_SIZE;
2731 len = count * PAGE_SIZE;
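			/* e.g. 4K pages with pages 2-4 dirty: pg = 2, count = 3,
			 * off = 8192, len = 12288 */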
2732 /* clip writes to EOF */
2733 if (NBOFF(bp) + off + len > (off_t) np->n_size)
2734 len -= (NBOFF(bp) + off + len) - np->n_size;
2735 if (len > 0) {
2736 iomode2 = iomode;
2737 io.iov_len = len;
2738 uio_uio_resid_set(&uio, io.iov_len);
2739 uio.uio_offset = NBOFF(bp) + off;
2740 io.iov_base = (uintptr_t) bp->nb_data + off;
2741 error = nfs_write_rpc2(np, &uio, thd, cred, &iomode2, &bp->nb_verf);
2742 if (error)
2743 break;
2744 if (iomode2 < commit) /* Retain the lowest commitment level returned. */
2745 commit = iomode2;
2746 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2747 /* verifier changed, redo all the writes filesync */
2748 iomode = NFS_WRITE_FILESYNC;
2749 goto again;
fa4905b1
A
2750 }
2751 }
2d21ac55
A
2752 /* clear dirty bits */
2753 while (count--) {
2754 dirty &= ~(1 << pg);
2755 if (count) /* leave pg on last page */
2756 pg++;
2757 }
2758 }
2759 CLR(bp->nb_flags, NB_WRITEINPROG);
2760
2761 if (!error && (commit != NFS_WRITE_FILESYNC)) {
2762 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred);
2763 if (error == NFSERR_STALEWRITEVERF) {
2764 /* verifier changed, so we need to restart all the writes */
2765 iomode = NFS_WRITE_FILESYNC;
2766 goto again;
2767 }
2768 }
2769 if (!error) {
2770 bp->nb_dirty = dirty;
2771 } else {
2772 SET(bp->nb_flags, NB_ERROR);
2773 bp->nb_error = error;
2774 }
2775 return (error);
2776}
2777
2778/*
2779 * initiate the NFS WRITE RPC(s) for a buffer
2780 */
2781int
2782nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2783{
2784 struct nfsmount *nmp;
2785 nfsnode_t np = bp->nb_np;
2786 int error = 0, nfsvers, async;
2787 int offset, length, nmwsize, nrpcs, len;
2788 struct nfsreq *req;
2789 struct nfsreq_cbinfo cb;
2790 struct uio uio;
2791 struct iovec_32 io;
2792
2793 nmp = NFSTONMP(np);
2794 if (!nmp) {
2795 bp->nb_error = error = ENXIO;
2796 SET(bp->nb_flags, NB_ERROR);
2797 nfs_buf_iodone(bp);
2798 return (error);
2799 }
2800 nfsvers = nmp->nm_vers;
2801 nmwsize = nmp->nm_wsize;
2802
2803 offset = bp->nb_offio;
2804 length = bp->nb_endio - bp->nb_offio;
2805
2806 /* Note: Can only do async I/O if nfsiods are configured. */
2807 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2808 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2809 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2810 cb.rcb_bp = bp;
2811
2812 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2813 bp->nb_error = error = EFBIG;
2814 SET(bp->nb_flags, NB_ERROR);
2815 nfs_buf_iodone(bp);
2816 return (error);
2817 }
2818
2819 uio.uio_iovs.iov32p = &io;
2820 uio.uio_iovcnt = 1;
2821 uio.uio_rw = UIO_WRITE;
2822#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2823 uio.uio_segflg = UIO_SYSSPACE;
2824#else
2825 uio.uio_segflg = UIO_SYSSPACE32;
2826#endif
2827 io.iov_len = length;
2828 uio_uio_resid_set(&uio, io.iov_len);
2829 uio.uio_offset = NBOFF(bp) + offset;
2830 io.iov_base = (uintptr_t) bp->nb_data + offset;
2831
2832 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
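	/* e.g. a 28K dirty range with an 8K nm_wsize is sent as 4 write RPCs */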
2833 if (async && (nrpcs > 1)) {
2834 SET(bp->nb_flags, NB_MULTASYNCRPC);
2835 } else {
2836 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2837 }
2838
2839 while (length > 0) {
2840 if (ISSET(bp->nb_flags, NB_ERROR)) {
2841 error = bp->nb_error;
2842 break;
2843 }
2844 len = (length > nmwsize) ? nmwsize : length;
2845 cb.rcb_args[0] = offset;
2846 cb.rcb_args[1] = len;
2847 if (async && ((error = nfs_async_write_start(nmp))))
2848 break;
2849 req = NULL;
2850 error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, len, thd, cred,
2851 iomode, &cb, &req);
2852 if (error) {
2853 if (async)
2854 nfs_async_write_done(nmp);
2855 break;
2856 }
2857 offset += len;
2858 length -= len;
2859 if (async)
2860 continue;
2861 nfs_buf_write_rpc_finish(req);
2862 }
2863
2864 if (length > 0) {
fa4905b1 2865 /*
2d21ac55
A
2866 * Something bad happened while trying to send the RPCs.
2867 * Wait for any outstanding requests to complete.
fa4905b1 2868 */
2d21ac55
A
2869 bp->nb_error = error;
2870 SET(bp->nb_flags, NB_ERROR);
2871 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2872 nrpcs = (length + nmwsize - 1) / nmwsize;
2873 lck_mtx_lock(nfs_buf_mutex);
2874 bp->nb_rpcs -= nrpcs;
2875 if (bp->nb_rpcs == 0) {
2876 /* No RPCs left, so the buffer's done */
2877 lck_mtx_unlock(nfs_buf_mutex);
2878 nfs_buf_write_finish(bp, thd, cred);
2879 } else {
2880 /* wait for the last RPC to mark it done */
2881 while (bp->nb_rpcs > 0)
2882 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
2883 "nfs_buf_write_rpc_cancel", NULL);
2884 lck_mtx_unlock(nfs_buf_mutex);
fa4905b1 2885 }
2d21ac55
A
2886 } else {
2887 nfs_buf_write_finish(bp, thd, cred);
2888 }
2889 }
55e303ae 2890
2d21ac55
A
2891 return (error);
2892}
2893
2894/*
2895 * finish up an NFS WRITE RPC on a buffer
2896 */
2897void
2898nfs_buf_write_rpc_finish(struct nfsreq *req)
2899{
2900 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2901 int committed = NFS_WRITE_FILESYNC;
2902 uint64_t wverf = 0;
2903 size_t rlen;
2904 void *wakeme = NULL;
2905 struct nfsreq_cbinfo cb;
2906 struct nfsreq *wreq = NULL;
2907 struct nfsbuf *bp;
2908 struct nfsmount *nmp;
2909 nfsnode_t np;
2910 thread_t thd;
2911 kauth_cred_t cred;
2912 struct uio uio;
2913 struct iovec_32 io;
2914
2915finish:
2916 np = req->r_np;
2917 thd = req->r_thread;
2918 cred = req->r_cred;
2919 if (IS_VALID_CRED(cred))
2920 kauth_cred_ref(cred);
2921 cb = req->r_callback;
2922 bp = cb.rcb_bp;
2923
2924 nmp = NFSTONMP(np);
2925 if (!nmp) {
2926 SET(bp->nb_flags, NB_ERROR);
2927 bp->nb_error = error = ENXIO;
2928 }
2929 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
2930 /* just drop it */
2931 nfs_request_async_cancel(req);
2932 goto out;
2933 }
2934 nfsvers = nmp->nm_vers;
2935
2936 offset = cb.rcb_args[0];
2937 rlen = length = cb.rcb_args[1];
2938
2939 /* finish the RPC */
2940 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
2941 if ((error == EINPROGRESS) && cb.rcb_func) {
2942 /* async request restarted */
2943 if (IS_VALID_CRED(cred))
2944 kauth_cred_unref(&cred);
2945 return;
2946 }
2947
2948 if (error) {
2949 SET(bp->nb_flags, NB_ERROR);
2950 bp->nb_error = error;
2951 }
2952 if (error || (nfsvers == NFS_VER2))
2953 goto out;
2954 if (rlen <= 0) {
2955 SET(bp->nb_flags, NB_ERROR);
2956 bp->nb_error = error = EIO;
2957 goto out;
2958 }
2959
2960 /* save lowest commit level returned */
2961 if (committed < bp->nb_commitlevel)
2962 bp->nb_commitlevel = committed;
2963
2964 /* check the write verifier */
2965 if (!bp->nb_verf) {
2966 bp->nb_verf = wverf;
2967 } else if (bp->nb_verf != wverf) {
2968 /* verifier changed, so buffer will need to be rewritten */
2969 bp->nb_flags |= NB_STALEWVERF;
2970 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
2971 bp->nb_verf = wverf;
2972 }
2973
2974 /*
2975 * check for a short write
2976 *
2977 * If the server didn't write all the data, then we
2978 * need to issue another write for the rest of it.
2979 * (Don't bother if the buffer hit an error or stale wverf.)
2980 */
2981 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF|NB_ERROR))) {
2982 offset += rlen;
2983 length -= rlen;
2984
2985 uio.uio_iovs.iov32p = &io;
2986 uio.uio_iovcnt = 1;
2987 uio.uio_rw = UIO_WRITE;
91447636 2988#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2d21ac55 2989 uio.uio_segflg = UIO_SYSSPACE;
91447636 2990#else
2d21ac55
A
2991 uio.uio_segflg = UIO_SYSSPACE32;
2992#endif
2993 io.iov_len = length;
2994 uio_uio_resid_set(&uio, io.iov_len);
2995 uio.uio_offset = NBOFF(bp) + offset;
2996 io.iov_base = (uintptr_t) bp->nb_data + offset;
55e303ae 2997
2d21ac55
A
2998 cb.rcb_args[0] = offset;
2999 cb.rcb_args[1] = length;
55e303ae 3000
2d21ac55
A
3001 error = nmp->nm_funcs->nf_write_rpc_async(np, &uio, length, thd, cred,
3002 NFS_WRITE_FILESYNC, &cb, &wreq);
3003 if (!error) {
3004 if (IS_VALID_CRED(cred))
3005 kauth_cred_unref(&cred);
3006 if (!cb.rcb_func) {
3007 /* if !async we'll need to wait for this RPC to finish */
3008 req = wreq;
3009 goto finish;
fa4905b1 3010 }
2d21ac55
A
3011 /*
3012 * We're done here.
3013 * Outstanding RPC count is unchanged.
3014 * Callback will be called when RPC is done.
3015 */
3016 return;
fa4905b1 3017 }
2d21ac55
A
3018 SET(bp->nb_flags, NB_ERROR);
3019 bp->nb_error = error;
3020 }
55e303ae 3021
2d21ac55
A
3022out:
3023 if (cb.rcb_func)
3024 nfs_async_write_done(nmp);
3025 /*
3026 * Decrement outstanding RPC count on buffer
3027 * and call nfs_buf_write_finish on last RPC.
3028 *
3029 * (Note: when there are multiple async RPCs issued for a
 3030 * buffer we need nfs_buf_mutex to avoid problems when
3031 * aborting a partially-initiated set of RPCs)
3032 */
3033 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
3034 if (multasyncrpc)
3035 lck_mtx_lock(nfs_buf_mutex);
3036
3037 bp->nb_rpcs--;
3038 finished = (bp->nb_rpcs == 0);
55e303ae 3039
2d21ac55
A
3040 if (multasyncrpc)
3041 lck_mtx_unlock(nfs_buf_mutex);
3042
3043 if (finished) {
3044 if (multasyncrpc)
3045 wakeme = &bp->nb_rpcs;
3046 nfs_buf_write_finish(bp, thd, cred);
3047 if (wakeme)
3048 wakeup(wakeme);
3049 }
3050
3051 if (IS_VALID_CRED(cred))
3052 kauth_cred_unref(&cred);
3053}
3054
3055/*
3056 * Send commit(s) for the given node's "needcommit" buffers
3057 */
3058int
3059nfs_flushcommits(nfsnode_t np, int nowait)
3060{
3061 struct nfsmount *nmp;
3062 struct nfsbuf *bp;
3063 struct nfsbuflists blist, commitlist;
3064 int error = 0, retv, wcred_set, flags, dirty;
3065 u_quad_t off, endoff, toff;
3066 u_int32_t count;
3067 kauth_cred_t wcred = NULL;
3068
3069 FSDBG_TOP(557, np, 0, 0, 0);
3070
3071 /*
3072 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
 3073 * server, but has not been committed to stable storage on the server
3074 * yet. The byte range is worked out for as many nfsbufs as we can handle
3075 * and the commit rpc is done.
3076 */
3077 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3078 error = nfs_lock(np, NFS_NODE_LOCK_EXCLUSIVE);
3079 if (error)
3080 goto done;
1c79356b 3081 np->n_flag |= NMODIFIED;
2d21ac55
A
3082 nfs_unlock(np);
3083 }
1c79356b 3084
2d21ac55
A
3085 off = (u_quad_t)-1;
3086 endoff = 0;
3087 wcred_set = 0;
3088 LIST_INIT(&commitlist);
3089
3090 nmp = NFSTONMP(np);
3091 if (!nmp) {
3092 error = ENXIO;
3093 goto done;
3094 }
3095 if (nmp->nm_vers == NFS_VER2) {
3096 error = EINVAL;
3097 goto done;
3098 }
3099
3100 flags = NBI_DIRTY;
3101 if (nowait)
3102 flags |= NBI_NOWAIT;
3103 lck_mtx_lock(nfs_buf_mutex);
3104 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3105 while ((bp = LIST_FIRST(&blist))) {
3106 LIST_REMOVE(bp, nb_vnbufs);
3107 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3108 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
3109 if (error)
3110 continue;
3111 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3112 nfs_buf_check_write_verifier(np, bp);
3113 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT))
3114 != (NB_DELWRI | NB_NEEDCOMMIT))) {
3115 nfs_buf_drop(bp);
3116 continue;
3117 }
3118 nfs_buf_remfree(bp);
3119 lck_mtx_unlock(nfs_buf_mutex);
3120 /*
3121 * we need a upl to see if the page has been
3122 * dirtied (think mmap) since the unstable write, and
3123 * also to prevent vm from paging it during our commit rpc
3124 */
3125 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3126 retv = nfs_buf_upl_setup(bp);
3127 if (retv) {
3128 /* unable to create upl */
3129 /* vm object must no longer exist */
3130 /* this could be fatal if we need */
3131 /* to write the data again, we'll see... */
3132 printf("nfs_flushcommits: upl create failed %d\n", retv);
3133 bp->nb_valid = bp->nb_dirty = 0;
3134 }
3135 }
3136 nfs_buf_upl_check(bp);
3137 lck_mtx_lock(nfs_buf_mutex);
3138
3139 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3140 FSDBG(557, bp->nb_validoff, bp->nb_validend,
3141 bp->nb_dirtyoff, bp->nb_dirtyend);
55e303ae 3142
55e303ae 3143 /*
2d21ac55
A
3144 * We used to check for dirty pages here; if there were any
3145 * we'd abort the commit and force the entire buffer to be
3146 * written again.
3147 *
3148 * Instead of doing that, we now go ahead and commit the dirty
3149 * range, and then leave the buffer around with dirty pages
3150 * that will be written out later.
55e303ae 3151 */
2d21ac55
A
3152
3153 /*
3154 * Work out if all buffers are using the same cred
3155 * so we can deal with them all with one commit.
3156 *
3157 * Note: creds in bp's must be obtained by kauth_cred_ref
3158 * on the same original cred in order for them to be equal.
3159 */
3160 if (wcred_set == 0) {
3161 wcred = bp->nb_wcred;
3162 if (!IS_VALID_CRED(wcred))
3163 panic("nfs: needcommit w/out wcred");
3164 wcred_set = 1;
3165 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3166 wcred_set = -1;
3167 }
3168 SET(bp->nb_flags, NB_WRITEINPROG);
3169
3170 /*
3171 * A list of these buffers is kept so that the
3172 * second loop knows which buffers have actually
3173 * been committed. This is necessary, since there
3174 * may be a race between the commit rpc and new
3175 * uncommitted writes on the file.
3176 */
3177 LIST_REMOVE(bp, nb_vnbufs);
3178 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
3179 toff = NBOFF(bp) + bp->nb_dirtyoff;
3180 if (toff < off)
3181 off = toff;
3182 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
3183 if (toff > endoff)
3184 endoff = toff;
3185 }
3186 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3187 }
3188 lck_mtx_unlock(nfs_buf_mutex);
3189
3190 if (LIST_EMPTY(&commitlist)) {
3191 error = ENOBUFS;
3192 goto done;
3193 }
3194
3195 /*
3196 * Commit data on the server, as required.
3197 * If all bufs are using the same wcred, then use that with
3198 * one call for all of them, otherwise commit each one
3199 * separately.
3200 */
3201 if (wcred_set == 1) {
3202 /*
3203 * Note, it's possible the commit range could be >2^32-1.
3204 * If it is, we'll send one commit that covers the whole file.
3205 */
3206 if ((endoff - off) > 0xffffffff)
3207 count = 0;
3208 else
3209 count = (endoff - off);
3210 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred);
3211 } else {
3212 retv = 0;
3213 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3214 toff = NBOFF(bp) + bp->nb_dirtyoff;
3215 count = bp->nb_dirtyend - bp->nb_dirtyoff;
3216 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred);
3217 if (retv)
3218 break;
55e303ae 3219 }
2d21ac55
A
3220 }
3221
3222 /*
3223 * Now, either mark the blocks I/O done or mark the
3224 * blocks dirty, depending on whether the commit
3225 * succeeded.
3226 */
3227 while ((bp = LIST_FIRST(&commitlist))) {
3228 LIST_REMOVE(bp, nb_vnbufs);
3229 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
3230 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3231 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3232 np->n_needcommitcnt--;
3233 CHECK_NEEDCOMMITCNT(np);
3234 nfs_unlock(np);
3235
3236 if (retv) {
3237 /* move back to dirty list */
3238 lck_mtx_lock(nfs_buf_mutex);
3239 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3240 lck_mtx_unlock(nfs_buf_mutex);
3241 nfs_buf_release(bp, 1);
3242 continue;
1c79356b 3243 }
2d21ac55
A
3244
3245 vnode_startwrite(NFSTOV(np));
3246 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3247 lck_mtx_lock(nfs_buf_mutex);
3248 nfs_nbdwrite--;
3249 NFSBUFCNTCHK();
3250 lck_mtx_unlock(nfs_buf_mutex);
3251 wakeup(&nfs_nbdwrite);
1c79356b 3252 }
2d21ac55
A
3253 CLR(bp->nb_flags, (NB_READ|NB_DONE|NB_ERROR|NB_DELWRI));
3254 /* if block still has dirty pages, we don't want it to */
3255 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
3256 if (!(dirty = bp->nb_dirty))
3257 SET(bp->nb_flags, NB_ASYNC);
3258 else
3259 CLR(bp->nb_flags, NB_ASYNC);
1c79356b 3260
2d21ac55
A
3261 /* move to clean list */
3262 lck_mtx_lock(nfs_buf_mutex);
3263 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3264 lck_mtx_unlock(nfs_buf_mutex);
3265
3266 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3267
3268 nfs_buf_iodone(bp);
3269 if (dirty) {
3270 /* throw it back in as a delayed write buffer */
3271 CLR(bp->nb_flags, NB_DONE);
3272 nfs_buf_write_delayed(bp);
55e303ae 3273 }
2d21ac55 3274 }
1c79356b 3275
2d21ac55
A
3276done:
3277 FSDBG_BOT(557, np, 0, 0, error);
3278 return (error);
3279}
3280
3281/*
3282 * Flush all the blocks associated with a vnode.
3283 * Walk through the buffer pool and push any dirty pages
3284 * associated with the vnode.
3285 */
3286int
3287nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3288{
3289 struct nfsbuf *bp;
3290 struct nfsbuflists blist;
3291 struct nfsmount *nmp = NFSTONMP(np);
3292 int error = 0, error2, slptimeo = 0, slpflag = 0;
3293 int nfsvers, flags, passone = 1;
3294
3295 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3296
3297 if (!nmp) {
3298 error = ENXIO;
3299 goto out;
3300 }
3301 nfsvers = nmp->nm_vers;
3302 if (nmp->nm_flag & NFSMNT_INT)
3303 slpflag = PCATCH;
3304
3305 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3306 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3307 np->n_flag |= NMODIFIED;
3308 nfs_unlock(np);
3309 }
3310
3311 lck_mtx_lock(nfs_buf_mutex);
3312 while (np->n_bflag & NBFLUSHINPROG) {
3313 np->n_bflag |= NBFLUSHWANT;
3314 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3315 if (error) {
3316 lck_mtx_unlock(nfs_buf_mutex);
3317 goto out;
3318 }
3319 }
3320 np->n_bflag |= NBFLUSHINPROG;
3321
3322 /*
3323 * On the first pass, start async/unstable writes on all
3324 * delayed write buffers. Then wait for all writes to complete
3325 * and call nfs_flushcommits() to commit any uncommitted buffers.
3326 * On all subsequent passes, start STABLE writes on any remaining
3327 * dirty buffers. Then wait for all writes to complete.
3328 */
3329again:
3330 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3331 if (!NFSTONMP(np)) {
3332 lck_mtx_unlock(nfs_buf_mutex);
3333 error = ENXIO;
3334 goto done;
3335 }
3336
3337 /* Start/do any write(s) that are required. */
3338 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3339 while ((bp = LIST_FIRST(&blist))) {
3340 LIST_REMOVE(bp, nb_vnbufs);
3341 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3342 flags = (passone || (waitfor != MNT_WAIT)) ? NBAC_NOWAIT : 0;
3343 if (flags != NBAC_NOWAIT)
3344 nfs_buf_refget(bp);
3345 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3346 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
3347 if (error == EBUSY)
3348 break;
3349 if (error) {
3350 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3351 if (error2) {
3352 if (flags != NBAC_NOWAIT)
3353 nfs_buf_refrele(bp);
3354 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3355 lck_mtx_unlock(nfs_buf_mutex);
3356 error = error2;
3357 goto done;
3358 }
3359 if (slpflag == PCATCH) {
3360 slpflag = 0;
3361 slptimeo = 2 * hz;
3362 }
3363 }
3364 }
3365 if (flags != NBAC_NOWAIT)
3366 nfs_buf_refrele(bp);
3367 if (error == EBUSY)
3368 continue;
3369 if (!bp->nb_np) {
3370 /* buffer is no longer valid */
3371 nfs_buf_drop(bp);
3372 continue;
3373 }
3374 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3375 nfs_buf_check_write_verifier(np, bp);
3376 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3377 /* buffer is no longer dirty */
3378 nfs_buf_drop(bp);
3379 continue;
3380 }
3381 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
3382 if ((passone || (waitfor != MNT_WAIT)) &&
3383 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3384 nfs_buf_drop(bp);
3385 continue;
3386 }
3387 nfs_buf_remfree(bp);
3388 lck_mtx_unlock(nfs_buf_mutex);
3389 if (ISSET(bp->nb_flags, NB_ERROR)) {
3390 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3391 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3392 np->n_flag |= NWRITEERR;
3393 nfs_unlock(np);
3394 nfs_buf_release(bp, 1);
3395 lck_mtx_lock(nfs_buf_mutex);
3396 continue;
3397 }
3398 SET(bp->nb_flags, NB_ASYNC);
3399 if (!passone) {
3400 /* NB_STABLE forces this to be written FILESYNC */
3401 SET(bp->nb_flags, NB_STABLE);
3402 }
3403 nfs_buf_write(bp);
3404 lck_mtx_lock(nfs_buf_mutex);
3405 }
3406 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3407 }
3408 lck_mtx_unlock(nfs_buf_mutex);
3409
3410 if (waitfor == MNT_WAIT) {
3411 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3412 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3413 if (error2) {
3414 error = error2;
3415 goto done;
3416 }
3417 if (slpflag == PCATCH) {
3418 slpflag = 0;
3419 slptimeo = 2 * hz;
55e303ae 3420 }
2d21ac55
A
3421 }
3422 }
55e303ae 3423
2d21ac55
A
3424 if (nfsvers != NFS_VER2) {
3425 /* loop while it looks like there are still buffers to be */
 3426 /* committed and nfs_flushcommits() seems to be handling them. */
3427 while (np->n_needcommitcnt)
3428 if (nfs_flushcommits(np, 0))
3429 break;
3430 }
55e303ae 3431
2d21ac55
A
3432 if (passone) {
3433 passone = 0;
3434 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3435 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3436 np->n_flag |= NMODIFIED;
3437 nfs_unlock(np);
3438 }
3439 lck_mtx_lock(nfs_buf_mutex);
3440 goto again;
3441 }
55e303ae 3442
2d21ac55
A
3443 if (waitfor == MNT_WAIT) {
3444 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
3445 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3446 np->n_flag |= NMODIFIED;
3447 nfs_unlock(np);
3448 }
3449 lck_mtx_lock(nfs_buf_mutex);
3450 if (!LIST_EMPTY(&np->n_dirtyblkhd))
3451 goto again;
3452 lck_mtx_unlock(nfs_buf_mutex);
3453 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3454 /* if we have no dirty blocks, we can clear the modified flag */
3455 if (!np->n_wrbusy)
3456 np->n_flag &= ~NMODIFIED;
3457 } else {
3458 nfs_lock(np, NFS_NODE_LOCK_FORCE);
0c530ab8
A
3459 }
3460
2d21ac55
A
3461 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3462 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3463 error = np->n_error;
3464 np->n_flag &= ~NWRITEERR;
3465 }
3466 nfs_unlock(np);
3467done:
3468 lck_mtx_lock(nfs_buf_mutex);
3469 flags = np->n_bflag;
3470 np->n_bflag &= ~(NBFLUSHINPROG|NBFLUSHWANT);
3471 lck_mtx_unlock(nfs_buf_mutex);
3472 if (flags & NBFLUSHWANT)
3473 wakeup(&np->n_bflag);
3474out:
3475 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
0c530ab8 3476 return (error);
1c79356b
A
3477}
3478
1c79356b 3479/*
55e303ae
A
3480 * Flush out and invalidate all buffers associated with a vnode.
3481 * Called with the underlying object locked.
1c79356b 3482 */
55e303ae 3483static int
91447636 3484nfs_vinvalbuf_internal(
2d21ac55 3485 nfsnode_t np,
91447636 3486 int flags,
2d21ac55 3487 thread_t thd,
91447636 3488 kauth_cred_t cred,
91447636
A
3489 int slpflag,
3490 int slptimeo)
1c79356b 3491{
55e303ae 3492 struct nfsbuf *bp;
91447636
A
3493 struct nfsbuflists blist;
3494 int list, error = 0;
9bccf70c 3495
55e303ae 3496 if (flags & V_SAVE) {
2d21ac55 3497 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR))))
55e303ae 3498 return (error);
9bccf70c
A
3499 }
3500
91447636 3501 lck_mtx_lock(nfs_buf_mutex);
55e303ae 3502 for (;;) {
91447636
A
3503 list = NBI_CLEAN;
3504 if (nfs_buf_iterprepare(np, &blist, list)) {
3505 list = NBI_DIRTY;
3506 if (nfs_buf_iterprepare(np, &blist, list))
3507 break;
3508 }
3509 while ((bp = LIST_FIRST(&blist))) {
3510 LIST_REMOVE(bp, nb_vnbufs);
3511 if (list == NBI_CLEAN)
3512 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3513 else
3514 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3515 nfs_buf_refget(bp);
3516 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2d21ac55 3517 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
91447636 3518 if (error != EAGAIN) {
2d21ac55 3519 FSDBG(554, np, bp, -1, error);
91447636
A
3520 nfs_buf_refrele(bp);
3521 nfs_buf_itercomplete(np, &blist, list);
3522 lck_mtx_unlock(nfs_buf_mutex);
55e303ae 3523 return (error);
3524 }
55e303ae 3525 }
91447636 3526 nfs_buf_refrele(bp);
2d21ac55 3527 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
91447636 3528 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 3529 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
91447636 3530 (NBOFF(bp) < (off_t)np->n_size)) {
2d21ac55 3531 /* extra paranoia: make sure we're not */
55e303ae 3532 /* somehow leaving any dirty data around */
3533 int mustwrite = 0;
91447636 3534 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3535 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
55e303ae 3536 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3537 error = nfs_buf_upl_setup(bp);
3538 if (error == EINVAL) {
3539 /* vm object must no longer exist */
3540 /* hopefully we don't need to do */
3541 /* anything for this buffer */
3542 } else if (error)
91447636 3543 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
55e303ae 3544 bp->nb_valid = bp->nb_dirty = 0;
3545 }
3546 nfs_buf_upl_check(bp);
3547 /* check for any dirty data before the EOF */
2d21ac55 3548 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
55e303ae 3549 /* clip dirty range to EOF */
2d21ac55 3550 if (bp->nb_dirtyend > end) {
55e303ae 3551 bp->nb_dirtyend = end;
2d21ac55 3552 if (bp->nb_dirtyoff >= bp->nb_dirtyend)
3553 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3554 }
3555 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end))
3556 mustwrite++;
55e303ae 3557 }
3558 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2d21ac55 3559 if (bp->nb_dirty)
3560 mustwrite++;
91447636 3561 /* also make sure we'll have a credential to do the write */
0c530ab8 3562 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
91447636 3563 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3564 mustwrite = 0;
3565 }
55e303ae 3566 if (mustwrite) {
2d21ac55 3567 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
55e303ae 3568 if (!ISSET(bp->nb_flags, NB_PAGELIST))
3569 panic("nfs_vinvalbuf: dirty buffer without upl");
3570 /* gotta write out dirty data before invalidating */
3571 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3572 /* (NB_NOCACHE indicates buffer should be discarded) */
3573 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3574 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
0c530ab8 3575 if (!IS_VALID_CRED(bp->nb_wcred)) {
91447636 3576 kauth_cred_ref(cred);
3577 bp->nb_wcred = cred;
3578 }
55e303ae 3579 error = nfs_buf_write(bp);
3580 // Note: bp has been released
3581 if (error) {
3582 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2d21ac55 3583 nfs_lock(np, NFS_NODE_LOCK_FORCE);
55e303ae 3584 np->n_error = error;
3585 np->n_flag |= NWRITEERR;
2d21ac55 3586 nfs_unlock(np);
91447636 3587 /*
3588 * There was a write error and we need to
3589 * invalidate attrs to sync with server.
3590 * (if this write was extending the file,
3591 * we may no longer know the correct size)
3592 */
3593 NATTRINVALIDATE(np);
55e303ae 3594 error = 0;
3595 }
91447636 3596 lck_mtx_lock(nfs_buf_mutex);
3597 continue;
55e303ae 3598 }
3599 }
3600 SET(bp->nb_flags, NB_INVAL);
91447636 3601 // hold off on FREEUPs until we're done here
483a1d10 3602 nfs_buf_release(bp, 0);
91447636 3603 lck_mtx_lock(nfs_buf_mutex);
55e303ae 3604 }
91447636 3605 nfs_buf_itercomplete(np, &blist, list);
55e303ae 3606 }
2d21ac55 3607 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd))
3608 panic("nfs_vinvalbuf: flush/inval failed");
91447636 3609 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 3610 if (!(flags & V_SAVE)) {
3611 nfs_lock(np, NFS_NODE_LOCK_FORCE);
3612 np->n_flag &= ~NMODIFIED;
3613 nfs_unlock(np);
3614 }
483a1d10 3615 NFS_BUF_FREEUP();
55e303ae 3616 return (0);
1c79356b 3617}
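/*
 * Summary of the V_SAVE path above: nfs_vinvalbuf_internal() first pushes
 * dirty data to the server via nfs_flush(..., MNT_WAIT, ...), then walks
 * both buffer lists.  Any buffer still holding dirty data below EOF is
 * written synchronously (NB_STABLE) and discarded (NB_NOCACHE) before being
 * invalidated; dirty ranges beyond the current file size are simply clipped.
 * Without V_SAVE the buffers are invalidated outright and NMODIFIED is
 * cleared.
 */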
3618
55e303ae 3619
1c79356b 3620/*
3621 * Flush and invalidate all dirty buffers. If another process is already
3622 * doing the flush, just wait for completion.
3623 */
3624int
2d21ac55 3625 nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3626{
3627 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3628}
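/*
 * Illustrative usage (hypothetical caller, not from the original source):
 * a vnode operation that needs to drop cached data but keep what has been
 * written would typically push dirty buffers first and allow interruption
 * on "intr" mounts:
 *
 *	error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1);
 *
 * A caller that intends to throw the cached data away entirely (e.g. after
 * deciding the cache is stale) could pass 0 for the flags instead.
 */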
3629
3630int
3631nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
1c79356b 3632{
2d21ac55 3633 nfsnode_t np = VTONFS(vp);
3634 struct nfsmount *nmp = VTONMP(vp);
3635 int error, rv, slpflag, slptimeo, nflags;
91447636 3636 off_t size;
1c79356b 3637
2d21ac55 3638 FSDBG_TOP(554, np, flags, intrflg, 0);
55e303ae 3639
2d21ac55 3640 if (nmp && !(nmp->nm_flag & NFSMNT_INT))
1c79356b 3641 intrflg = 0;
3642 if (intrflg) {
3643 slpflag = PCATCH;
3644 slptimeo = 2 * hz;
3645 } else {
3646 slpflag = 0;
3647 slptimeo = 0;
3648 }
1c79356b 3649
2d21ac55 3650 /* First wait for any other process doing a flush to complete. */
3651 lck_mtx_lock(nfs_buf_mutex);
3652 while (np->n_bflag & NBINVALINPROG) {
3653 np->n_bflag |= NBINVALWANT;
3654 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL);
55e303ae 3655 if (error) {
2d21ac55 3656 lck_mtx_unlock(nfs_buf_mutex);
55e303ae 3657 return (error);
1c79356b 3658 }
1c79356b 3659 }
2d21ac55 3660 np->n_bflag |= NBINVALINPROG;
3661 lck_mtx_unlock(nfs_buf_mutex);
3662
3663 /* Now, flush as required. */
3664 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3665 while (error) {
3666 FSDBG(554, np, 0, 0, error);
3667 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0)))
3668 goto done;
3669 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
1c79356b 3670 }
2d21ac55 3671
3672 /* get the pages out of vm also */
3673 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp)))
3674 if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE)))
91447636 3675 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
2d21ac55 3676done:
3677 lck_mtx_lock(nfs_buf_mutex);
3678 nflags = np->n_bflag;
3679 np->n_bflag &= ~(NBINVALINPROG|NBINVALWANT);
3680 lck_mtx_unlock(nfs_buf_mutex);
3681 if (nflags & NBINVALWANT)
3682 wakeup(&np->n_bflag);
91447636 3683
2d21ac55 3684 FSDBG_BOT(554, np, flags, intrflg, error);
3685 return (error);
1c79356b 3686}
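/*
 * Note on nfs_vinvalbuf2()'s retry loop: the first nfs_vinvalbuf_internal()
 * attempt uses the (possibly PCATCH) slpflag so a signal can interrupt it on
 * "intr" mounts; if it fails, nfs_sigintr() is consulted and subsequent
 * attempts fall back to a plain timeout (slptimeo) until the flush and
 * invalidate succeed.
 */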
3687
3688/*
2d21ac55 3689 * Add an async I/O request to the mount's async I/O queue and make
3690 * sure that an nfsiod will service it.
1c79356b 3691 */
2d21ac55 3692void
3693nfs_asyncio_finish(struct nfsreq *req)
1c79356b 3694{
3695 struct nfsmount *nmp;
2d21ac55 3696 struct nfsiod *niod;
3697 int started = 0;
1c79356b 3698
2d21ac55 3699 FSDBG_TOP(552, nmp, 0, 0, 0);
1c79356b 3700again:
2d21ac55 3701 if (((nmp = req->r_nmp)) == NULL)
3702 return;
3703 lck_mtx_lock(nfsiod_mutex);
3704 niod = nmp->nm_niod;
3705
3706 /* grab an nfsiod if we don't have one already */
3707 if (!niod) {
3708 niod = TAILQ_FIRST(&nfsiodfree);
3709 if (niod) {
3710 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
3711 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
3712 niod->niod_nmp = nmp;
3713 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
1c79356b 3714 /*
2d21ac55 3715 * Try starting a new thread.
3716 * We may try a couple times if other callers
3717 * get the new threads before we do.
1c79356b 3718 */
2d21ac55 3719 lck_mtx_unlock(nfsiod_mutex);
3720 started++;
3721 if (!nfsiod_start())
3722 goto again;
3723 lck_mtx_lock(nfsiod_mutex);
1c79356b 3724 }
91447636 3725 }
55e303ae 3726
2d21ac55 3727 if (req->r_achain.tqe_next == NFSREQNOLIST)
3728 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
3729
3730 /* If this mount doesn't already have an nfsiod working on it... */
3731 if (!nmp->nm_niod) {
3732 if (niod) { /* give it the nfsiod we just grabbed */
3733 nmp->nm_niod = niod;
3734 lck_mtx_unlock(nfsiod_mutex);
3735 wakeup(niod);
3736 } else if (nfsiod_thread_count > 0) {
3737 /* just queue it up on the nfsiod mounts queue */
3738 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
3739 lck_mtx_unlock(nfsiod_mutex);
3740 } else {
3741 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
3742 lck_mtx_unlock(nfsiod_mutex);
3743 /* we have no other option but to be persistent */
3744 started = 0;
3745 goto again;
1c79356b 3746 }
2d21ac55 3747 } else {
3748 lck_mtx_unlock(nfsiod_mutex);
1c79356b 3749 }
3750
2d21ac55 3751 FSDBG_BOT(552, nmp, 0, 0, 0);
3752}
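/*
 * nfsiod hand-off, in brief: the request is queued on the mount's nm_iodq
 * and then the code makes sure somebody will service it -- an nfsiod already
 * assigned to the mount, a free one taken from nfsiodfree, or a newly
 * started thread (up to NFSIOD_MAX).  If threads exist but none is free, the
 * mount is parked on the nfsiodmounts queue for a thread to pick up later;
 * if no threads exist at all, the whole attempt is retried.
 */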
1c79356b 3753
2d21ac55 3754/*
3755 * queue up async I/O request for resend
3756 */
3757void
3758nfs_asyncio_resend(struct nfsreq *req)
3759{
3760 struct nfsmount *nmp = req->r_nmp;
1c79356b 3761
2d21ac55 3762 if (!nmp)
3763 return;
3764 nfs_gss_clnt_rpcdone(req);
3765 lck_mtx_lock(&nmp->nm_lock);
3766 if (req->r_rchain.tqe_next == NFSREQNOLIST) {
3767 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
3768 req->r_flags |= R_RESENDQ;
1c79356b 3769 }
2d21ac55 3770 nfs_mount_sock_thread_wake(nmp);
3771 lck_mtx_unlock(&nmp->nm_lock);
1c79356b 3772}
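/*
 * Resend queueing, in brief: the request is placed on the mount's nm_resendq
 * (flagged R_RESENDQ) and the mount's socket thread is woken via
 * nfs_mount_sock_thread_wake() to perform the actual retransmission.
 */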
3773
3774/*
2d21ac55 3775 * Read an NFS buffer for a directory.
1c79356b 3776 */
3777int
2d21ac55 3778nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
1c79356b 3779{
2d21ac55 3780 nfsnode_t np;
91447636 3781 vnode_t vp;
1c79356b 3782 struct nfsmount *nmp;
2d21ac55 3783 int error = 0, nfsvers;
1c79356b 3784 struct uio uio;
91447636 3785 struct iovec_32 io;
1c79356b 3786
2d21ac55 3787 np = bp->nb_np;
3788 vp = NFSTOV(np);
3789 nmp = VTONMP(vp);
3790 nfsvers = nmp->nm_vers;
3791 uio.uio_iovs.iov32p = &io;
3792 uio.uio_iovcnt = 1;
91447636 3793#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2d21ac55 3794 uio.uio_segflg = UIO_SYSSPACE;
91447636 3795#else
2d21ac55 3796 uio.uio_segflg = UIO_SYSSPACE32;
3797#endif
1c79356b 3798
2d21ac55 3799 /* sanity check */
3800 if (ISSET(bp->nb_flags, NB_DONE))
55e303ae 3801 CLR(bp->nb_flags, NB_DONE);
55e303ae 3802
2d21ac55 3803 uio.uio_rw = UIO_READ;
3804 io.iov_len = bp->nb_bufsize;
3805 uio_uio_resid_set(&uio, io.iov_len);
3806 io.iov_base = (uintptr_t) bp->nb_data;
3807 uio.uio_offset = NBOFF(bp);
55e303ae 3808
2d21ac55 3809 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
3810 if (nfsvers < NFS_VER4) {
3811 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
3812 error = nfs3_readdirplus_rpc(np, &uio, ctx);
3813 if (error == NFSERR_NOTSUPP) {
3814 lck_mtx_lock(&nmp->nm_lock);
3815 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
3816 lck_mtx_unlock(&nmp->nm_lock);
1c79356b 3817 }
fa4905b1 3818 }
2d21ac55 3819 if (!(nmp->nm_flag & NFSMNT_RDIRPLUS))
3820 error = nfs3_readdir_rpc(np, &uio, ctx);
3821 } else {
3822 error = nfs4_readdir_rpc(np, &uio, ctx);
3823 }
3824 if (error) {
55e303ae 3825 SET(bp->nb_flags, NB_ERROR);
3826 bp->nb_error = error;
2d21ac55 3827 } else {
3828 bp->nb_validoff = 0;
3829 bp->nb_validend = uio.uio_offset - NBOFF(bp);
3830 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
1c79356b 3831 }
1c79356b 3832
55e303ae 3833 nfs_buf_iodone(bp);
1c79356b 3834 return (error);
3835}
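/*
 * Worked example for the nb_valid computation above (assuming the usual
 * 4 KB PAGE_SIZE): if the readdir data placed in the buffer advanced the uio
 * offset by 6000 bytes, nb_validend is 6000, round_page_32(6000) is 8192,
 * 8192 / 4096 is 2 pages, and (1 << 2) - 1 == 0x3 marks the first two pages
 * of the buffer as valid.
 */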