]> git.saurik.com Git - apple/xnu.git/blame - bsd/nfs/nfs_bio.c
xnu-6153.141.1.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_bio.c
CommitLineData
1c79356b 1/*
cb323159 2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
0a7de745 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745 17 *
2d21ac55
A
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
0a7de745 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * This code is derived from software contributed to Berkeley by
34 * Rick Macklem at The University of Guelph.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
65 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
66 */
ea3f0419
A
67
68#include <nfs/nfs_conf.h>
69#if CONFIG_NFS_CLIENT
70
1c79356b
A
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/resourcevar.h>
74#include <sys/signalvar.h>
91447636
A
75#include <sys/proc_internal.h>
76#include <sys/kauth.h>
55e303ae 77#include <sys/malloc.h>
1c79356b 78#include <sys/vnode.h>
55e303ae 79#include <sys/dirent.h>
91447636 80#include <sys/mount_internal.h>
1c79356b 81#include <sys/kernel.h>
91447636
A
82#include <sys/ubc_internal.h>
83#include <sys/uio_internal.h>
6d2010ae 84#include <sys/kpi_mbuf.h>
1c79356b
A
85
86#include <sys/vm.h>
87#include <sys/vmparam.h>
88
89#include <sys/time.h>
90#include <kern/clock.h>
91447636
A
91#include <libkern/OSAtomic.h>
92#include <kern/kalloc.h>
2d21ac55 93#include <kern/thread_call.h>
1c79356b
A
94
95#include <nfs/rpcv2.h>
96#include <nfs/nfsproto.h>
97#include <nfs/nfs.h>
2d21ac55 98#include <nfs/nfs_gss.h>
1c79356b 99#include <nfs/nfsmount.h>
1c79356b 100#include <nfs/nfsnode.h>
91447636 101#include <sys/buf_internal.h>
2d21ac55 102#include <libkern/OSAtomic.h>
cb323159 103#include <os/refcnt.h>
1c79356b 104
39037602
A
105#define NFS_BIO_DBG(...) NFS_DBG(NFS_FAC_BIO, 7, ## __VA_ARGS__)
106
0a7de745 107kern_return_t thread_terminate(thread_t); /* XXX */
55e303ae 108
0a7de745 109#define NFSBUFHASH(np, lbn) \
91447636 110 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
0a7de745 111LIST_HEAD(nfsbufhashhead, nfsbuf) * nfsbufhashtbl;
483a1d10 112struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
55e303ae 113u_long nfsbufhash;
91447636 114int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
483a1d10 115int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
55e303ae 116int nfs_nbdwrite;
2d21ac55
A
117int nfs_buf_timer_on = 0;
118thread_t nfsbufdelwrithd = NULL;
483a1d10 119
91447636 120lck_grp_t *nfs_buf_lck_grp;
91447636
A
121lck_mtx_t *nfs_buf_mutex;
122
0a7de745
A
123#define NFSBUF_FREE_PERIOD 30 /* seconds */
124#define NFSBUF_LRU_STALE 120
125#define NFSBUF_META_STALE 240
483a1d10
A
126
127/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
0a7de745 128#define LRU_TO_FREEUP 6
483a1d10 129/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
0a7de745 130#define META_TO_FREEUP 3
483a1d10 131/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
0a7de745 132#define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
2d21ac55 133/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from timer */
0a7de745 134#define LRU_FREEUP_FRAC_ON_TIMER 8
2d21ac55 135/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from timer */
0a7de745 136#define META_FREEUP_FRAC_ON_TIMER 16
483a1d10 137/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
0a7de745 138#define LRU_FREEUP_MIN_FRAC 4
483a1d10 139/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
0a7de745 140#define META_FREEUP_MIN_FRAC 2
55e303ae 141
483a1d10 142#define NFS_BUF_FREEUP() \
91447636 143 do { \
0a7de745
A
144 /* only call nfs_buf_freeup() if it has work to do: */ \
145 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
146 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
147 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
148 nfs_buf_freeup(0); \
483a1d10 149 } while (0)
55e303ae
A
150
151/*
152 * Initialize nfsbuf lists
153 */
154void
155nfs_nbinit(void)
156{
2d21ac55
A
157 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", LCK_GRP_ATTR_NULL);
158 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, LCK_ATTR_NULL);
91447636
A
159
160 nfsbufcnt = nfsbufmetacnt =
0a7de745 161 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
91447636 162 nfsbufmin = 128;
2d21ac55
A
163 /* size nfsbufmax to cover at most half sane_size (w/default buf size) */
164 nfsbufmax = (sane_size >> PAGE_SHIFT) / (2 * (NFS_RWSIZE >> PAGE_SHIFT));
165 nfsbufmetamax = nfsbufmax / 4;
55e303ae
A
166 nfsneedbuffer = 0;
167 nfs_nbdwrite = 0;
91447636 168
0a7de745 169 nfsbufhashtbl = hashinit(nfsbufmax / 4, M_TEMP, &nfsbufhash);
91447636
A
170 TAILQ_INIT(&nfsbuffree);
171 TAILQ_INIT(&nfsbuffreemeta);
172 TAILQ_INIT(&nfsbufdelwri);
55e303ae
A
173}
174
2d21ac55
A
175/*
176 * Check periodically for stale/unused nfs bufs
177 */
178void
179nfs_buf_timer(__unused void *param0, __unused void *param1)
180{
181 nfs_buf_freeup(1);
182
183 lck_mtx_lock(nfs_buf_mutex);
184 if (nfsbufcnt <= nfsbufmin) {
185 nfs_buf_timer_on = 0;
186 lck_mtx_unlock(nfs_buf_mutex);
187 return;
188 }
189 lck_mtx_unlock(nfs_buf_mutex);
190
191 nfs_interval_timer_start(nfs_buf_timer_call,
0a7de745 192 NFSBUF_FREE_PERIOD * 1000);
2d21ac55
A
193}
194
55e303ae
A
195/*
196 * try to free up some excess, unused nfsbufs
197 */
483a1d10
A
198void
199nfs_buf_freeup(int timer)
55e303ae
A
200{
201 struct nfsbuf *fbp;
483a1d10
A
202 struct timeval now;
203 int count;
91447636
A
204 struct nfsbuffreehead nfsbuffreeup;
205
206 TAILQ_INIT(&nfsbuffreeup);
207
208 lck_mtx_lock(nfs_buf_mutex);
55e303ae 209
483a1d10 210 microuptime(&now);
55e303ae 211
91447636
A
212 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
213
0a7de745 214 count = timer ? nfsbuffreecnt / LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
483a1d10 215 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
55e303ae 216 fbp = TAILQ_FIRST(&nfsbuffree);
0a7de745 217 if (!fbp) {
55e303ae 218 break;
0a7de745 219 }
cb323159 220 if (os_ref_get_count(&fbp->nb_refs) > 1) {
91447636 221 break;
0a7de745 222 }
91447636 223 if (NBUFSTAMPVALID(fbp) &&
0a7de745 224 (fbp->nb_timestamp + (2 * NFSBUF_LRU_STALE)) > now.tv_sec) {
483a1d10 225 break;
0a7de745 226 }
483a1d10 227 nfs_buf_remfree(fbp);
2d21ac55
A
228 /* disassociate buffer from any nfsnode */
229 if (fbp->nb_np) {
483a1d10
A
230 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
231 LIST_REMOVE(fbp, nb_vnbufs);
232 fbp->nb_vnbufs.le_next = NFSNOLIST;
233 }
2d21ac55 234 fbp->nb_np = NULL;
483a1d10
A
235 }
236 LIST_REMOVE(fbp, nb_hash);
91447636 237 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
483a1d10
A
238 nfsbufcnt--;
239 }
240
0a7de745 241 count = timer ? nfsbuffreemetacnt / META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
483a1d10
A
242 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
243 fbp = TAILQ_FIRST(&nfsbuffreemeta);
0a7de745 244 if (!fbp) {
483a1d10 245 break;
0a7de745 246 }
cb323159 247 if (os_ref_get_count(&fbp->nb_refs) > 1) {
91447636 248 break;
0a7de745 249 }
91447636 250 if (NBUFSTAMPVALID(fbp) &&
0a7de745 251 (fbp->nb_timestamp + (2 * NFSBUF_META_STALE)) > now.tv_sec) {
483a1d10 252 break;
0a7de745 253 }
55e303ae 254 nfs_buf_remfree(fbp);
2d21ac55
A
255 /* disassociate buffer from any nfsnode */
256 if (fbp->nb_np) {
55e303ae
A
257 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
258 LIST_REMOVE(fbp, nb_vnbufs);
259 fbp->nb_vnbufs.le_next = NFSNOLIST;
260 }
2d21ac55 261 fbp->nb_np = NULL;
55e303ae
A
262 }
263 LIST_REMOVE(fbp, nb_hash);
91447636
A
264 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
265 nfsbufcnt--;
266 nfsbufmetacnt--;
267 }
268
269 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
2d21ac55 270 NFSBUFCNTCHK();
91447636
A
271
272 lck_mtx_unlock(nfs_buf_mutex);
273
274 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
275 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
55e303ae 276 /* nuke any creds */
0a7de745 277 if (IS_VALID_CRED(fbp->nb_rcred)) {
0c530ab8 278 kauth_cred_unref(&fbp->nb_rcred);
0a7de745
A
279 }
280 if (IS_VALID_CRED(fbp->nb_wcred)) {
0c530ab8 281 kauth_cred_unref(&fbp->nb_wcred);
0a7de745 282 }
91447636 283 /* if buf was NB_META, dump buffer */
0a7de745 284 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
91447636 285 kfree(fbp->nb_data, fbp->nb_bufsize);
0a7de745 286 }
55e303ae 287 FREE(fbp, M_TEMP);
55e303ae 288 }
55e303ae
A
289}
290
91447636
A
291/*
292 * remove a buffer from the freelist
293 * (must be called with nfs_buf_mutex held)
294 */
55e303ae
A
295void
296nfs_buf_remfree(struct nfsbuf *bp)
297{
0a7de745 298 if (bp->nb_free.tqe_next == NFSNOLIST) {
55e303ae 299 panic("nfsbuf not on free list");
0a7de745 300 }
55e303ae
A
301 if (ISSET(bp->nb_flags, NB_DELWRI)) {
302 nfsbufdelwricnt--;
303 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
91447636 304 } else if (ISSET(bp->nb_flags, NB_META)) {
483a1d10
A
305 nfsbuffreemetacnt--;
306 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
55e303ae
A
307 } else {
308 nfsbuffreecnt--;
309 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
310 }
311 bp->nb_free.tqe_next = NFSNOLIST;
2d21ac55 312 NFSBUFCNTCHK();
55e303ae
A
313}
314
315/*
316 * check for existence of nfsbuf in cache
317 */
91447636 318boolean_t
2d21ac55 319nfs_buf_is_incore(nfsnode_t np, daddr64_t blkno)
91447636
A
320{
321 boolean_t rv;
322 lck_mtx_lock(nfs_buf_mutex);
0a7de745 323 if (nfs_buf_incore(np, blkno)) {
91447636 324 rv = TRUE;
0a7de745 325 } else {
91447636 326 rv = FALSE;
0a7de745 327 }
91447636 328 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 329 return rv;
91447636
A
330}
331
332/*
333 * return incore buffer (must be called with nfs_buf_mutex held)
334 */
55e303ae 335struct nfsbuf *
2d21ac55 336nfs_buf_incore(nfsnode_t np, daddr64_t blkno)
55e303ae
A
337{
338 /* Search hash chain */
2d21ac55 339 struct nfsbuf * bp = NFSBUFHASH(np, blkno)->lh_first;
0a7de745 340 for (; bp != NULL; bp = bp->nb_hash.le_next) {
2d21ac55 341 if ((bp->nb_lblkno == blkno) && (bp->nb_np == np)) {
483a1d10 342 if (!ISSET(bp->nb_flags, NB_INVAL)) {
2d21ac55 343 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_np);
0a7de745 344 return bp;
483a1d10
A
345 }
346 }
0a7de745
A
347 }
348 return NULL;
55e303ae
A
349}
350
351/*
352 * Check if it's OK to drop a page.
353 *
354 * Called by vnode_pager() on pageout request of non-dirty page.
355 * We need to make sure that it's not part of a delayed write.
356 * If it is, we can't let the VM drop it because we may need it
357 * later when/if we need to write the data (again).
358 */
359int
91447636 360nfs_buf_page_inval(vnode_t vp, off_t offset)
55e303ae 361{
2d21ac55 362 struct nfsmount *nmp = VTONMP(vp);
55e303ae 363 struct nfsbuf *bp;
91447636
A
364 int error = 0;
365
0a7de745
A
366 if (nfs_mount_gone(nmp)) {
367 return ENXIO;
368 }
2d21ac55 369
91447636 370 lck_mtx_lock(nfs_buf_mutex);
2d21ac55 371 bp = nfs_buf_incore(VTONFS(vp), (daddr64_t)(offset / nmp->nm_biosize));
0a7de745 372 if (!bp) {
91447636 373 goto out;
0a7de745 374 }
55e303ae 375 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
91447636
A
376 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
377 error = EBUSY;
378 goto out;
379 }
55e303ae
A
380 /*
381 * If there's a dirty range in the buffer, check to
382 * see if this page intersects with the dirty range.
383 * If it does, we can't let the pager drop the page.
384 */
385 if (bp->nb_dirtyend > 0) {
386 int start = offset - NBOFF(bp);
b0d623f7
A
387 if ((bp->nb_dirtyend > start) &&
388 (bp->nb_dirtyoff < (start + PAGE_SIZE))) {
389 /*
390 * Before returning the bad news, move the
391 * buffer to the start of the delwri list and
392 * give the list a push to try to flush the
393 * buffer out.
394 */
91447636 395 error = EBUSY;
b0d623f7
A
396 nfs_buf_remfree(bp);
397 TAILQ_INSERT_HEAD(&nfsbufdelwri, bp, nb_free);
398 nfsbufdelwricnt++;
399 nfs_buf_delwri_push(1);
400 }
55e303ae 401 }
91447636
A
402out:
403 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 404 return error;
55e303ae
A
405}
406
91447636
A
407/*
408 * set up the UPL for a buffer
409 * (must NOT be called with nfs_buf_mutex held)
410 */
55e303ae
A
411int
412nfs_buf_upl_setup(struct nfsbuf *bp)
413{
414 kern_return_t kret;
415 upl_t upl;
91447636 416 int upl_flags;
55e303ae 417
0a7de745
A
418 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
419 return 0;
420 }
55e303ae 421
91447636 422 upl_flags = UPL_PRECIOUS;
2d21ac55 423 if (!ISSET(bp->nb_flags, NB_READ)) {
91447636
A
424 /*
425 * We're doing a "write", so we intend to modify
426 * the pages we're gathering.
427 */
428 upl_flags |= UPL_WILL_MODIFY;
429 }
5ba3f43e 430 kret = ubc_create_upl_kernel(NFSTOV(bp->nb_np), NBOFF(bp), bp->nb_bufsize,
0a7de745 431 &upl, NULL, upl_flags, VM_KERN_MEMORY_FILE);
55e303ae
A
432 if (kret == KERN_INVALID_ARGUMENT) {
433 /* vm object probably doesn't exist any more */
434 bp->nb_pagelist = NULL;
0a7de745 435 return EINVAL;
55e303ae
A
436 }
437 if (kret != KERN_SUCCESS) {
438 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
439 bp->nb_pagelist = NULL;
0a7de745 440 return EIO;
55e303ae
A
441 }
442
2d21ac55 443 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_np);
55e303ae 444
55e303ae
A
445 bp->nb_pagelist = upl;
446 SET(bp->nb_flags, NB_PAGELIST);
0a7de745 447 return 0;
55e303ae
A
448}
449
91447636
A
450/*
451 * update buffer's valid/dirty info from UBC
452 * (must NOT be called with nfs_buf_mutex held)
453 */
55e303ae
A
454void
455nfs_buf_upl_check(struct nfsbuf *bp)
456{
457 upl_page_info_t *pl;
458 off_t filesize, fileoffset;
459 int i, npages;
460
0a7de745 461 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
55e303ae 462 return;
0a7de745 463 }
55e303ae
A
464
465 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
2d21ac55 466 filesize = ubc_getsize(NFSTOV(bp->nb_np));
55e303ae 467 fileoffset = NBOFF(bp);
0a7de745 468 if (fileoffset < filesize) {
55e303ae 469 SET(bp->nb_flags, NB_CACHE);
0a7de745 470 } else {
55e303ae 471 CLR(bp->nb_flags, NB_CACHE);
0a7de745 472 }
55e303ae
A
473
474 pl = ubc_upl_pageinfo(bp->nb_pagelist);
475 bp->nb_valid = bp->nb_dirty = 0;
476
0a7de745 477 for (i = 0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
55e303ae 478 /* anything beyond the end of the file is not valid or dirty */
0a7de745 479 if (fileoffset >= filesize) {
55e303ae 480 break;
0a7de745 481 }
55e303ae
A
482 if (!upl_valid_page(pl, i)) {
483 CLR(bp->nb_flags, NB_CACHE);
484 continue;
485 }
0a7de745
A
486 NBPGVALID_SET(bp, i);
487 if (upl_dirty_page(pl, i)) {
55e303ae 488 NBPGDIRTY_SET(bp, i);
0a7de745 489 }
55e303ae
A
490 }
491 fileoffset = NBOFF(bp);
492 if (ISSET(bp->nb_flags, NB_CACHE)) {
493 bp->nb_validoff = 0;
494 bp->nb_validend = bp->nb_bufsize;
0a7de745 495 if (fileoffset + bp->nb_validend > filesize) {
55e303ae 496 bp->nb_validend = filesize - fileoffset;
0a7de745 497 }
55e303ae
A
498 } else {
499 bp->nb_validoff = bp->nb_validend = -1;
500 }
501 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
502 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
503}
504
91447636
A
505/*
506 * make sure that a buffer is mapped
507 * (must NOT be called with nfs_buf_mutex held)
508 */
2d21ac55 509int
55e303ae
A
510nfs_buf_map(struct nfsbuf *bp)
511{
512 kern_return_t kret;
513
0a7de745
A
514 if (bp->nb_data) {
515 return 0;
516 }
517 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
518 return EINVAL;
519 }
55e303ae 520
b0d623f7 521 kret = ubc_upl_map(bp->nb_pagelist, (vm_offset_t *)&(bp->nb_data));
0a7de745 522 if (kret != KERN_SUCCESS) {
55e303ae 523 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
0a7de745
A
524 }
525 if (bp->nb_data == 0) {
55e303ae 526 panic("ubc_upl_map mapped 0");
0a7de745 527 }
55e303ae 528 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
0a7de745 529 return 0;
55e303ae
A
530}
531
55e303ae
A
532/*
533 * normalize an nfsbuf's valid range
534 *
535 * the read/write code guarantees that we'll always have a valid
536 * region that is an integral number of pages. If either end
537 * of the valid range isn't page-aligned, it gets corrected
538 * here as we extend the valid range through all of the
539 * contiguous valid pages.
540 */
2d21ac55
A
541void
542nfs_buf_normalize_valid_range(nfsnode_t np, struct nfsbuf *bp)
55e303ae
A
543{
544 int pg, npg;
545 /* pull validoff back to start of contiguous valid page range */
0a7de745
A
546 pg = bp->nb_validoff / PAGE_SIZE;
547 while (pg >= 0 && NBPGVALID(bp, pg)) {
55e303ae 548 pg--;
0a7de745
A
549 }
550 bp->nb_validoff = (pg + 1) * PAGE_SIZE;
55e303ae 551 /* push validend forward to end of contiguous valid page range */
0a7de745
A
552 npg = bp->nb_bufsize / PAGE_SIZE;
553 pg = bp->nb_validend / PAGE_SIZE;
554 while (pg < npg && NBPGVALID(bp, pg)) {
55e303ae 555 pg++;
0a7de745 556 }
55e303ae
A
557 bp->nb_validend = pg * PAGE_SIZE;
558 /* clip to EOF */
0a7de745 559 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size) {
55e303ae 560 bp->nb_validend = np->n_size % bp->nb_bufsize;
0a7de745 561 }
55e303ae
A
562}
563
564/*
2d21ac55
A
565 * process some entries on the delayed write queue
566 * (must be called with nfs_buf_mutex held)
55e303ae 567 */
b0d623f7 568void
2d21ac55 569nfs_buf_delwri_service(void)
55e303ae
A
570{
571 struct nfsbuf *bp;
2d21ac55
A
572 nfsnode_t np;
573 int error, i = 0;
55e303ae 574
55e303ae 575 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
2d21ac55 576 np = bp->nb_np;
55e303ae 577 nfs_buf_remfree(bp);
91447636 578 nfs_buf_refget(bp);
0a7de745
A
579 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN) {
580 ;
581 }
91447636 582 nfs_buf_refrele(bp);
0a7de745 583 if (error) {
91447636 584 break;
0a7de745 585 }
2d21ac55 586 if (!bp->nb_np) {
91447636
A
587 /* buffer is no longer valid */
588 nfs_buf_drop(bp);
589 continue;
590 }
0a7de745 591 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
8f6c56a5 592 nfs_buf_check_write_verifier(np, bp);
0a7de745 593 }
55e303ae
A
594 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
595 /* put buffer at end of delwri list */
596 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
597 nfsbufdelwricnt++;
91447636
A
598 nfs_buf_drop(bp);
599 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 600 nfs_flushcommits(np, 1);
55e303ae 601 } else {
91447636
A
602 SET(bp->nb_flags, NB_ASYNC);
603 lck_mtx_unlock(nfs_buf_mutex);
55e303ae
A
604 nfs_buf_write(bp);
605 }
606 i++;
91447636 607 lck_mtx_lock(nfs_buf_mutex);
55e303ae 608 }
2d21ac55
A
609}
610
611/*
612 * thread to service the delayed write queue when asked
613 */
b0d623f7 614void
2d21ac55
A
615nfs_buf_delwri_thread(__unused void *arg, __unused wait_result_t wr)
616{
cb323159 617 struct timespec ts = { .tv_sec = 30, .tv_nsec = 0 };
2d21ac55
A
618 int error = 0;
619
620 lck_mtx_lock(nfs_buf_mutex);
621 while (!error) {
622 nfs_buf_delwri_service();
623 error = msleep(&nfsbufdelwrithd, nfs_buf_mutex, 0, "nfsbufdelwri", &ts);
624 }
625 nfsbufdelwrithd = NULL;
626 lck_mtx_unlock(nfs_buf_mutex);
627 thread_terminate(nfsbufdelwrithd);
628}
629
630/*
631 * try to push out some delayed/uncommitted writes
632 * ("locked" indicates whether nfs_buf_mutex is already held)
633 */
b0d623f7 634void
2d21ac55
A
635nfs_buf_delwri_push(int locked)
636{
0a7de745 637 if (TAILQ_EMPTY(&nfsbufdelwri)) {
2d21ac55 638 return;
0a7de745
A
639 }
640 if (!locked) {
2d21ac55 641 lck_mtx_lock(nfs_buf_mutex);
0a7de745 642 }
2d21ac55 643 /* wake up the delayed write service thread */
0a7de745 644 if (nfsbufdelwrithd) {
2d21ac55 645 wakeup(&nfsbufdelwrithd);
0a7de745 646 } else if (kernel_thread_start(nfs_buf_delwri_thread, NULL, &nfsbufdelwrithd) == KERN_SUCCESS) {
2d21ac55 647 thread_deallocate(nfsbufdelwrithd);
0a7de745 648 }
2d21ac55 649 /* otherwise, try to do some of the work ourselves */
0a7de745 650 if (!nfsbufdelwrithd) {
2d21ac55 651 nfs_buf_delwri_service();
0a7de745
A
652 }
653 if (!locked) {
91447636 654 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 655 }
55e303ae
A
656}
657
658/*
91447636
A
659 * Get an nfs buffer.
660 *
661 * Returns errno on error, 0 otherwise.
662 * Any buffer is returned in *bpp.
663 *
664 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
665 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
666 *
667 * Check for existence of buffer in cache.
668 * Or attempt to reuse a buffer from one of the free lists.
669 * Or allocate a new buffer if we haven't already hit max allocation.
670 * Or wait for a free buffer.
671 *
672 * If available buffer found, prepare it, and return it.
673 *
674 * If the calling process is interrupted by a signal for
675 * an interruptible mount point, return EINTR.
55e303ae 676 */
91447636 677int
55e303ae 678nfs_buf_get(
2d21ac55 679 nfsnode_t np,
91447636 680 daddr64_t blkno,
b0d623f7 681 uint32_t size,
2d21ac55 682 thread_t thd,
91447636
A
683 int flags,
684 struct nfsbuf **bpp)
55e303ae 685{
2d21ac55
A
686 vnode_t vp = NFSTOV(np);
687 struct nfsmount *nmp = VTONMP(vp);
55e303ae 688 struct nfsbuf *bp;
b0d623f7 689 uint32_t bufsize;
55e303ae 690 int slpflag = PCATCH;
91447636
A
691 int operation = (flags & NBLK_OPMASK);
692 int error = 0;
693 struct timespec ts;
55e303ae 694
2d21ac55 695 FSDBG_TOP(541, np, blkno, size, flags);
91447636 696 *bpp = NULL;
55e303ae
A
697
698 bufsize = size;
0a7de745 699 if (bufsize > NFS_MAXBSIZE) {
0c530ab8 700 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
0a7de745 701 }
55e303ae 702
fe8ab488 703 if (nfs_mount_gone(nmp)) {
2d21ac55 704 FSDBG_BOT(541, np, blkno, 0, ENXIO);
0a7de745 705 return ENXIO;
0c530ab8 706 }
55e303ae 707
2d21ac55 708 if (!UBCINFOEXISTS(vp)) {
91447636 709 operation = NBLK_META;
b0d623f7 710 } else if (bufsize < (uint32_t)nmp->nm_biosize) {
55e303ae 711 /* reg files should always have biosize blocks */
2d21ac55 712 bufsize = nmp->nm_biosize;
91447636 713 }
55e303ae 714
91447636 715 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
2d21ac55
A
716 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES)) {
717 FSDBG_TOP(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
55e303ae
A
718
719 /* poke the delwri list */
91447636 720 nfs_buf_delwri_push(0);
55e303ae
A
721
722 /* sleep to let other threads run... */
723 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
2d21ac55 724 FSDBG_BOT(542, np, blkno, nfs_nbdwrite, NFS_A_LOT_OF_DELAYED_WRITES);
55e303ae
A
725 }
726
727loop:
91447636 728 lck_mtx_lock(nfs_buf_mutex);
55e303ae 729
6d2010ae
A
730 /* wait for any buffer invalidation/flushing to complete */
731 while (np->n_bflag & NBINVALINPROG) {
732 np->n_bflag |= NBINVALWANT;
733 ts.tv_sec = 2;
734 ts.tv_nsec = 0;
735 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts);
736 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
737 lck_mtx_unlock(nfs_buf_mutex);
738 FSDBG_BOT(541, np, blkno, 0, error);
0a7de745 739 return error;
6d2010ae 740 }
0a7de745 741 if (np->n_bflag & NBINVALINPROG) {
6d2010ae 742 slpflag = 0;
0a7de745 743 }
6d2010ae
A
744 }
745
55e303ae 746 /* check for existence of nfsbuf in cache */
2d21ac55 747 if ((bp = nfs_buf_incore(np, blkno))) {
55e303ae 748 /* if busy, set wanted and wait */
91447636
A
749 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
750 if (flags & NBLK_NOWAIT) {
751 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 752 FSDBG_BOT(541, np, blkno, bp, 0xbcbcbcbc);
0a7de745 753 return 0;
91447636 754 }
2d21ac55 755 FSDBG_TOP(543, np, blkno, bp, bp->nb_flags);
91447636
A
756 SET(bp->nb_lflags, NBL_WANTED);
757
758 ts.tv_sec = 2;
759 ts.tv_nsec = 0;
0a7de745
A
760 msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1) | PDROP,
761 "nfsbufget", (slpflag == PCATCH) ? NULL : &ts);
55e303ae 762 slpflag = 0;
2d21ac55 763 FSDBG_BOT(543, np, blkno, bp, bp->nb_flags);
36401178 764 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
2d21ac55 765 FSDBG_BOT(541, np, blkno, 0, error);
0a7de745 766 return error;
55e303ae
A
767 }
768 goto loop;
769 }
0a7de745 770 if (bp->nb_bufsize != bufsize) {
55e303ae 771 panic("nfsbuf size mismatch");
0a7de745 772 }
91447636
A
773 SET(bp->nb_lflags, NBL_BUSY);
774 SET(bp->nb_flags, NB_CACHE);
55e303ae
A
775 nfs_buf_remfree(bp);
776 /* additional paranoia: */
0a7de745 777 if (ISSET(bp->nb_flags, NB_PAGELIST)) {
55e303ae 778 panic("pagelist buffer was not busy");
0a7de745 779 }
55e303ae
A
780 goto buffer_setup;
781 }
782
91447636
A
783 if (flags & NBLK_ONLYVALID) {
784 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 785 FSDBG_BOT(541, np, blkno, 0, 0x0000cace);
0a7de745 786 return 0;
91447636
A
787 }
788
55e303ae
A
789 /*
790 * where to get a free buffer:
91447636 791 * - if meta and maxmeta reached, must reuse meta
55e303ae 792 * - alloc new if we haven't reached min bufs
483a1d10
A
793 * - if free lists are NOT empty
794 * - if free list is stale, use it
795 * - else if freemeta list is stale, use it
796 * - else if max bufs allocated, use least-time-to-stale
55e303ae
A
797 * - alloc new if we haven't reached max allowed
798 * - start clearing out delwri list and try again
799 */
800
91447636
A
801 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
802 /* if we've hit max meta buffers, must reuse a meta buffer */
803 bp = TAILQ_FIRST(&nfsbuffreemeta);
804 } else if ((nfsbufcnt > nfsbufmin) &&
483a1d10
A
805 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
806 /* try to pull an nfsbuf off a free list */
807 struct nfsbuf *lrubp, *metabp;
808 struct timeval now;
809 microuptime(&now);
810
91447636 811 /* if the next LRU or META buffer is invalid or stale, use it */
483a1d10 812 lrubp = TAILQ_FIRST(&nfsbuffree);
91447636 813 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
0a7de745 814 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))) {
483a1d10 815 bp = lrubp;
0a7de745 816 }
483a1d10 817 metabp = TAILQ_FIRST(&nfsbuffreemeta);
91447636 818 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
0a7de745 819 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))) {
483a1d10 820 bp = metabp;
0a7de745 821 }
483a1d10
A
822
823 if (!bp && (nfsbufcnt >= nfsbufmax)) {
824 /* we've already allocated all bufs, so */
825 /* choose the buffer that'll go stale first */
0a7de745 826 if (!metabp) {
483a1d10 827 bp = lrubp;
0a7de745 828 } else if (!lrubp) {
483a1d10 829 bp = metabp;
0a7de745 830 } else {
483a1d10
A
831 int32_t lru_stale_time, meta_stale_time;
832 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
833 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
0a7de745 834 if (lru_stale_time <= meta_stale_time) {
483a1d10 835 bp = lrubp;
0a7de745 836 } else {
483a1d10 837 bp = metabp;
0a7de745 838 }
55e303ae 839 }
55e303ae 840 }
91447636 841 }
483a1d10 842
91447636
A
843 if (bp) {
844 /* we have a buffer to reuse */
2d21ac55 845 FSDBG(544, np, blkno, bp, bp->nb_flags);
91447636 846 nfs_buf_remfree(bp);
0a7de745 847 if (ISSET(bp->nb_flags, NB_DELWRI)) {
91447636 848 panic("nfs_buf_get: delwri");
0a7de745 849 }
91447636 850 SET(bp->nb_lflags, NBL_BUSY);
2d21ac55
A
851 /* disassociate buffer from previous nfsnode */
852 if (bp->nb_np) {
91447636
A
853 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
854 LIST_REMOVE(bp, nb_vnbufs);
855 bp->nb_vnbufs.le_next = NFSNOLIST;
483a1d10 856 }
2d21ac55 857 bp->nb_np = NULL;
91447636
A
858 }
859 LIST_REMOVE(bp, nb_hash);
860 /* nuke any creds we're holding */
0a7de745 861 if (IS_VALID_CRED(bp->nb_rcred)) {
0c530ab8 862 kauth_cred_unref(&bp->nb_rcred);
0a7de745
A
863 }
864 if (IS_VALID_CRED(bp->nb_wcred)) {
0c530ab8 865 kauth_cred_unref(&bp->nb_wcred);
0a7de745 866 }
91447636
A
867 /* if buf will no longer be NB_META, dump old buffer */
868 if (operation == NBLK_META) {
0a7de745 869 if (!ISSET(bp->nb_flags, NB_META)) {
91447636 870 nfsbufmetacnt++;
0a7de745 871 }
91447636
A
872 } else if (ISSET(bp->nb_flags, NB_META)) {
873 if (bp->nb_data) {
874 kfree(bp->nb_data, bp->nb_bufsize);
483a1d10
A
875 bp->nb_data = NULL;
876 }
91447636 877 nfsbufmetacnt--;
55e303ae 878 }
91447636
A
879 /* re-init buf fields */
880 bp->nb_error = 0;
881 bp->nb_validoff = bp->nb_validend = -1;
882 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
883 bp->nb_valid = 0;
884 bp->nb_dirty = 0;
8f6c56a5 885 bp->nb_verf = 0;
91447636
A
886 } else {
887 /* no buffer to reuse */
888 if ((nfsbufcnt < nfsbufmax) &&
889 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
483a1d10
A
890 /* just alloc a new one */
891 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
91447636
A
892 if (!bp) {
893 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 894 FSDBG_BOT(541, np, blkno, 0, error);
0a7de745 895 return ENOMEM;
91447636 896 }
483a1d10 897 nfsbufcnt++;
2d21ac55
A
898
899 /*
900 * If any excess bufs, make sure the timer
901 * is running to free them up later.
902 */
903 if (nfsbufcnt > nfsbufmin && !nfs_buf_timer_on) {
904 nfs_buf_timer_on = 1;
905 nfs_interval_timer_start(nfs_buf_timer_call,
0a7de745 906 NFSBUF_FREE_PERIOD * 1000);
2d21ac55
A
907 }
908
0a7de745 909 if (operation == NBLK_META) {
91447636 910 nfsbufmetacnt++;
0a7de745 911 }
2d21ac55 912 NFSBUFCNTCHK();
483a1d10
A
913 /* init nfsbuf */
914 bzero(bp, sizeof(*bp));
cb323159
A
915 os_ref_init(&bp->nb_refs, NULL);
916
483a1d10
A
917 bp->nb_free.tqe_next = NFSNOLIST;
918 bp->nb_validoff = bp->nb_validend = -1;
2d21ac55 919 FSDBG(545, np, blkno, bp, 0);
483a1d10
A
920 } else {
921 /* too many bufs... wait for buffers to free up */
2d21ac55 922 FSDBG_TOP(546, np, blkno, nfsbufcnt, nfsbufmax);
55e303ae 923
483a1d10 924 /* poke the delwri list */
91447636 925 nfs_buf_delwri_push(1);
483a1d10
A
926
927 nfsneedbuffer = 1;
0a7de745 928 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH | PDROP, "nfsbufget", NULL);
2d21ac55 929 FSDBG_BOT(546, np, blkno, nfsbufcnt, nfsbufmax);
36401178 930 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
2d21ac55 931 FSDBG_BOT(541, np, blkno, 0, error);
0a7de745 932 return error;
483a1d10
A
933 }
934 goto loop;
55e303ae 935 }
55e303ae
A
936 }
937
b0d623f7
A
938 /* set up nfsbuf */
939 SET(bp->nb_lflags, NBL_BUSY);
91447636 940 bp->nb_flags = 0;
55e303ae
A
941 bp->nb_lblkno = blkno;
942 /* insert buf in hash */
91447636 943 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
2d21ac55
A
944 /* associate buffer with new nfsnode */
945 bp->nb_np = np;
55e303ae
A
946 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
947
948buffer_setup:
949
91447636
A
950 /* unlock hash */
951 lck_mtx_unlock(nfs_buf_mutex);
952
55e303ae 953 switch (operation) {
91447636 954 case NBLK_META:
55e303ae
A
955 SET(bp->nb_flags, NB_META);
956 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
91447636 957 kfree(bp->nb_data, bp->nb_bufsize);
55e303ae
A
958 bp->nb_data = NULL;
959 bp->nb_validoff = bp->nb_validend = -1;
960 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
961 bp->nb_valid = 0;
962 bp->nb_dirty = 0;
963 CLR(bp->nb_flags, NB_CACHE);
964 }
0a7de745 965 if (!bp->nb_data) {
91447636 966 bp->nb_data = kalloc(bufsize);
0a7de745 967 }
91447636
A
968 if (!bp->nb_data) {
969 /* Ack! couldn't allocate the data buffer! */
2d21ac55 970 /* clean up buffer and return error */
91447636
A
971 lck_mtx_lock(nfs_buf_mutex);
972 LIST_REMOVE(bp, nb_vnbufs);
973 bp->nb_vnbufs.le_next = NFSNOLIST;
2d21ac55 974 bp->nb_np = NULL;
91447636
A
975 /* invalidate usage timestamp to allow immediate freeing */
976 NBUFSTAMPINVALIDATE(bp);
0a7de745 977 if (bp->nb_free.tqe_next != NFSNOLIST) {
91447636 978 panic("nfsbuf on freelist");
0a7de745 979 }
91447636
A
980 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
981 nfsbuffreecnt++;
982 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 983 FSDBG_BOT(541, np, blkno, 0xb00, ENOMEM);
0a7de745 984 return ENOMEM;
91447636 985 }
55e303ae
A
986 bp->nb_bufsize = bufsize;
987 break;
988
91447636
A
989 case NBLK_READ:
990 case NBLK_WRITE:
991 /*
992 * Set or clear NB_READ now to let the UPL subsystem know
993 * if we intend to modify the pages or not.
994 */
995 if (operation == NBLK_READ) {
996 SET(bp->nb_flags, NB_READ);
997 } else {
998 CLR(bp->nb_flags, NB_READ);
999 }
0a7de745 1000 if (bufsize < PAGE_SIZE) {
55e303ae 1001 bufsize = PAGE_SIZE;
0a7de745 1002 }
55e303ae
A
1003 bp->nb_bufsize = bufsize;
1004 bp->nb_validoff = bp->nb_validend = -1;
1005
91447636 1006 if (UBCINFOEXISTS(vp)) {
2d21ac55 1007 /* set up upl */
55e303ae
A
1008 if (nfs_buf_upl_setup(bp)) {
1009 /* unable to create upl */
1010 /* vm object must no longer exist */
2d21ac55 1011 /* clean up buffer and return error */
91447636 1012 lck_mtx_lock(nfs_buf_mutex);
55e303ae
A
1013 LIST_REMOVE(bp, nb_vnbufs);
1014 bp->nb_vnbufs.le_next = NFSNOLIST;
2d21ac55 1015 bp->nb_np = NULL;
91447636
A
1016 /* invalidate usage timestamp to allow immediate freeing */
1017 NBUFSTAMPINVALIDATE(bp);
0a7de745 1018 if (bp->nb_free.tqe_next != NFSNOLIST) {
55e303ae 1019 panic("nfsbuf on freelist");
0a7de745 1020 }
55e303ae
A
1021 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1022 nfsbuffreecnt++;
91447636 1023 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 1024 FSDBG_BOT(541, np, blkno, 0x2bc, EIO);
0a7de745 1025 return EIO;
55e303ae
A
1026 }
1027 nfs_buf_upl_check(bp);
1028 }
1029 break;
1030
1031 default:
1032 panic("nfs_buf_get: %d unknown operation", operation);
1033 }
1034
91447636 1035 *bpp = bp;
55e303ae 1036
2d21ac55 1037 FSDBG_BOT(541, np, blkno, bp, bp->nb_flags);
55e303ae 1038
0a7de745 1039 return 0;
55e303ae
A
1040}
1041
1042void
483a1d10 1043nfs_buf_release(struct nfsbuf *bp, int freeup)
55e303ae 1044{
2d21ac55
A
1045 nfsnode_t np = bp->nb_np;
1046 vnode_t vp;
483a1d10 1047 struct timeval now;
91447636 1048 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
55e303ae
A
1049
1050 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1051 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
1052 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
1053
2d21ac55
A
1054 vp = np ? NFSTOV(np) : NULL;
1055 if (vp && UBCINFOEXISTS(vp) && bp->nb_bufsize) {
b0d623f7 1056 int upl_flags, rv;
55e303ae 1057 upl_t upl;
b0d623f7 1058 uint32_t i;
55e303ae
A
1059
1060 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
1061 rv = nfs_buf_upl_setup(bp);
0a7de745 1062 if (rv) {
55e303ae 1063 printf("nfs_buf_release: upl create failed %d\n", rv);
0a7de745 1064 } else {
55e303ae 1065 nfs_buf_upl_check(bp);
0a7de745 1066 }
55e303ae
A
1067 }
1068 upl = bp->nb_pagelist;
0a7de745 1069 if (!upl) {
55e303ae 1070 goto pagelist_cleanup_done;
0a7de745 1071 }
55e303ae 1072 if (bp->nb_data) {
0a7de745 1073 if (ubc_upl_unmap(upl) != KERN_SUCCESS) {
55e303ae 1074 panic("ubc_upl_unmap failed");
0a7de745 1075 }
55e303ae
A
1076 bp->nb_data = NULL;
1077 }
2d21ac55
A
1078 /*
1079 * Abort the pages on error or: if this is an invalid or
1080 * non-needcommit nocache buffer AND no pages are dirty.
1081 */
1082 if (ISSET(bp->nb_flags, NB_ERROR) || (!bp->nb_dirty && (ISSET(bp->nb_flags, NB_INVAL) ||
1083 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))))) {
0a7de745 1084 if (ISSET(bp->nb_flags, (NB_READ | NB_INVAL | NB_NOCACHE))) {
55e303ae 1085 upl_flags = UPL_ABORT_DUMP_PAGES;
0a7de745 1086 } else {
55e303ae 1087 upl_flags = 0;
0a7de745 1088 }
55e303ae
A
1089 ubc_upl_abort(upl, upl_flags);
1090 goto pagelist_cleanup_done;
1091 }
0a7de745
A
1092 for (i = 0; i <= (bp->nb_bufsize - 1) / PAGE_SIZE; i++) {
1093 if (!NBPGVALID(bp, i)) {
55e303ae 1094 ubc_upl_abort_range(upl,
0a7de745
A
1095 i * PAGE_SIZE, PAGE_SIZE,
1096 UPL_ABORT_DUMP_PAGES |
1097 UPL_ABORT_FREE_ON_EMPTY);
1098 } else {
1099 if (NBPGDIRTY(bp, i)) {
55e303ae 1100 upl_flags = UPL_COMMIT_SET_DIRTY;
0a7de745 1101 } else {
55e303ae 1102 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
0a7de745
A
1103 }
1104
1105 if (!ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))) {
b0d623f7 1106 upl_flags |= UPL_COMMIT_CLEAR_PRECIOUS;
0a7de745 1107 }
b0d623f7 1108
55e303ae 1109 ubc_upl_commit_range(upl,
0a7de745
A
1110 i * PAGE_SIZE, PAGE_SIZE,
1111 upl_flags |
1112 UPL_COMMIT_INACTIVATE |
1113 UPL_COMMIT_FREE_ON_EMPTY);
55e303ae
A
1114 }
1115 }
1116pagelist_cleanup_done:
b0d623f7 1117 /* invalidate any pages past EOF */
2d21ac55 1118 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(np->n_size)) {
91447636 1119 off_t start, end;
2d21ac55 1120 start = trunc_page_64(np->n_size) + PAGE_SIZE_64;
0c530ab8 1121 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
0a7de745 1122 if (start < NBOFF(bp)) {
b0d623f7 1123 start = NBOFF(bp);
0a7de745 1124 }
91447636 1125 if (end > start) {
0a7de745 1126 if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) {
6d2010ae 1127 printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv);
0a7de745 1128 }
91447636 1129 }
55e303ae
A
1130 }
1131 CLR(bp->nb_flags, NB_PAGELIST);
1132 bp->nb_pagelist = NULL;
1133 }
1134
91447636
A
1135 lck_mtx_lock(nfs_buf_mutex);
1136
1137 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1138
55e303ae
A
1139 /* Wake up any processes waiting for any buffer to become free. */
1140 if (nfsneedbuffer) {
1141 nfsneedbuffer = 0;
91447636 1142 wakeup_needbuffer = 1;
55e303ae
A
1143 }
1144 /* Wake up any processes waiting for _this_ buffer to become free. */
91447636
A
1145 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1146 CLR(bp->nb_lflags, NBL_WANTED);
1147 wakeup_buffer = 1;
55e303ae
A
1148 }
1149
0c530ab8
A
1150 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1151 if (ISSET(bp->nb_flags, NB_ERROR) ||
0a7de745 1152 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI)))) {
55e303ae 1153 SET(bp->nb_flags, NB_INVAL);
0a7de745 1154 }
55e303ae
A
1155
1156 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
2d21ac55 1157 /* If it's invalid or empty, dissociate it from its nfsnode */
55e303ae
A
1158 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1159 LIST_REMOVE(bp, nb_vnbufs);
1160 bp->nb_vnbufs.le_next = NFSNOLIST;
1161 }
2d21ac55 1162 bp->nb_np = NULL;
55e303ae
A
1163 /* if this was a delayed write, wakeup anyone */
1164 /* waiting for delayed writes to complete */
1165 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1166 CLR(bp->nb_flags, NB_DELWRI);
2d21ac55
A
1167 nfs_nbdwrite--;
1168 NFSBUFCNTCHK();
91447636 1169 wakeup_nbdwrite = 1;
55e303ae 1170 }
91447636
A
1171 /* invalidate usage timestamp to allow immediate freeing */
1172 NBUFSTAMPINVALIDATE(bp);
55e303ae 1173 /* put buffer at head of free list */
0a7de745 1174 if (bp->nb_free.tqe_next != NFSNOLIST) {
55e303ae 1175 panic("nfsbuf on freelist");
0a7de745 1176 }
483a1d10 1177 SET(bp->nb_flags, NB_INVAL);
91447636
A
1178 if (ISSET(bp->nb_flags, NB_META)) {
1179 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1180 nfsbuffreemetacnt++;
1181 } else {
1182 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1183 nfsbuffreecnt++;
1184 }
55e303ae
A
1185 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1186 /* put buffer at end of delwri list */
0a7de745 1187 if (bp->nb_free.tqe_next != NFSNOLIST) {
55e303ae 1188 panic("nfsbuf on freelist");
0a7de745 1189 }
55e303ae
A
1190 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1191 nfsbufdelwricnt++;
91447636 1192 freeup = 0;
55e303ae 1193 } else {
483a1d10
A
1194 /* update usage timestamp */
1195 microuptime(&now);
1196 bp->nb_timestamp = now.tv_sec;
55e303ae 1197 /* put buffer at end of free list */
0a7de745 1198 if (bp->nb_free.tqe_next != NFSNOLIST) {
55e303ae 1199 panic("nfsbuf on freelist");
0a7de745 1200 }
483a1d10
A
1201 if (ISSET(bp->nb_flags, NB_META)) {
1202 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1203 nfsbuffreemetacnt++;
1204 } else {
1205 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1206 nfsbuffreecnt++;
1207 }
55e303ae
A
1208 }
1209
2d21ac55 1210 NFSBUFCNTCHK();
55e303ae
A
1211
1212 /* Unlock the buffer. */
2d21ac55 1213 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE));
91447636 1214 CLR(bp->nb_lflags, NBL_BUSY);
55e303ae
A
1215
1216 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
91447636
A
1217
1218 lck_mtx_unlock(nfs_buf_mutex);
1219
0a7de745 1220 if (wakeup_needbuffer) {
91447636 1221 wakeup(&nfsneedbuffer);
0a7de745
A
1222 }
1223 if (wakeup_buffer) {
91447636 1224 wakeup(bp);
0a7de745
A
1225 }
1226 if (wakeup_nbdwrite) {
91447636 1227 wakeup(&nfs_nbdwrite);
0a7de745
A
1228 }
1229 if (freeup) {
91447636 1230 NFS_BUF_FREEUP();
0a7de745 1231 }
55e303ae
A
1232}
1233
1234/*
1235 * Wait for operations on the buffer to complete.
1236 * When they do, extract and return the I/O's error value.
1237 */
1238int
1239nfs_buf_iowait(struct nfsbuf *bp)
1240{
1241 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1242
91447636
A
1243 lck_mtx_lock(nfs_buf_mutex);
1244
0a7de745 1245 while (!ISSET(bp->nb_flags, NB_DONE)) {
2d21ac55 1246 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", NULL);
0a7de745 1247 }
91447636
A
1248
1249 lck_mtx_unlock(nfs_buf_mutex);
55e303ae
A
1250
1251 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1252
1253 /* check for interruption of I/O, then errors. */
1254 if (ISSET(bp->nb_flags, NB_EINTR)) {
1255 CLR(bp->nb_flags, NB_EINTR);
0a7de745
A
1256 return EINTR;
1257 } else if (ISSET(bp->nb_flags, NB_ERROR)) {
1258 return bp->nb_error ? bp->nb_error : EIO;
1259 }
1260 return 0;
55e303ae
A
1261}
1262
1263/*
1264 * Mark I/O complete on a buffer.
1265 */
1266void
1267nfs_buf_iodone(struct nfsbuf *bp)
1268{
55e303ae
A
1269 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1270
0a7de745 1271 if (ISSET(bp->nb_flags, NB_DONE)) {
55e303ae 1272 panic("nfs_buf_iodone already");
0a7de745 1273 }
55e303ae
A
1274
1275 if (!ISSET(bp->nb_flags, NB_READ)) {
1276 CLR(bp->nb_flags, NB_WRITEINPROG);
91447636
A
1277 /*
1278 * vnode_writedone() takes care of waking up
1279 * any throttled write operations
1280 */
2d21ac55 1281 vnode_writedone(NFSTOV(bp->nb_np));
b0d623f7
A
1282 nfs_node_lock_force(bp->nb_np);
1283 bp->nb_np->n_numoutput--;
1284 nfs_node_unlock(bp->nb_np);
55e303ae 1285 }
0a7de745
A
1286 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1287 SET(bp->nb_flags, NB_DONE); /* note that it's done */
483a1d10 1288 nfs_buf_release(bp, 1);
0a7de745
A
1289 } else { /* or just wakeup the buffer */
1290 lck_mtx_lock(nfs_buf_mutex);
1291 SET(bp->nb_flags, NB_DONE); /* note that it's done */
91447636 1292 CLR(bp->nb_lflags, NBL_WANTED);
0a7de745 1293 lck_mtx_unlock(nfs_buf_mutex);
55e303ae
A
1294 wakeup(bp);
1295 }
1296
1297 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1298}
1299
1300void
2d21ac55 1301nfs_buf_write_delayed(struct nfsbuf *bp)
55e303ae 1302{
2d21ac55 1303 nfsnode_t np = bp->nb_np;
55e303ae
A
1304
1305 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1306 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1307
1308 /*
1309 * If the block hasn't been seen before:
1310 * (1) Mark it as having been seen,
2d21ac55 1311 * (2) Make sure it's on its node's correct block list,
55e303ae
A
1312 */
1313 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1314 SET(bp->nb_flags, NB_DELWRI);
55e303ae 1315 /* move to dirty list */
91447636 1316 lck_mtx_lock(nfs_buf_mutex);
2d21ac55
A
1317 nfs_nbdwrite++;
1318 NFSBUFCNTCHK();
0a7de745 1319 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
55e303ae 1320 LIST_REMOVE(bp, nb_vnbufs);
0a7de745 1321 }
2d21ac55 1322 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
91447636 1323 lck_mtx_unlock(nfs_buf_mutex);
55e303ae
A
1324 }
1325
1326 /*
1327 * If the vnode has "too many" write operations in progress
1328 * wait for them to finish the IO
1329 */
2d21ac55
A
1330 vnode_waitforwrites(NFSTOV(np), VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1331
1332 /* the file is in a modified state, so make sure the flag's set */
b0d623f7 1333 nfs_node_lock_force(np);
2d21ac55 1334 np->n_flag |= NMODIFIED;
b0d623f7 1335 nfs_node_unlock(np);
55e303ae
A
1336
1337 /*
2d21ac55
A
1338 * If we have too many delayed write buffers,
1339 * just fall back to doing the async write.
55e303ae 1340 */
0a7de745 1341 if (nfs_nbdwrite < 0) {
55e303ae 1342 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
0a7de745 1343 }
2d21ac55 1344 if (nfs_nbdwrite > NFS_A_LOT_OF_DELAYED_WRITES) {
55e303ae
A
1345 /* issue async write */
1346 SET(bp->nb_flags, NB_ASYNC);
1347 nfs_buf_write(bp);
1348 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1349 return;
1350 }
2d21ac55 1351
55e303ae
A
1352 /* Otherwise, the "write" is done, so mark and release the buffer. */
1353 SET(bp->nb_flags, NB_DONE);
483a1d10 1354 nfs_buf_release(bp, 1);
55e303ae
A
1355 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1356 return;
1357}
1358
8f6c56a5
A
1359/*
1360 * Check that a "needcommit" buffer can still be committed.
1361 * If the write verifier has changed, we need to clear the
1362 * the needcommit flag.
1363 */
1364void
2d21ac55 1365nfs_buf_check_write_verifier(nfsnode_t np, struct nfsbuf *bp)
8f6c56a5
A
1366{
1367 struct nfsmount *nmp;
1368
0a7de745 1369 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
8f6c56a5 1370 return;
0a7de745 1371 }
8f6c56a5 1372
2d21ac55 1373 nmp = NFSTONMP(np);
0a7de745 1374 if (nfs_mount_gone(nmp)) {
2d21ac55 1375 return;
0a7de745
A
1376 }
1377 if (!ISSET(bp->nb_flags, NB_STALEWVERF) && (bp->nb_verf == nmp->nm_verf)) {
8f6c56a5 1378 return;
0a7de745 1379 }
8f6c56a5 1380
2d21ac55
A
1381 /* write verifier changed, clear commit/wverf flags */
1382 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_STALEWVERF));
1383 bp->nb_verf = 0;
b0d623f7 1384 nfs_node_lock_force(np);
8f6c56a5
A
1385 np->n_needcommitcnt--;
1386 CHECK_NEEDCOMMITCNT(np);
b0d623f7 1387 nfs_node_unlock(np);
8f6c56a5
A
1388}
1389
91447636
A
1390/*
1391 * add a reference to a buffer so it doesn't disappear while being used
1392 * (must be called with nfs_buf_mutex held)
1393 */
1394void
1395nfs_buf_refget(struct nfsbuf *bp)
1396{
cb323159 1397 os_ref_retain_locked(&bp->nb_refs);
91447636
A
1398}
1399/*
1400 * release a reference on a buffer
1401 * (must be called with nfs_buf_mutex held)
1402 */
1403void
1404nfs_buf_refrele(struct nfsbuf *bp)
1405{
cb323159 1406 (void) os_ref_release_locked(&bp->nb_refs);
91447636
A
1407}
1408
1409/*
1410 * mark a particular buffer as BUSY
1411 * (must be called with nfs_buf_mutex held)
1412 */
1413errno_t
1414nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1415{
1416 errno_t error;
1417 struct timespec ts;
1418
1419 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
0a7de745 1420 /*
b0d623f7 1421 * since the lck_mtx_lock may block, the buffer
91447636
A
1422 * may become BUSY, so we need to recheck for
1423 * a NOWAIT request
1424 */
0a7de745
A
1425 if (flags & NBAC_NOWAIT) {
1426 return EBUSY;
1427 }
1428 SET(bp->nb_lflags, NBL_WANTED);
91447636 1429
0a7de745 1430 ts.tv_sec = (slptimeo / 100);
2d21ac55
A
1431 /* the hz value is 100; which leads to 10ms */
1432 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
91447636 1433
2d21ac55 1434 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
0a7de745
A
1435 "nfs_buf_acquire", &ts);
1436 if (error) {
1437 return error;
1438 }
1439 return EAGAIN;
1440 }
1441 if (flags & NBAC_REMOVE) {
1442 nfs_buf_remfree(bp);
91447636 1443 }
91447636
A
1444 SET(bp->nb_lflags, NBL_BUSY);
1445
0a7de745 1446 return 0;
91447636
A
1447}
1448
1449/*
1450 * simply drop the BUSY status of a buffer
1451 * (must be called with nfs_buf_mutex held)
1452 */
1453void
1454nfs_buf_drop(struct nfsbuf *bp)
1455{
1456 int need_wakeup = 0;
1457
0a7de745 1458 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
91447636 1459 panic("nfs_buf_drop: buffer not busy!");
0a7de745 1460 }
91447636 1461 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
0a7de745 1462 /* delay the actual wakeup until after we clear NBL_BUSY */
91447636
A
1463 need_wakeup = 1;
1464 }
1465 /* Unlock the buffer. */
1466 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1467
0a7de745
A
1468 if (need_wakeup) {
1469 wakeup(bp);
1470 }
91447636
A
1471}
1472
1473/*
1474 * prepare for iterating over an nfsnode's buffer list
1475 * this lock protects the queue manipulation
1476 * (must be called with nfs_buf_mutex held)
1477 */
1478int
2d21ac55 1479nfs_buf_iterprepare(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
91447636
A
1480{
1481 struct nfsbuflists *listheadp;
1482
0a7de745 1483 if (flags & NBI_DIRTY) {
91447636 1484 listheadp = &np->n_dirtyblkhd;
0a7de745 1485 } else {
91447636 1486 listheadp = &np->n_cleanblkhd;
0a7de745 1487 }
91447636
A
1488
1489 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
0a7de745
A
1490 LIST_INIT(iterheadp);
1491 return EWOULDBLOCK;
91447636
A
1492 }
1493
0a7de745
A
1494 while (np->n_bufiterflags & NBI_ITER) {
1495 np->n_bufiterflags |= NBI_ITERWANT;
2d21ac55 1496 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", NULL);
91447636
A
1497 }
1498 if (LIST_EMPTY(listheadp)) {
0a7de745
A
1499 LIST_INIT(iterheadp);
1500 return EINVAL;
91447636
A
1501 }
1502 np->n_bufiterflags |= NBI_ITER;
1503
1504 iterheadp->lh_first = listheadp->lh_first;
0a7de745 1505 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
91447636
A
1506 LIST_INIT(listheadp);
1507
0a7de745 1508 return 0;
91447636
A
1509}
1510
1511/*
2d21ac55 1512 * clean up after iterating over an nfsnode's buffer list
91447636
A
1513 * this lock protects the queue manipulation
1514 * (must be called with nfs_buf_mutex held)
1515 */
1516void
2d21ac55 1517nfs_buf_itercomplete(nfsnode_t np, struct nfsbuflists *iterheadp, int flags)
91447636
A
1518{
1519 struct nfsbuflists * listheadp;
1520 struct nfsbuf *bp;
1521
0a7de745 1522 if (flags & NBI_DIRTY) {
91447636 1523 listheadp = &np->n_dirtyblkhd;
0a7de745 1524 } else {
91447636 1525 listheadp = &np->n_cleanblkhd;
0a7de745 1526 }
91447636
A
1527
1528 while (!LIST_EMPTY(iterheadp)) {
1529 bp = LIST_FIRST(iterheadp);
1530 LIST_REMOVE(bp, nb_vnbufs);
1531 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1532 }
1533
1534 np->n_bufiterflags &= ~NBI_ITER;
1535 if (np->n_bufiterflags & NBI_ITERWANT) {
1536 np->n_bufiterflags &= ~NBI_ITERWANT;
1537 wakeup(&np->n_bufiterflags);
1538 }
1539}
1540
1c79356b
A
1541
1542/*
2d21ac55 1543 * Read an NFS buffer for a file.
1c79356b
A
1544 */
1545int
2d21ac55 1546nfs_buf_read(struct nfsbuf *bp)
1c79356b 1547{
2d21ac55
A
1548 int error = 0;
1549 nfsnode_t np;
1550 thread_t thd;
1551 kauth_cred_t cred;
55e303ae 1552
2d21ac55
A
1553 np = bp->nb_np;
1554 cred = bp->nb_rcred;
0a7de745 1555 if (IS_VALID_CRED(cred)) {
2d21ac55 1556 kauth_cred_ref(cred);
0a7de745 1557 }
2d21ac55 1558 thd = ISSET(bp->nb_flags, NB_ASYNC) ? NULL : current_thread();
1c79356b 1559
2d21ac55 1560 /* sanity checks */
0a7de745 1561 if (!ISSET(bp->nb_flags, NB_READ)) {
2d21ac55 1562 panic("nfs_buf_read: !NB_READ");
0a7de745
A
1563 }
1564 if (ISSET(bp->nb_flags, NB_DONE)) {
2d21ac55 1565 CLR(bp->nb_flags, NB_DONE);
0a7de745 1566 }
91447636 1567
2d21ac55 1568 NFS_BUF_MAP(bp);
0c530ab8 1569
316670eb 1570 OSAddAtomic64(1, &nfsstats.read_bios);
2d21ac55
A
1571
1572 error = nfs_buf_read_rpc(bp, thd, cred);
1c79356b 1573 /*
2d21ac55
A
1574 * For async I/O, the callbacks will finish up the
1575 * read. Otherwise, the read has already been finished.
1c79356b 1576 */
2d21ac55 1577
0a7de745 1578 if (IS_VALID_CRED(cred)) {
2d21ac55 1579 kauth_cred_unref(&cred);
0a7de745
A
1580 }
1581 return error;
2d21ac55
A
1582}
1583
1584/*
1585 * finish the reading of a buffer
1586 */
1587void
1588nfs_buf_read_finish(struct nfsbuf *bp)
1589{
1590 nfsnode_t np = bp->nb_np;
1591 struct nfsmount *nmp;
1592
1593 if (!ISSET(bp->nb_flags, NB_ERROR)) {
1594 /* update valid range */
1595 bp->nb_validoff = 0;
1596 bp->nb_validend = bp->nb_endio;
0a7de745 1597 if (bp->nb_endio < (int)bp->nb_bufsize) {
2d21ac55
A
1598 /*
1599 * The read may be short because we have unflushed writes
1600 * that are extending the file size and the reads hit the
1601 * (old) EOF on the server. So, just make sure nb_validend
1602 * correctly tracks EOF.
1603 * Note that the missing data should have already been zeroed
1604 * in nfs_buf_read_rpc_finish().
1605 */
1606 off_t boff = NBOFF(bp);
0a7de745 1607 if ((off_t)np->n_size >= (boff + bp->nb_bufsize)) {
2d21ac55 1608 bp->nb_validend = bp->nb_bufsize;
0a7de745 1609 } else if ((off_t)np->n_size >= boff) {
2d21ac55 1610 bp->nb_validend = np->n_size - boff;
0a7de745 1611 } else {
2d21ac55 1612 bp->nb_validend = 0;
0a7de745 1613 }
91447636 1614 }
2d21ac55 1615 if ((nmp = NFSTONMP(np)) && (nmp->nm_vers == NFS_VER2) &&
0a7de745 1616 ((NBOFF(bp) + bp->nb_validend) > 0x100000000LL)) {
2d21ac55 1617 bp->nb_validend = 0x100000000LL - NBOFF(bp);
0a7de745 1618 }
cb323159 1619 bp->nb_valid = (uint32_t)(1LLU << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1;
2d21ac55
A
1620 if (bp->nb_validend & PAGE_MASK) {
1621 /* zero-fill remainder of last page */
6d2010ae 1622 bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK));
91447636 1623 }
2d21ac55
A
1624 }
1625 nfs_buf_iodone(bp);
1626}
1627
1628/*
1629 * initiate the NFS READ RPC(s) for a buffer
1630 */
1631int
1632nfs_buf_read_rpc(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
1633{
1634 struct nfsmount *nmp;
1635 nfsnode_t np = bp->nb_np;
1636 int error = 0, nfsvers, async;
b0d623f7
A
1637 int offset, nrpcs;
1638 uint32_t nmrsize, length, len;
2d21ac55
A
1639 off_t boff;
1640 struct nfsreq *req;
1641 struct nfsreq_cbinfo cb;
1642
1643 nmp = NFSTONMP(np);
fe8ab488 1644 if (nfs_mount_gone(nmp)) {
2d21ac55
A
1645 bp->nb_error = error = ENXIO;
1646 SET(bp->nb_flags, NB_ERROR);
1647 nfs_buf_iodone(bp);
0a7de745 1648 return error;
2d21ac55
A
1649 }
1650 nfsvers = nmp->nm_vers;
1651 nmrsize = nmp->nm_rsize;
1652
1653 boff = NBOFF(bp);
1654 offset = 0;
1655 length = bp->nb_bufsize;
1656
1657 if (nfsvers == NFS_VER2) {
1658 if (boff > 0xffffffffLL) {
1659 bp->nb_error = error = EFBIG;
1660 SET(bp->nb_flags, NB_ERROR);
1661 nfs_buf_iodone(bp);
0a7de745 1662 return error;
91447636 1663 }
0a7de745 1664 if ((boff + length - 1) > 0xffffffffLL) {
2d21ac55 1665 length = 0x100000000LL - boff;
0a7de745 1666 }
91447636
A
1667 }
1668
2d21ac55
A
1669 /* Note: Can only do async I/O if nfsiods are configured. */
1670 async = (bp->nb_flags & NB_ASYNC);
1671 cb.rcb_func = async ? nfs_buf_read_rpc_finish : NULL;
1672 cb.rcb_bp = bp;
1673
1674 bp->nb_offio = bp->nb_endio = 0;
1675 bp->nb_rpcs = nrpcs = (length + nmrsize - 1) / nmrsize;
1676 if (async && (nrpcs > 1)) {
1677 SET(bp->nb_flags, NB_MULTASYNCRPC);
1678 } else {
1679 CLR(bp->nb_flags, NB_MULTASYNCRPC);
1c79356b 1680 }
1c79356b 1681
2d21ac55
A
1682 while (length > 0) {
1683 if (ISSET(bp->nb_flags, NB_ERROR)) {
1684 error = bp->nb_error;
91447636 1685 break;
2d21ac55
A
1686 }
1687 len = (length > nmrsize) ? nmrsize : length;
1688 cb.rcb_args[0] = offset;
1689 cb.rcb_args[1] = len;
cb323159 1690#if CONFIG_NFS4
0a7de745 1691 if (nmp->nm_vers >= NFS_VER4) {
b0d623f7 1692 cb.rcb_args[2] = nmp->nm_stategenid;
0a7de745 1693 }
cb323159 1694#endif
2d21ac55
A
1695 req = NULL;
1696 error = nmp->nm_funcs->nf_read_rpc_async(np, boff + offset, len, thd, cred, &cb, &req);
0a7de745 1697 if (error) {
1c79356b 1698 break;
0a7de745 1699 }
2d21ac55
A
1700 offset += len;
1701 length -= len;
0a7de745 1702 if (async) {
2d21ac55 1703 continue;
0a7de745 1704 }
2d21ac55
A
1705 nfs_buf_read_rpc_finish(req);
1706 if (ISSET(bp->nb_flags, NB_ERROR)) {
1707 error = bp->nb_error;
1708 break;
1709 }
1710 }
55e303ae 1711
2d21ac55 1712 if (length > 0) {
55e303ae 1713 /*
2d21ac55
A
1714 * Something bad happened while trying to send the RPC(s).
1715 * Wait for any outstanding requests to complete.
55e303ae 1716 */
2d21ac55
A
1717 bp->nb_error = error;
1718 SET(bp->nb_flags, NB_ERROR);
1719 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
1720 nrpcs = (length + nmrsize - 1) / nmrsize;
1721 lck_mtx_lock(nfs_buf_mutex);
1722 bp->nb_rpcs -= nrpcs;
1723 if (bp->nb_rpcs == 0) {
1724 /* No RPCs left, so the buffer's done */
1725 lck_mtx_unlock(nfs_buf_mutex);
1726 nfs_buf_iodone(bp);
1727 } else {
1728 /* wait for the last RPC to mark it done */
0a7de745 1729 while (bp->nb_rpcs > 0) {
2d21ac55 1730 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
0a7de745
A
1731 "nfs_buf_read_rpc_cancel", NULL);
1732 }
2d21ac55 1733 lck_mtx_unlock(nfs_buf_mutex);
55e303ae 1734 }
2d21ac55
A
1735 } else {
1736 nfs_buf_iodone(bp);
55e303ae 1737 }
2d21ac55 1738 }
55e303ae 1739
0a7de745 1740 return error;
2d21ac55 1741}
1c79356b 1742
2d21ac55
A
1743/*
1744 * finish up an NFS READ RPC on a buffer
1745 */
1746void
1747nfs_buf_read_rpc_finish(struct nfsreq *req)
1748{
1749 struct nfsmount *nmp;
1750 size_t rlen;
1751 struct nfsreq_cbinfo cb;
1752 struct nfsbuf *bp;
1753 int error = 0, nfsvers, offset, length, eof = 0, multasyncrpc, finished;
1754 void *wakeme = NULL;
1755 struct nfsreq *rreq = NULL;
1756 nfsnode_t np;
1757 thread_t thd;
1758 kauth_cred_t cred;
b0d623f7 1759 uio_t auio;
0a7de745 1760 char uio_buf[UIO_SIZEOF(1)];
2d21ac55
A
1761
1762finish:
1763 np = req->r_np;
1764 thd = req->r_thread;
1765 cred = req->r_cred;
0a7de745 1766 if (IS_VALID_CRED(cred)) {
2d21ac55 1767 kauth_cred_ref(cred);
0a7de745 1768 }
2d21ac55
A
1769 cb = req->r_callback;
1770 bp = cb.rcb_bp;
0a7de745 1771 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
6d2010ae 1772 nfs_request_ref(req, 0);
0a7de745 1773 }
2d21ac55
A
1774
1775 nmp = NFSTONMP(np);
fe8ab488 1776 if (nfs_mount_gone(nmp)) {
2d21ac55
A
1777 SET(bp->nb_flags, NB_ERROR);
1778 bp->nb_error = error = ENXIO;
1779 }
1780 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
1781 /* just drop it */
1782 nfs_request_async_cancel(req);
1783 goto out;
1784 }
1785
1786 nfsvers = nmp->nm_vers;
1787 offset = cb.rcb_args[0];
1788 rlen = length = cb.rcb_args[1];
1789
b0d623f7 1790 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
0a7de745 1791 UIO_READ, &uio_buf, sizeof(uio_buf));
b0d623f7 1792 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2d21ac55
A
1793
1794 /* finish the RPC */
b0d623f7 1795 error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof);
2d21ac55
A
1796 if ((error == EINPROGRESS) && cb.rcb_func) {
1797 /* async request restarted */
0a7de745 1798 if (cb.rcb_func) {
6d2010ae 1799 nfs_request_rele(req);
0a7de745
A
1800 }
1801 if (IS_VALID_CRED(cred)) {
2d21ac55 1802 kauth_cred_unref(&cred);
0a7de745 1803 }
2d21ac55
A
1804 return;
1805 }
cb323159 1806#if CONFIG_NFS4
b0d623f7
A
1807 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
1808 lck_mtx_lock(&nmp->nm_lock);
6d2010ae
A
1809 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
1810 NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
0a7de745 1811 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
6d2010ae 1812 nfs_need_recover(nmp, error);
b0d623f7
A
1813 }
1814 lck_mtx_unlock(&nmp->nm_lock);
6d2010ae
A
1815 if (np->n_flag & NREVOKE) {
1816 error = EIO;
1817 } else {
1818 if (error == NFSERR_GRACE) {
1819 if (cb.rcb_func) {
1820 /*
1821 * For an async I/O request, handle a grace delay just like
1822 * jukebox errors. Set the resend time and queue it up.
1823 */
1824 struct timeval now;
1825 if (req->r_nmrep.nmc_mhead) {
1826 mbuf_freem(req->r_nmrep.nmc_mhead);
1827 req->r_nmrep.nmc_mhead = NULL;
1828 }
1829 req->r_error = 0;
1830 microuptime(&now);
1831 lck_mtx_lock(&req->r_mtx);
1832 req->r_resendtime = now.tv_sec + 2;
1833 req->r_xid = 0; // get a new XID
1834 req->r_flags |= R_RESTART;
1835 req->r_start = 0;
1836 nfs_asyncio_resend(req);
1837 lck_mtx_unlock(&req->r_mtx);
0a7de745 1838 if (IS_VALID_CRED(cred)) {
6d2010ae 1839 kauth_cred_unref(&cred);
0a7de745 1840 }
6d2010ae
A
1841 /* Note: nfsreq reference taken will be dropped later when finished */
1842 return;
1843 }
1844 /* otherwise, just pause a couple seconds and retry */
0a7de745 1845 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
6d2010ae
A
1846 }
1847 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
1848 rlen = 0;
1849 goto readagain;
1850 }
b0d623f7
A
1851 }
1852 }
cb323159 1853#endif
2d21ac55
A
1854 if (error) {
1855 SET(bp->nb_flags, NB_ERROR);
1856 bp->nb_error = error;
1857 goto out;
1858 }
1859
0a7de745 1860 if ((rlen > 0) && (bp->nb_endio < (offset + (int)rlen))) {
2d21ac55 1861 bp->nb_endio = offset + rlen;
0a7de745 1862 }
2d21ac55
A
1863
1864 if ((nfsvers == NFS_VER2) || eof || (rlen == 0)) {
1865 /* zero out the remaining data (up to EOF) */
1866 off_t rpcrem, eofrem, rem;
1867 rpcrem = (length - rlen);
1868 eofrem = np->n_size - (NBOFF(bp) + offset + rlen);
1869 rem = (rpcrem < eofrem) ? rpcrem : eofrem;
0a7de745 1870 if (rem > 0) {
2d21ac55 1871 bzero(bp->nb_data + offset + rlen, rem);
0a7de745 1872 }
2d21ac55
A
1873 } else if (((int)rlen < length) && !ISSET(bp->nb_flags, NB_ERROR)) {
1874 /*
1875 * short read
1876 *
1877 * We haven't hit EOF and we didn't get all the data
1878 * requested, so we need to issue another read for the rest.
1879 * (Don't bother if the buffer already hit an error.)
1880 */
cb323159 1881#if CONFIG_NFS4
b0d623f7 1882readagain:
cb323159 1883#endif
2d21ac55
A
1884 offset += rlen;
1885 length -= rlen;
1886 cb.rcb_args[0] = offset;
1887 cb.rcb_args[1] = length;
cb323159 1888#if CONFIG_NFS4
0a7de745 1889 if (nmp->nm_vers >= NFS_VER4) {
b0d623f7 1890 cb.rcb_args[2] = nmp->nm_stategenid;
0a7de745 1891 }
cb323159 1892#endif
b0d623f7 1893 error = nmp->nm_funcs->nf_read_rpc_async(np, NBOFF(bp) + offset, length, thd, cred, &cb, &rreq);
2d21ac55 1894 if (!error) {
0a7de745 1895 if (IS_VALID_CRED(cred)) {
2d21ac55 1896 kauth_cred_unref(&cred);
0a7de745 1897 }
2d21ac55
A
1898 if (!cb.rcb_func) {
1899 /* if !async we'll need to wait for this RPC to finish */
1900 req = rreq;
b0d623f7 1901 rreq = NULL;
2d21ac55
A
1902 goto finish;
1903 }
6d2010ae 1904 nfs_request_rele(req);
2d21ac55
A
1905 /*
1906 * We're done here.
1907 * Outstanding RPC count is unchanged.
1908 * Callback will be called when RPC is done.
1909 */
1910 return;
1911 }
1912 SET(bp->nb_flags, NB_ERROR);
1913 bp->nb_error = error;
1914 }
1915
1916out:
0a7de745 1917 if (cb.rcb_func) {
6d2010ae 1918 nfs_request_rele(req);
0a7de745
A
1919 }
1920 if (IS_VALID_CRED(cred)) {
2d21ac55 1921 kauth_cred_unref(&cred);
0a7de745 1922 }
2d21ac55
A
1923
1924 /*
1925 * Decrement outstanding RPC count on buffer
1926 * and call nfs_buf_read_finish on last RPC.
1927 *
1928 * (Note: when there are multiple async RPCs issued for a
1929 * buffer we need nfs_buffer_mutex to avoid problems when
1930 * aborting a partially-initiated set of RPCs)
1931 */
1932
1933 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
0a7de745 1934 if (multasyncrpc) {
2d21ac55 1935 lck_mtx_lock(nfs_buf_mutex);
0a7de745 1936 }
2d21ac55
A
1937
1938 bp->nb_rpcs--;
1939 finished = (bp->nb_rpcs == 0);
1940
0a7de745 1941 if (multasyncrpc) {
2d21ac55 1942 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 1943 }
2d21ac55
A
1944
1945 if (finished) {
0a7de745 1946 if (multasyncrpc) {
2d21ac55 1947 wakeme = &bp->nb_rpcs;
0a7de745 1948 }
2d21ac55 1949 nfs_buf_read_finish(bp);
0a7de745 1950 if (wakeme) {
2d21ac55 1951 wakeup(wakeme);
0a7de745 1952 }
2d21ac55
A
1953 }
1954}
1955
1956/*
1957 * Do buffer readahead.
1958 * Initiate async I/O to read buffers not in cache.
1959 */
b0d623f7 1960int
2d21ac55
A
1961nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn, thread_t thd, kauth_cred_t cred)
1962{
1963 struct nfsmount *nmp = NFSTONMP(np);
1964 struct nfsbuf *bp;
6d2010ae
A
1965 int error = 0;
1966 uint32_t nra;
2d21ac55 1967
0a7de745
A
1968 if (nfs_mount_gone(nmp)) {
1969 return ENXIO;
1970 }
1971 if (nmp->nm_readahead <= 0) {
1972 return 0;
1973 }
1974 if (*rabnp > lastrabn) {
1975 return 0;
1976 }
2d21ac55
A
1977
1978 for (nra = 0; (nra < nmp->nm_readahead) && (*rabnp <= lastrabn); nra++, *rabnp = *rabnp + 1) {
1979 /* check if block exists and is valid. */
b0d623f7
A
1980 if ((*rabnp * nmp->nm_biosize) >= (off_t)np->n_size) {
1981 /* stop reading ahead if we're beyond EOF */
1982 *rabnp = lastrabn;
1983 break;
1984 }
0a7de745
A
1985 error = nfs_buf_get(np, *rabnp, nmp->nm_biosize, thd, NBLK_READ | NBLK_NOWAIT, &bp);
1986 if (error) {
2d21ac55 1987 break;
0a7de745 1988 }
b0d623f7
A
1989 nfs_node_lock_force(np);
1990 np->n_lastrahead = *rabnp;
1991 nfs_node_unlock(np);
0a7de745 1992 if (!bp) {
2d21ac55 1993 continue;
0a7de745 1994 }
2d21ac55 1995 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE) &&
0a7de745 1996 !bp->nb_dirty && !ISSET(bp->nb_flags, (NB_DELWRI | NB_NCRDAHEAD))) {
2d21ac55
A
1997 CLR(bp->nb_flags, NB_CACHE);
1998 bp->nb_valid = 0;
1999 bp->nb_validoff = bp->nb_validend = -1;
2000 }
2001 if ((bp->nb_dirtyend <= 0) && !bp->nb_dirty &&
0a7de745
A
2002 !ISSET(bp->nb_flags, (NB_CACHE | NB_DELWRI))) {
2003 SET(bp->nb_flags, (NB_READ | NB_ASYNC));
2004 if (ioflag & IO_NOCACHE) {
2d21ac55 2005 SET(bp->nb_flags, NB_NCRDAHEAD);
0a7de745 2006 }
2d21ac55
A
2007 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2008 kauth_cred_ref(cred);
2009 bp->nb_rcred = cred;
2010 }
0a7de745 2011 if ((error = nfs_buf_read(bp))) {
2d21ac55 2012 break;
0a7de745 2013 }
2d21ac55
A
2014 continue;
2015 }
2016 nfs_buf_release(bp, 1);
2017 }
0a7de745 2018 return error;
2d21ac55
A
2019}
2020
2021/*
b0d623f7 2022 * NFS buffer I/O for reading files.
2d21ac55
A
2023 */
2024int
b0d623f7 2025nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx)
2d21ac55
A
2026{
2027 vnode_t vp = NFSTOV(np);
2028 struct nfsbuf *bp = NULL;
2d21ac55 2029 struct nfsmount *nmp = VTONMP(vp);
b0d623f7 2030 daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1;
2d21ac55
A
2031 off_t diff;
2032 int error = 0, n = 0, on = 0;
b0d623f7 2033 int nfsvers, biosize, modified, readaheads = 0;
2d21ac55
A
2034 thread_t thd;
2035 kauth_cred_t cred;
b0d623f7 2036 int64_t io_resid;
2d21ac55 2037
b0d623f7 2038 FSDBG_TOP(514, np, uio_offset(uio), uio_resid(uio), ioflag);
2d21ac55
A
2039
2040 nfsvers = nmp->nm_vers;
2041 biosize = nmp->nm_biosize;
2042 thd = vfs_context_thread(ctx);
2043 cred = vfs_context_ucred(ctx);
2044
b0d623f7
A
2045 if (vnode_vtype(vp) != VREG) {
2046 printf("nfs_bioread: type %x unexpected\n", vnode_vtype(vp));
2d21ac55 2047 FSDBG_BOT(514, np, 0xd1e0016, 0, EINVAL);
0a7de745 2048 return EINVAL;
2d21ac55
A
2049 }
2050
2051 /*
b0d623f7 2052 * For NFS, cache consistency can only be maintained approximately.
2d21ac55
A
2053 * Although RFC1094 does not specify the criteria, the following is
2054 * believed to be compatible with the reference port.
0a7de745 2055 *
b0d623f7
A
2056 * If the file has changed since the last read RPC or you have
2057 * written to the file, you may have lost data cache consistency
2058 * with the server. So, check for a change, and flush all of the
2059 * file's data out of the cache.
2d21ac55 2060 * NB: This implies that cache data can be read when up to
b0d623f7
A
2061 * NFS_MAXATTRTIMO seconds out of date. If you find that you
2062 * need current attributes, nfs_getattr() can be forced to fetch
2063 * new attributes (via NATTRINVALIDATE() or NGA_UNCACHED).
2d21ac55
A
2064 */
2065
0a7de745 2066 if (ISSET(np->n_flag, NUPDATESIZE)) {
2d21ac55 2067 nfs_data_update_size(np, 0);
0a7de745 2068 }
2d21ac55 2069
b0d623f7 2070 if ((error = nfs_node_lock(np))) {
2d21ac55 2071 FSDBG_BOT(514, np, 0xd1e0222, 0, error);
0a7de745 2072 return error;
2d21ac55
A
2073 }
2074
2075 if (np->n_flag & NNEEDINVALIDATE) {
2076 np->n_flag &= ~NNEEDINVALIDATE;
b0d623f7 2077 nfs_node_unlock(np);
0a7de745
A
2078 error = nfs_vinvalbuf(vp, V_SAVE | V_IGNORE_WRITEERR, ctx, 1);
2079 if (!error) {
b0d623f7 2080 error = nfs_node_lock(np);
0a7de745 2081 }
b0d623f7 2082 if (error) {
2d21ac55 2083 FSDBG_BOT(514, np, 0xd1e0322, 0, error);
0a7de745 2084 return error;
2d21ac55
A
2085 }
2086 }
2087
b0d623f7
A
2088 modified = (np->n_flag & NMODIFIED);
2089 nfs_node_unlock(np);
2090 /* nfs_getattr() will check changed and purge caches */
6d2010ae 2091 error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED);
b0d623f7
A
2092 if (error) {
2093 FSDBG_BOT(514, np, 0xd1e0004, 0, error);
0a7de745 2094 return error;
2d21ac55
A
2095 }
2096
b0d623f7
A
2097 if (uio_resid(uio) == 0) {
2098 FSDBG_BOT(514, np, 0xd1e0001, 0, 0);
0a7de745 2099 return 0;
b0d623f7
A
2100 }
2101 if (uio_offset(uio) < 0) {
2102 FSDBG_BOT(514, np, 0xd1e0002, 0, EINVAL);
0a7de745 2103 return EINVAL;
b0d623f7 2104 }
2d21ac55 2105
b0d623f7
A
2106 /*
2107 * set up readahead - which may be limited by:
2108 * + current request length (for IO_NOCACHE)
2109 * + readahead setting
2110 * + file size
2111 */
2112 if (nmp->nm_readahead > 0) {
2113 off_t end = uio_offset(uio) + uio_resid(uio);
0a7de745 2114 if (end > (off_t)np->n_size) {
b0d623f7 2115 end = np->n_size;
0a7de745 2116 }
b0d623f7
A
2117 rabn = uio_offset(uio) / biosize;
2118 maxrabn = (end - 1) / biosize;
2119 nfs_node_lock_force(np);
2120 if (!(ioflag & IO_NOCACHE) &&
0a7de745 2121 (!rabn || (rabn == np->n_lastread) || (rabn == (np->n_lastread + 1)))) {
b0d623f7 2122 maxrabn += nmp->nm_readahead;
0a7de745
A
2123 if ((maxrabn * biosize) >= (off_t)np->n_size) {
2124 maxrabn = ((off_t)np->n_size - 1) / biosize;
2125 }
b0d623f7 2126 }
0a7de745 2127 if (maxrabn < np->n_lastrahead) {
b0d623f7 2128 np->n_lastrahead = -1;
0a7de745
A
2129 }
2130 if (rabn < np->n_lastrahead) {
b0d623f7 2131 rabn = np->n_lastrahead + 1;
0a7de745 2132 }
b0d623f7
A
2133 nfs_node_unlock(np);
2134 } else {
2135 rabn = maxrabn = 0;
2d21ac55
A
2136 }
2137
2138 do {
b0d623f7
A
2139 nfs_data_lock(np, NFS_DATA_LOCK_SHARED);
2140 lbn = uio_offset(uio) / biosize;
2d21ac55
A
2141
2142 /*
2143 * Copy directly from any cached pages without grabbing the bufs.
b0d623f7
A
2144 * (If we are NOCACHE and we've issued readahead requests, we need
2145 * to grab the NB_NCRDAHEAD bufs to drop them.)
2d21ac55 2146 */
b0d623f7
A
2147 if ((!(ioflag & IO_NOCACHE) || !readaheads) &&
2148 ((uio->uio_segflg == UIO_USERSPACE32 ||
0a7de745
A
2149 uio->uio_segflg == UIO_USERSPACE64 ||
2150 uio->uio_segflg == UIO_USERSPACE))) {
b0d623f7
A
2151 io_resid = uio_resid(uio);
2152 diff = np->n_size - uio_offset(uio);
0a7de745 2153 if (diff < io_resid) {
2d21ac55 2154 io_resid = diff;
0a7de745 2155 }
2d21ac55 2156 if (io_resid > 0) {
b0d623f7
A
2157 int count = (io_resid > INT_MAX) ? INT_MAX : io_resid;
2158 error = cluster_copy_ubc_data(vp, uio, &count, 0);
91447636 2159 if (error) {
2d21ac55 2160 nfs_data_unlock(np);
b0d623f7 2161 FSDBG_BOT(514, np, uio_offset(uio), 0xcacefeed, error);
0a7de745 2162 return error;
91447636 2163 }
2d21ac55
A
2164 }
2165 /* count any biocache reads that we just copied directly */
0a7de745
A
2166 if (lbn != (uio_offset(uio) / biosize)) {
2167 OSAddAtomic64((uio_offset(uio) / biosize) - lbn, &nfsstats.biocache_reads);
b0d623f7 2168 FSDBG(514, np, 0xcacefeed, uio_offset(uio), error);
2d21ac55
A
2169 }
2170 }
2171
b0d623f7
A
2172 lbn = uio_offset(uio) / biosize;
2173 on = uio_offset(uio) % biosize;
2174 nfs_node_lock_force(np);
2175 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2176 nfs_node_unlock(np);
2d21ac55 2177
6d2010ae
A
2178 if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) {
2179 nfs_data_unlock(np);
2180 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa);
0a7de745 2181 return 0;
6d2010ae
A
2182 }
2183
2d21ac55 2184 /* adjust readahead block number, if necessary */
0a7de745 2185 if (rabn < lbn) {
2d21ac55 2186 rabn = lbn;
0a7de745 2187 }
2d21ac55
A
2188 lastrabn = MIN(maxrabn, lbn + nmp->nm_readahead);
2189 if (rabn <= lastrabn) { /* start readaheads */
2190 error = nfs_buf_readahead(np, ioflag, &rabn, lastrabn, thd, cred);
2191 if (error) {
2192 nfs_data_unlock(np);
2193 FSDBG_BOT(514, np, 0xd1e000b, 1, error);
0a7de745 2194 return error;
55e303ae 2195 }
b0d623f7 2196 readaheads = 1;
1c79356b
A
2197 }
2198
316670eb 2199 OSAddAtomic64(1, &nfsstats.biocache_reads);
55e303ae 2200
1c79356b
A
2201 /*
2202 * If the block is in the cache and has the required data
2203 * in a valid region, just copy it out.
2204 * Otherwise, get the block and write back/read in,
2205 * as required.
2206 */
2207again:
b0d623f7
A
2208 io_resid = uio_resid(uio);
2209 n = (io_resid > (biosize - on)) ? (biosize - on) : io_resid;
2210 diff = np->n_size - uio_offset(uio);
0a7de745 2211 if (diff < n) {
1c79356b 2212 n = diff;
0a7de745 2213 }
55e303ae 2214
2d21ac55 2215 error = nfs_buf_get(np, lbn, biosize, thd, NBLK_READ, &bp);
91447636 2216 if (error) {
2d21ac55
A
2217 nfs_data_unlock(np);
2218 FSDBG_BOT(514, np, 0xd1e000c, 0, error);
0a7de745 2219 return error;
2d21ac55
A
2220 }
2221
2222 if ((ioflag & IO_NOCACHE) && ISSET(bp->nb_flags, NB_CACHE)) {
2223 /*
2224 * IO_NOCACHE found a cached buffer.
2225 * Flush the buffer if it's dirty.
2226 * Invalidate the data if it wasn't just read
2227 * in as part of a "nocache readahead".
2228 */
2229 if (bp->nb_dirty || (bp->nb_dirtyend > 0)) {
2230 /* so write the buffer out and try again */
2231 SET(bp->nb_flags, NB_NOCACHE);
2232 goto flushbuffer;
2233 }
b0d623f7 2234 if (ISSET(bp->nb_flags, NB_NCRDAHEAD)) {
2d21ac55 2235 CLR(bp->nb_flags, NB_NCRDAHEAD);
b0d623f7 2236 SET(bp->nb_flags, NB_NOCACHE);
2d21ac55 2237 }
55e303ae
A
2238 }
2239
2240 /* if any pages are valid... */
2241 if (bp->nb_valid) {
2242 /* ...check for any invalid pages in the read range */
2243 int pg, firstpg, lastpg, dirtypg;
2244 dirtypg = firstpg = lastpg = -1;
0a7de745
A
2245 pg = on / PAGE_SIZE;
2246 while (pg <= (on + n - 1) / PAGE_SIZE) {
2247 if (!NBPGVALID(bp, pg)) {
2248 if (firstpg < 0) {
55e303ae 2249 firstpg = pg;
0a7de745 2250 }
55e303ae 2251 lastpg = pg;
0a7de745 2252 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp, pg)) {
55e303ae 2253 dirtypg = pg;
0a7de745 2254 }
55e303ae
A
2255 pg++;
2256 }
2257
2258 /* if there are no invalid pages, we're all set */
2259 if (firstpg < 0) {
2260 if (bp->nb_validoff < 0) {
2261 /* valid range isn't set up, so */
2262 /* set it to what we know is valid */
91447636 2263 bp->nb_validoff = trunc_page(on);
0a7de745 2264 bp->nb_validend = round_page(on + n);
55e303ae
A
2265 nfs_buf_normalize_valid_range(np, bp);
2266 }
2267 goto buffer_ready;
2268 }
2269
2270 /* there are invalid pages in the read range */
2d21ac55 2271 if (((dirtypg > firstpg) && (dirtypg < lastpg)) ||
0a7de745 2272 (((firstpg * PAGE_SIZE) < bp->nb_dirtyend) && (((lastpg + 1) * PAGE_SIZE) > bp->nb_dirtyoff))) {
2d21ac55 2273 /* there are also dirty page(s) (or range) in the read range, */
55e303ae 2274 /* so write the buffer out and try again */
2d21ac55 2275flushbuffer:
55e303ae
A
2276 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2277 SET(bp->nb_flags, NB_ASYNC);
2d21ac55 2278 if (!IS_VALID_CRED(bp->nb_wcred)) {
91447636
A
2279 kauth_cred_ref(cred);
2280 bp->nb_wcred = cred;
2281 }
55e303ae
A
2282 error = nfs_buf_write(bp);
2283 if (error) {
2d21ac55
A
2284 nfs_data_unlock(np);
2285 FSDBG_BOT(514, np, 0xd1e000d, 0, error);
0a7de745 2286 return error;
55e303ae 2287 }
1c79356b
A
2288 goto again;
2289 }
55e303ae 2290 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
0a7de745 2291 (lastpg - firstpg + 1) > (biosize / PAGE_SIZE) / 2) {
55e303ae
A
2292 /* we need to read in more than half the buffer and the */
2293 /* buffer's not dirty, so just fetch the whole buffer */
2294 bp->nb_valid = 0;
2295 } else {
2296 /* read the page range in */
91447636 2297 uio_t auio;
0a7de745
A
2298 char uio_buf[UIO_SIZEOF(1)];
2299
55e303ae 2300 NFS_BUF_MAP(bp);
2d21ac55 2301 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
0a7de745 2302 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
91447636
A
2303 if (!auio) {
2304 error = ENOMEM;
2305 } else {
b0d623f7 2306 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + (firstpg * PAGE_SIZE)),
0a7de745 2307 ((lastpg - firstpg + 1) * PAGE_SIZE));
2d21ac55 2308 error = nfs_read_rpc(np, auio, ctx);
91447636 2309 }
55e303ae 2310 if (error) {
0a7de745 2311 if (ioflag & IO_NOCACHE) {
91447636 2312 SET(bp->nb_flags, NB_NOCACHE);
0a7de745 2313 }
483a1d10 2314 nfs_buf_release(bp, 1);
2d21ac55
A
2315 nfs_data_unlock(np);
2316 FSDBG_BOT(514, np, 0xd1e000e, 0, error);
0a7de745 2317 return error;
55e303ae
A
2318 }
2319 /* Make sure that the valid range is set to cover this read. */
2320 bp->nb_validoff = trunc_page_32(on);
0a7de745 2321 bp->nb_validend = round_page_32(on + n);
55e303ae 2322 nfs_buf_normalize_valid_range(np, bp);
91447636 2323 if (uio_resid(auio) > 0) {
55e303ae
A
2324 /* if short read, must have hit EOF, */
2325 /* so zero the rest of the range */
91447636 2326 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
55e303ae
A
2327 }
2328 /* mark the pages (successfully read) as valid */
0a7de745
A
2329 for (pg = firstpg; pg <= lastpg; pg++) {
2330 NBPGVALID_SET(bp, pg);
2331 }
55e303ae 2332 }
1c79356b 2333 }
55e303ae
A
2334 /* if no pages are valid, read the whole block */
2335 if (!bp->nb_valid) {
2d21ac55
A
2336 if (!IS_VALID_CRED(bp->nb_rcred) && IS_VALID_CRED(cred)) {
2337 kauth_cred_ref(cred);
2338 bp->nb_rcred = cred;
2339 }
55e303ae
A
2340 SET(bp->nb_flags, NB_READ);
2341 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2d21ac55 2342 error = nfs_buf_read(bp);
0a7de745 2343 if (ioflag & IO_NOCACHE) {
b0d623f7 2344 SET(bp->nb_flags, NB_NOCACHE);
0a7de745 2345 }
55e303ae 2346 if (error) {
2d21ac55 2347 nfs_data_unlock(np);
483a1d10 2348 nfs_buf_release(bp, 1);
2d21ac55 2349 FSDBG_BOT(514, np, 0xd1e000f, 0, error);
0a7de745 2350 return error;
55e303ae
A
2351 }
2352 }
2353buffer_ready:
55e303ae
A
2354 /* validate read range against valid range and clip */
2355 if (bp->nb_validend > 0) {
2356 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
0a7de745 2357 if (diff < n) {
55e303ae 2358 n = diff;
0a7de745 2359 }
55e303ae 2360 }
55e303ae 2361 if (n > 0) {
b0d623f7
A
2362 NFS_BUF_MAP(bp);
2363 error = uiomove(bp->nb_data + on, n, uio);
55e303ae 2364 }
2d21ac55 2365
cb323159 2366
2d21ac55
A
2367 nfs_buf_release(bp, 1);
2368 nfs_data_unlock(np);
b0d623f7
A
2369 nfs_node_lock_force(np);
2370 np->n_lastread = (uio_offset(uio) - 1) / biosize;
2371 nfs_node_unlock(np);
2372 } while (error == 0 && uio_resid(uio) > 0 && n > 0);
2373 FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), error);
0a7de745 2374 return error;
1c79356b
A
2375}
2376
2d21ac55
A
2377/*
2378 * limit the number of outstanding async I/O writes
2379 */
b0d623f7 2380int
2d21ac55
A
2381nfs_async_write_start(struct nfsmount *nmp)
2382{
6d2010ae 2383 int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0;
cb323159 2384 struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
2d21ac55 2385
0a7de745
A
2386 if (nfs_max_async_writes <= 0) {
2387 return 0;
2388 }
2d21ac55 2389 lck_mtx_lock(&nmp->nm_lock);
36401178 2390 while ((nfs_max_async_writes > 0) && (nmp->nm_asyncwrites >= nfs_max_async_writes)) {
0a7de745 2391 if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) {
2d21ac55 2392 break;
0a7de745
A
2393 }
2394 msleep(&nmp->nm_asyncwrites, &nmp->nm_lock, slpflag | (PZERO - 1), "nfsasyncwrites", &ts);
36401178 2395 slpflag = 0;
2d21ac55 2396 }
0a7de745 2397 if (!error) {
2d21ac55 2398 nmp->nm_asyncwrites++;
0a7de745 2399 }
2d21ac55 2400 lck_mtx_unlock(&nmp->nm_lock);
0a7de745 2401 return error;
2d21ac55 2402}
b0d623f7 2403void
2d21ac55
A
2404nfs_async_write_done(struct nfsmount *nmp)
2405{
0a7de745 2406 if (nmp->nm_asyncwrites <= 0) {
2d21ac55 2407 return;
0a7de745 2408 }
2d21ac55 2409 lck_mtx_lock(&nmp->nm_lock);
0a7de745 2410 if (nmp->nm_asyncwrites-- >= nfs_max_async_writes) {
2d21ac55 2411 wakeup(&nmp->nm_asyncwrites);
0a7de745 2412 }
2d21ac55
A
2413 lck_mtx_unlock(&nmp->nm_lock);
2414}
fa4905b1 2415
1c79356b 2416/*
2d21ac55
A
2417 * write (or commit) the given NFS buffer
2418 *
2419 * Commit the buffer if we can.
2420 * Write out any dirty range.
2421 * If any dirty pages remain, write them out.
2422 * Mark buffer done.
2423 *
2424 * For async requests, all the work beyond sending the initial
2425 * write RPC is handled in the RPC callback(s).
1c79356b
A
2426 */
2427int
2d21ac55 2428nfs_buf_write(struct nfsbuf *bp)
1c79356b 2429{
2d21ac55
A
2430 int error = 0, oldflags, async;
2431 nfsnode_t np;
2432 thread_t thd;
91447636 2433 kauth_cred_t cred;
2d21ac55
A
2434 proc_t p = current_proc();
2435 int iomode, doff, dend, firstpg, lastpg;
2436 uint32_t pagemask;
91447636 2437
2d21ac55 2438 FSDBG_TOP(553, bp, NBOFF(bp), bp->nb_flags, 0);
91447636 2439
0a7de745 2440 if (!ISSET(bp->nb_lflags, NBL_BUSY)) {
2d21ac55 2441 panic("nfs_buf_write: buffer is not busy???");
0a7de745 2442 }
91447636 2443
2d21ac55
A
2444 np = bp->nb_np;
2445 async = ISSET(bp->nb_flags, NB_ASYNC);
2446 oldflags = bp->nb_flags;
91447636 2447
0a7de745 2448 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
2d21ac55
A
2449 if (ISSET(oldflags, NB_DELWRI)) {
2450 lck_mtx_lock(nfs_buf_mutex);
2451 nfs_nbdwrite--;
2452 NFSBUFCNTCHK();
2453 lck_mtx_unlock(nfs_buf_mutex);
2454 wakeup(&nfs_nbdwrite);
91447636 2455 }
2d21ac55
A
2456
2457 /* move to clean list */
0a7de745 2458 if (ISSET(oldflags, (NB_ASYNC | NB_DELWRI))) {
2d21ac55 2459 lck_mtx_lock(nfs_buf_mutex);
0a7de745 2460 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2d21ac55 2461 LIST_REMOVE(bp, nb_vnbufs);
0a7de745 2462 }
2d21ac55
A
2463 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2464 lck_mtx_unlock(nfs_buf_mutex);
1c79356b 2465 }
b0d623f7
A
2466 nfs_node_lock_force(np);
2467 np->n_numoutput++;
2468 nfs_node_unlock(np);
2d21ac55 2469 vnode_startwrite(NFSTOV(np));
0c530ab8 2470
0a7de745 2471 if (p && p->p_stats) {
b0d623f7 2472 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);
0a7de745 2473 }
0c530ab8 2474
2d21ac55 2475 cred = bp->nb_wcred;
0a7de745 2476 if (!IS_VALID_CRED(cred) && ISSET(bp->nb_flags, NB_READ)) {
2d21ac55 2477 cred = bp->nb_rcred; /* shouldn't really happen, but... */
0a7de745
A
2478 }
2479 if (IS_VALID_CRED(cred)) {
2d21ac55 2480 kauth_cred_ref(cred);
0a7de745 2481 }
2d21ac55
A
2482 thd = async ? NULL : current_thread();
2483
2484 /* We need to make sure the pages are locked before doing I/O. */
fe8ab488
A
2485 if (!ISSET(bp->nb_flags, NB_META)) {
2486 if (UBCINFOEXISTS(NFSTOV(np))) {
2487 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2488 error = nfs_buf_upl_setup(bp);
2489 if (error) {
2490 printf("nfs_buf_write: upl create failed %d\n", error);
2491 SET(bp->nb_flags, NB_ERROR);
2492 bp->nb_error = error = EIO;
2493 nfs_buf_iodone(bp);
2494 goto out;
2495 }
2496 nfs_buf_upl_check(bp);
55e303ae 2497 }
fe8ab488
A
2498 } else {
2499 /* We should never be in nfs_buf_write() with no UBCINFO. */
2500 printf("nfs_buf_write: ubcinfo already gone\n");
2501 SET(bp->nb_flags, NB_ERROR);
2502 bp->nb_error = error = EIO;
2503 nfs_buf_iodone(bp);
2504 goto out;
1c79356b
A
2505 }
2506 }
55e303ae 2507
2d21ac55 2508 /* If NB_NEEDCOMMIT is set, a commit RPC may do the trick. */
0a7de745 2509 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2d21ac55 2510 nfs_buf_check_write_verifier(np, bp);
0a7de745 2511 }
2d21ac55
A
2512 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2513 struct nfsmount *nmp = NFSTONMP(np);
fe8ab488 2514 if (nfs_mount_gone(nmp)) {
2d21ac55
A
2515 SET(bp->nb_flags, NB_ERROR);
2516 bp->nb_error = error = EIO;
2517 nfs_buf_iodone(bp);
2518 goto out;
2519 }
2520 SET(bp->nb_flags, NB_WRITEINPROG);
2521 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff,
0a7de745 2522 bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf);
2d21ac55
A
2523 CLR(bp->nb_flags, NB_WRITEINPROG);
2524 if (error) {
2525 if (error != NFSERR_STALEWRITEVERF) {
2526 SET(bp->nb_flags, NB_ERROR);
2527 bp->nb_error = error;
55e303ae 2528 }
2d21ac55
A
2529 nfs_buf_iodone(bp);
2530 goto out;
2531 }
2532 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2533 CLR(bp->nb_flags, NB_NEEDCOMMIT);
b0d623f7 2534 nfs_node_lock_force(np);
2d21ac55
A
2535 np->n_needcommitcnt--;
2536 CHECK_NEEDCOMMITCNT(np);
b0d623f7 2537 nfs_node_unlock(np);
2d21ac55
A
2538 }
2539 if (!error && (bp->nb_dirtyend > 0)) {
2540 /* sanity check the dirty range */
2541 if (NBOFF(bp) + bp->nb_dirtyend > (off_t) np->n_size) {
2542 bp->nb_dirtyend = np->n_size - NBOFF(bp);
0a7de745 2543 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2d21ac55 2544 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
0a7de745 2545 }
55e303ae 2546 }
91447636 2547 }
2d21ac55
A
2548 if (!error && (bp->nb_dirtyend > 0)) {
2549 /* there's a dirty range that needs to be written out */
2550 NFS_BUF_MAP(bp);
2551
2552 doff = bp->nb_dirtyoff;
2553 dend = bp->nb_dirtyend;
2554
2555 /* if doff page is dirty, move doff to start of page */
0a7de745 2556 if (NBPGDIRTY(bp, doff / PAGE_SIZE)) {
2d21ac55 2557 doff -= doff & PAGE_MASK;
0a7de745 2558 }
2d21ac55 2559 /* try to expand write range to include preceding dirty pages */
0a7de745
A
2560 if (!(doff & PAGE_MASK)) {
2561 while ((doff > 0) && NBPGDIRTY(bp, (doff - 1) / PAGE_SIZE)) {
2d21ac55 2562 doff -= PAGE_SIZE;
0a7de745
A
2563 }
2564 }
2d21ac55 2565 /* if dend page is dirty, move dend to start of next page */
0a7de745 2566 if ((dend & PAGE_MASK) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2d21ac55 2567 dend = round_page_32(dend);
0a7de745 2568 }
2d21ac55 2569 /* try to expand write range to include trailing dirty pages */
0a7de745
A
2570 if (!(dend & PAGE_MASK)) {
2571 while ((dend < (int)bp->nb_bufsize) && NBPGDIRTY(bp, dend / PAGE_SIZE)) {
2d21ac55 2572 dend += PAGE_SIZE;
0a7de745
A
2573 }
2574 }
2d21ac55 2575 /* make sure to keep dend clipped to EOF */
0a7de745 2576 if ((NBOFF(bp) + dend) > (off_t) np->n_size) {
2d21ac55 2577 dend = np->n_size - NBOFF(bp);
0a7de745 2578 }
2d21ac55
A
2579 /* calculate range of complete pages being written */
2580 firstpg = round_page_32(doff) / PAGE_SIZE;
2581 lastpg = (trunc_page_32(dend) - 1) / PAGE_SIZE;
2582 /* calculate mask for that page range */
2583 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
91447636 2584
fa4905b1 2585 /*
2d21ac55
A
2586 * compare page mask to nb_dirty; if there are other dirty pages
2587 * then write FILESYNC; otherwise, write UNSTABLE if async and
2588 * not needcommit/stable; otherwise write FILESYNC
fa4905b1 2589 */
0a7de745 2590 if (bp->nb_dirty & ~pagemask) {
2d21ac55 2591 iomode = NFS_WRITE_FILESYNC;
0a7de745 2592 } else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC) {
2d21ac55 2593 iomode = NFS_WRITE_UNSTABLE;
0a7de745 2594 } else {
2d21ac55 2595 iomode = NFS_WRITE_FILESYNC;
0a7de745 2596 }
55e303ae 2597
2d21ac55
A
2598 /* write the whole contiguous dirty range */
2599 bp->nb_offio = doff;
2600 bp->nb_endio = dend;
55e303ae 2601
316670eb 2602 OSAddAtomic64(1, &nfsstats.write_bios);
55e303ae 2603
2d21ac55
A
2604 SET(bp->nb_flags, NB_WRITEINPROG);
2605 error = nfs_buf_write_rpc(bp, iomode, thd, cred);
55e303ae 2606 /*
2d21ac55
A
2607 * For async I/O, the callbacks will finish up the
2608 * write and push out any dirty pages. Otherwise,
2609 * the write has already been finished and any dirty
2610 * pages pushed out.
55e303ae 2611 */
2d21ac55 2612 } else {
0a7de745 2613 if (!error && bp->nb_dirty) { /* write out any dirty pages */
2d21ac55 2614 error = nfs_buf_write_dirty_pages(bp, thd, cred);
0a7de745 2615 }
2d21ac55
A
2616 nfs_buf_iodone(bp);
2617 }
2618 /* note: bp is still valid only for !async case */
2619out:
2620 if (!async) {
2621 error = nfs_buf_iowait(bp);
2622 /* move to clean list */
2623 if (oldflags & NB_DELWRI) {
2624 lck_mtx_lock(nfs_buf_mutex);
0a7de745 2625 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2d21ac55 2626 LIST_REMOVE(bp, nb_vnbufs);
0a7de745 2627 }
2d21ac55
A
2628 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2629 lck_mtx_unlock(nfs_buf_mutex);
2630 }
2631 FSDBG_BOT(553, bp, NBOFF(bp), bp->nb_flags, error);
2632 nfs_buf_release(bp, 1);
2633 /* check if we need to invalidate (and we can) */
2634 if ((np->n_flag & NNEEDINVALIDATE) &&
0a7de745 2635 !(np->n_bflag & (NBINVALINPROG | NBFLUSHINPROG))) {
2d21ac55 2636 int invalidate = 0;
b0d623f7 2637 nfs_node_lock_force(np);
2d21ac55
A
2638 if (np->n_flag & NNEEDINVALIDATE) {
2639 invalidate = 1;
2640 np->n_flag &= ~NNEEDINVALIDATE;
55e303ae 2641 }
b0d623f7 2642 nfs_node_unlock(np);
2d21ac55
A
2643 if (invalidate) {
2644 /*
2645 * There was a write error and we need to
2646 * invalidate attrs and flush buffers in
2647 * order to sync up with the server.
2648 * (if this write was extending the file,
2649 * we may no longer know the correct size)
2650 *
2651 * But we couldn't call vinvalbuf while holding
2652 * the buffer busy. So we call vinvalbuf() after
2653 * releasing the buffer.
2654 */
0a7de745 2655 nfs_vinvalbuf2(NFSTOV(np), V_SAVE | V_IGNORE_WRITEERR, thd, cred, 1);
55e303ae 2656 }
55e303ae 2657 }
2d21ac55
A
2658 }
2659
0a7de745 2660 if (IS_VALID_CRED(cred)) {
2d21ac55 2661 kauth_cred_unref(&cred);
0a7de745
A
2662 }
2663 return error;
2d21ac55 2664}
55e303ae 2665
2d21ac55
A
2666/*
2667 * finish the writing of a buffer
2668 */
2669void
2670nfs_buf_write_finish(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2671{
2672 nfsnode_t np = bp->nb_np;
2673 int error = (bp->nb_flags & NB_ERROR) ? bp->nb_error : 0;
2674 int firstpg, lastpg;
2675 uint32_t pagemask;
2676
2677 if ((error == EINTR) || (error == ERESTART)) {
2678 CLR(bp->nb_flags, NB_ERROR);
2679 SET(bp->nb_flags, NB_EINTR);
2680 }
2681
2682 if (!error) {
2683 /* calculate range of complete pages being written */
2684 firstpg = round_page_32(bp->nb_offio) / PAGE_SIZE;
2685 lastpg = (trunc_page_32(bp->nb_endio) - 1) / PAGE_SIZE;
2686 /* calculate mask for that page range written */
2687 pagemask = ((1 << (lastpg + 1)) - 1) & ~((1 << firstpg) - 1);
2688 /* clear dirty bits for pages we've written */
2689 bp->nb_dirty &= ~pagemask;
2690 }
2691
2692 /* manage needcommit state */
2693 if (!error && (bp->nb_commitlevel == NFS_WRITE_UNSTABLE)) {
2694 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
b0d623f7 2695 nfs_node_lock_force(np);
2d21ac55 2696 np->n_needcommitcnt++;
b0d623f7 2697 nfs_node_unlock(np);
2d21ac55
A
2698 SET(bp->nb_flags, NB_NEEDCOMMIT);
2699 }
2700 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2701 bp->nb_dirtyoff = bp->nb_offio;
2702 bp->nb_dirtyend = bp->nb_endio;
2703 } else if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
b0d623f7 2704 nfs_node_lock_force(np);
2d21ac55
A
2705 np->n_needcommitcnt--;
2706 CHECK_NEEDCOMMITCNT(np);
b0d623f7 2707 nfs_node_unlock(np);
2d21ac55
A
2708 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2709 }
2710
2711 CLR(bp->nb_flags, NB_WRITEINPROG);
2712
2713 /*
2714 * For an unstable write, the buffer is still treated as dirty until
2715 * a commit (or stable (re)write) is performed. Buffers needing only
2716 * a commit are marked with the NB_DELWRI and NB_NEEDCOMMIT flags.
2717 *
2718 * If the write was interrupted we set NB_EINTR. Don't set NB_ERROR
2719 * because that would cause the buffer to be dropped. The buffer is
2720 * still valid and simply needs to be written again.
2721 */
2722 if ((error == EINTR) || (error == ERESTART) || (!error && (bp->nb_flags & NB_NEEDCOMMIT))) {
2723 CLR(bp->nb_flags, NB_INVAL);
2724 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2725 SET(bp->nb_flags, NB_DELWRI);
2726 lck_mtx_lock(nfs_buf_mutex);
2727 nfs_nbdwrite++;
2728 NFSBUFCNTCHK();
2729 lck_mtx_unlock(nfs_buf_mutex);
2730 }
fa4905b1 2731 /*
2d21ac55
A
2732 * Since for the NB_ASYNC case, we've reassigned the buffer to the
2733 * clean list, we have to reassign it back to the dirty one. Ugh.
fa4905b1 2734 */
2d21ac55
A
2735 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2736 /* move to dirty list */
2737 lck_mtx_lock(nfs_buf_mutex);
0a7de745 2738 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
2d21ac55 2739 LIST_REMOVE(bp, nb_vnbufs);
0a7de745 2740 }
2d21ac55
A
2741 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2742 lck_mtx_unlock(nfs_buf_mutex);
2743 }
2744 } else {
2745 /* either there's an error or we don't need to commit */
2746 if (error) {
2747 /*
2748 * There was a write error and we need to invalidate
2749 * attrs and flush buffers in order to sync up with the
2750 * server. (if this write was extending the file, we
2751 * may no longer know the correct size)
2752 *
2753 * But we can't call vinvalbuf while holding this
2754 * buffer busy. Set a flag to do it after releasing
2755 * the buffer.
2756 */
b0d623f7 2757 nfs_node_lock_force(np);
2d21ac55
A
2758 np->n_error = error;
2759 np->n_flag |= (NWRITEERR | NNEEDINVALIDATE);
2760 NATTRINVALIDATE(np);
b0d623f7 2761 nfs_node_unlock(np);
2d21ac55
A
2762 }
2763 /* clear the dirty range */
2764 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2765 }
55e303ae 2766
0a7de745 2767 if (!error && bp->nb_dirty) {
2d21ac55 2768 nfs_buf_write_dirty_pages(bp, thd, cred);
0a7de745 2769 }
2d21ac55
A
2770 nfs_buf_iodone(bp);
2771}
fa4905b1 2772
2d21ac55
A
2773/*
2774 * write out any pages marked dirty in a buffer
2775 *
2776 * We do use unstable writes and follow up with a commit.
2777 * If we catch the write verifier changing we'll restart
2778 * and do the writes filesync.
2779 */
2780int
2781nfs_buf_write_dirty_pages(struct nfsbuf *bp, thread_t thd, kauth_cred_t cred)
2782{
2783 nfsnode_t np = bp->nb_np;
2784 struct nfsmount *nmp = NFSTONMP(np);
2785 int error = 0, commit, iomode, iomode2, len, pg, count, npages, off;
2786 uint32_t dirty = bp->nb_dirty;
2787 uint64_t wverf;
b0d623f7 2788 uio_t auio;
0a7de745 2789 char uio_buf[UIO_SIZEOF(1)];
55e303ae 2790
0a7de745
A
2791 if (!bp->nb_dirty) {
2792 return 0;
2793 }
2d21ac55
A
2794
2795 /* there are pages marked dirty that need to be written out */
316670eb 2796 OSAddAtomic64(1, &nfsstats.write_bios);
2d21ac55
A
2797 NFS_BUF_MAP(bp);
2798 SET(bp->nb_flags, NB_WRITEINPROG);
2799 npages = bp->nb_bufsize / PAGE_SIZE;
2800 iomode = NFS_WRITE_UNSTABLE;
2801
b0d623f7 2802 auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE,
0a7de745 2803 &uio_buf, sizeof(uio_buf));
2d21ac55
A
2804
2805again:
2806 dirty = bp->nb_dirty;
2807 wverf = bp->nb_verf;
2808 commit = NFS_WRITE_FILESYNC;
2809 for (pg = 0; pg < npages; pg++) {
0a7de745 2810 if (!NBPGDIRTY(bp, pg)) {
2d21ac55 2811 continue;
0a7de745 2812 }
2d21ac55 2813 count = 1;
0a7de745 2814 while (((pg + count) < npages) && NBPGDIRTY(bp, pg + count)) {
2d21ac55 2815 count++;
0a7de745 2816 }
2d21ac55
A
2817 /* write count pages starting with page pg */
2818 off = pg * PAGE_SIZE;
2819 len = count * PAGE_SIZE;
2820 /* clip writes to EOF */
0a7de745 2821 if (NBOFF(bp) + off + len > (off_t) np->n_size) {
2d21ac55 2822 len -= (NBOFF(bp) + off + len) - np->n_size;
0a7de745 2823 }
2d21ac55
A
2824 if (len > 0) {
2825 iomode2 = iomode;
b0d623f7
A
2826 uio_reset(auio, NBOFF(bp) + off, UIO_SYSSPACE, UIO_WRITE);
2827 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + off), len);
2828 error = nfs_write_rpc2(np, auio, thd, cred, &iomode2, &bp->nb_verf);
0a7de745 2829 if (error) {
2d21ac55 2830 break;
0a7de745
A
2831 }
2832 if (iomode2 < commit) { /* Retain the lowest commitment level returned. */
2d21ac55 2833 commit = iomode2;
0a7de745 2834 }
2d21ac55
A
2835 if ((commit != NFS_WRITE_FILESYNC) && (wverf != bp->nb_verf)) {
2836 /* verifier changed, redo all the writes filesync */
2837 iomode = NFS_WRITE_FILESYNC;
2838 goto again;
fa4905b1
A
2839 }
2840 }
2d21ac55
A
2841 /* clear dirty bits */
2842 while (count--) {
2843 dirty &= ~(1 << pg);
0a7de745 2844 if (count) { /* leave pg on last page */
2d21ac55 2845 pg++;
0a7de745 2846 }
2d21ac55
A
2847 }
2848 }
2849 CLR(bp->nb_flags, NB_WRITEINPROG);
2850
2851 if (!error && (commit != NFS_WRITE_FILESYNC)) {
6d2010ae 2852 error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf);
2d21ac55
A
2853 if (error == NFSERR_STALEWRITEVERF) {
2854 /* verifier changed, so we need to restart all the writes */
2855 iomode = NFS_WRITE_FILESYNC;
2856 goto again;
2857 }
2858 }
2859 if (!error) {
2860 bp->nb_dirty = dirty;
2861 } else {
2862 SET(bp->nb_flags, NB_ERROR);
2863 bp->nb_error = error;
2864 }
0a7de745 2865 return error;
2d21ac55
A
2866}
2867
2868/*
2869 * initiate the NFS WRITE RPC(s) for a buffer
2870 */
2871int
2872nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred)
2873{
2874 struct nfsmount *nmp;
2875 nfsnode_t np = bp->nb_np;
2876 int error = 0, nfsvers, async;
b0d623f7
A
2877 int offset, nrpcs;
2878 uint32_t nmwsize, length, len;
2d21ac55
A
2879 struct nfsreq *req;
2880 struct nfsreq_cbinfo cb;
b0d623f7 2881 uio_t auio;
0a7de745 2882 char uio_buf[UIO_SIZEOF(1)];
2d21ac55
A
2883
2884 nmp = NFSTONMP(np);
fe8ab488 2885 if (nfs_mount_gone(nmp)) {
2d21ac55
A
2886 bp->nb_error = error = ENXIO;
2887 SET(bp->nb_flags, NB_ERROR);
2888 nfs_buf_iodone(bp);
0a7de745 2889 return error;
2d21ac55
A
2890 }
2891 nfsvers = nmp->nm_vers;
2892 nmwsize = nmp->nm_wsize;
2893
2894 offset = bp->nb_offio;
2895 length = bp->nb_endio - bp->nb_offio;
2896
2897 /* Note: Can only do async I/O if nfsiods are configured. */
2898 async = (bp->nb_flags & NB_ASYNC) && (NFSIOD_MAX > 0);
2899 bp->nb_commitlevel = NFS_WRITE_FILESYNC;
2900 cb.rcb_func = async ? nfs_buf_write_rpc_finish : NULL;
2901 cb.rcb_bp = bp;
2902
2903 if ((nfsvers == NFS_VER2) && ((NBOFF(bp) + bp->nb_endio) > 0xffffffffLL)) {
2904 bp->nb_error = error = EFBIG;
2905 SET(bp->nb_flags, NB_ERROR);
2906 nfs_buf_iodone(bp);
0a7de745 2907 return error;
2d21ac55
A
2908 }
2909
b0d623f7 2910 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
0a7de745 2911 UIO_WRITE, &uio_buf, sizeof(uio_buf));
b0d623f7 2912 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
2d21ac55
A
2913
2914 bp->nb_rpcs = nrpcs = (length + nmwsize - 1) / nmwsize;
2915 if (async && (nrpcs > 1)) {
2916 SET(bp->nb_flags, NB_MULTASYNCRPC);
2917 } else {
2918 CLR(bp->nb_flags, NB_MULTASYNCRPC);
2919 }
2920
2921 while (length > 0) {
2922 if (ISSET(bp->nb_flags, NB_ERROR)) {
2923 error = bp->nb_error;
2924 break;
2925 }
2926 len = (length > nmwsize) ? nmwsize : length;
2927 cb.rcb_args[0] = offset;
2928 cb.rcb_args[1] = len;
cb323159 2929#if CONFIG_NFS4
0a7de745 2930 if (nmp->nm_vers >= NFS_VER4) {
b0d623f7 2931 cb.rcb_args[2] = nmp->nm_stategenid;
0a7de745 2932 }
cb323159 2933#endif
0a7de745 2934 if (async && ((error = nfs_async_write_start(nmp)))) {
2d21ac55 2935 break;
0a7de745 2936 }
2d21ac55 2937 req = NULL;
b0d623f7 2938 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, len, thd, cred,
0a7de745 2939 iomode, &cb, &req);
2d21ac55 2940 if (error) {
0a7de745 2941 if (async) {
2d21ac55 2942 nfs_async_write_done(nmp);
0a7de745 2943 }
2d21ac55
A
2944 break;
2945 }
2946 offset += len;
2947 length -= len;
0a7de745 2948 if (async) {
2d21ac55 2949 continue;
0a7de745 2950 }
2d21ac55
A
2951 nfs_buf_write_rpc_finish(req);
2952 }
2953
2954 if (length > 0) {
fa4905b1 2955 /*
2d21ac55
A
2956 * Something bad happened while trying to send the RPCs.
2957 * Wait for any outstanding requests to complete.
fa4905b1 2958 */
2d21ac55
A
2959 bp->nb_error = error;
2960 SET(bp->nb_flags, NB_ERROR);
2961 if (ISSET(bp->nb_flags, NB_MULTASYNCRPC)) {
2962 nrpcs = (length + nmwsize - 1) / nmwsize;
2963 lck_mtx_lock(nfs_buf_mutex);
2964 bp->nb_rpcs -= nrpcs;
2965 if (bp->nb_rpcs == 0) {
2966 /* No RPCs left, so the buffer's done */
2967 lck_mtx_unlock(nfs_buf_mutex);
2968 nfs_buf_write_finish(bp, thd, cred);
2969 } else {
2970 /* wait for the last RPC to mark it done */
0a7de745 2971 while (bp->nb_rpcs > 0) {
2d21ac55 2972 msleep(&bp->nb_rpcs, nfs_buf_mutex, 0,
0a7de745
A
2973 "nfs_buf_write_rpc_cancel", NULL);
2974 }
2d21ac55 2975 lck_mtx_unlock(nfs_buf_mutex);
fa4905b1 2976 }
2d21ac55
A
2977 } else {
2978 nfs_buf_write_finish(bp, thd, cred);
2979 }
6d2010ae 2980 /* It may have just been an interrupt... that's OK */
0a7de745 2981 if (!ISSET(bp->nb_flags, NB_ERROR)) {
6d2010ae 2982 error = 0;
0a7de745 2983 }
2d21ac55 2984 }
55e303ae 2985
0a7de745 2986 return error;
2d21ac55
A
2987}
2988
2989/*
2990 * finish up an NFS WRITE RPC on a buffer
2991 */
2992void
2993nfs_buf_write_rpc_finish(struct nfsreq *req)
2994{
2995 int error = 0, nfsvers, offset, length, multasyncrpc, finished;
2996 int committed = NFS_WRITE_FILESYNC;
2997 uint64_t wverf = 0;
2998 size_t rlen;
2999 void *wakeme = NULL;
3000 struct nfsreq_cbinfo cb;
3001 struct nfsreq *wreq = NULL;
3002 struct nfsbuf *bp;
3003 struct nfsmount *nmp;
3004 nfsnode_t np;
3005 thread_t thd;
3006 kauth_cred_t cred;
b0d623f7 3007 uio_t auio;
0a7de745 3008 char uio_buf[UIO_SIZEOF(1)];
2d21ac55
A
3009
3010finish:
3011 np = req->r_np;
3012 thd = req->r_thread;
3013 cred = req->r_cred;
0a7de745 3014 if (IS_VALID_CRED(cred)) {
2d21ac55 3015 kauth_cred_ref(cred);
0a7de745 3016 }
2d21ac55
A
3017 cb = req->r_callback;
3018 bp = cb.rcb_bp;
0a7de745 3019 if (cb.rcb_func) { /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */
6d2010ae 3020 nfs_request_ref(req, 0);
0a7de745 3021 }
2d21ac55
A
3022
3023 nmp = NFSTONMP(np);
fe8ab488 3024 if (nfs_mount_gone(nmp)) {
2d21ac55
A
3025 SET(bp->nb_flags, NB_ERROR);
3026 bp->nb_error = error = ENXIO;
3027 }
3028 if (error || ISSET(bp->nb_flags, NB_ERROR)) {
3029 /* just drop it */
3030 nfs_request_async_cancel(req);
3031 goto out;
3032 }
3033 nfsvers = nmp->nm_vers;
3034
3035 offset = cb.rcb_args[0];
3036 rlen = length = cb.rcb_args[1];
3037
3038 /* finish the RPC */
3039 error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf);
3040 if ((error == EINPROGRESS) && cb.rcb_func) {
3041 /* async request restarted */
0a7de745 3042 if (cb.rcb_func) {
6d2010ae 3043 nfs_request_rele(req);
0a7de745
A
3044 }
3045 if (IS_VALID_CRED(cred)) {
2d21ac55 3046 kauth_cred_unref(&cred);
0a7de745 3047 }
2d21ac55
A
3048 return;
3049 }
cb323159 3050#if CONFIG_NFS4
b0d623f7
A
3051 if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) {
3052 lck_mtx_lock(&nmp->nm_lock);
6d2010ae
A
3053 if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) {
3054 NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery",
0a7de745 3055 error, NBOFF(bp) + offset, cb.rcb_args[2], nmp->nm_stategenid);
6d2010ae 3056 nfs_need_recover(nmp, error);
b0d623f7
A
3057 }
3058 lck_mtx_unlock(&nmp->nm_lock);
6d2010ae
A
3059 if (np->n_flag & NREVOKE) {
3060 error = EIO;
3061 } else {
3062 if (error == NFSERR_GRACE) {
3063 if (cb.rcb_func) {
3064 /*
3065 * For an async I/O request, handle a grace delay just like
3066 * jukebox errors. Set the resend time and queue it up.
3067 */
3068 struct timeval now;
3069 if (req->r_nmrep.nmc_mhead) {
3070 mbuf_freem(req->r_nmrep.nmc_mhead);
3071 req->r_nmrep.nmc_mhead = NULL;
3072 }
3073 req->r_error = 0;
3074 microuptime(&now);
3075 lck_mtx_lock(&req->r_mtx);
3076 req->r_resendtime = now.tv_sec + 2;
3077 req->r_xid = 0; // get a new XID
3078 req->r_flags |= R_RESTART;
3079 req->r_start = 0;
3080 nfs_asyncio_resend(req);
3081 lck_mtx_unlock(&req->r_mtx);
0a7de745 3082 if (IS_VALID_CRED(cred)) {
6d2010ae 3083 kauth_cred_unref(&cred);
0a7de745 3084 }
6d2010ae
A
3085 /* Note: nfsreq reference taken will be dropped later when finished */
3086 return;
3087 }
3088 /* otherwise, just pause a couple seconds and retry */
0a7de745 3089 tsleep(&nmp->nm_state, (PZERO - 1), "nfsgrace", 2 * hz);
6d2010ae
A
3090 }
3091 if (!(error = nfs_mount_state_wait_for_recovery(nmp))) {
3092 rlen = 0;
3093 goto writeagain;
3094 }
b0d623f7
A
3095 }
3096 }
cb323159 3097#endif
2d21ac55
A
3098 if (error) {
3099 SET(bp->nb_flags, NB_ERROR);
3100 bp->nb_error = error;
3101 }
0a7de745 3102 if (error || (nfsvers == NFS_VER2)) {
2d21ac55 3103 goto out;
0a7de745 3104 }
2d21ac55
A
3105 if (rlen <= 0) {
3106 SET(bp->nb_flags, NB_ERROR);
3107 bp->nb_error = error = EIO;
3108 goto out;
3109 }
3110
3111 /* save lowest commit level returned */
0a7de745 3112 if (committed < bp->nb_commitlevel) {
2d21ac55 3113 bp->nb_commitlevel = committed;
0a7de745 3114 }
2d21ac55
A
3115
3116 /* check the write verifier */
3117 if (!bp->nb_verf) {
3118 bp->nb_verf = wverf;
3119 } else if (bp->nb_verf != wverf) {
3120 /* verifier changed, so buffer will need to be rewritten */
3121 bp->nb_flags |= NB_STALEWVERF;
3122 bp->nb_commitlevel = NFS_WRITE_UNSTABLE;
3123 bp->nb_verf = wverf;
3124 }
3125
3126 /*
3127 * check for a short write
3128 *
3129 * If the server didn't write all the data, then we
3130 * need to issue another write for the rest of it.
3131 * (Don't bother if the buffer hit an error or stale wverf.)
3132 */
0a7de745 3133 if (((int)rlen < length) && !(bp->nb_flags & (NB_STALEWVERF | NB_ERROR))) {
cb323159 3134#if CONFIG_NFS4
b0d623f7 3135writeagain:
cb323159 3136#endif
2d21ac55
A
3137 offset += rlen;
3138 length -= rlen;
3139
b0d623f7 3140 auio = uio_createwithbuffer(1, NBOFF(bp) + offset, UIO_SYSSPACE,
0a7de745 3141 UIO_WRITE, &uio_buf, sizeof(uio_buf));
b0d623f7 3142 uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + offset), length);
55e303ae 3143
2d21ac55
A
3144 cb.rcb_args[0] = offset;
3145 cb.rcb_args[1] = length;
cb323159 3146#if CONFIG_NFS4
0a7de745 3147 if (nmp->nm_vers >= NFS_VER4) {
b0d623f7 3148 cb.rcb_args[2] = nmp->nm_stategenid;
0a7de745 3149 }
cb323159 3150#endif
b0d623f7
A
3151 // XXX iomode should really match the original request
3152 error = nmp->nm_funcs->nf_write_rpc_async(np, auio, length, thd, cred,
0a7de745 3153 NFS_WRITE_FILESYNC, &cb, &wreq);
2d21ac55 3154 if (!error) {
0a7de745 3155 if (IS_VALID_CRED(cred)) {
2d21ac55 3156 kauth_cred_unref(&cred);
0a7de745 3157 }
2d21ac55
A
3158 if (!cb.rcb_func) {
3159 /* if !async we'll need to wait for this RPC to finish */
3160 req = wreq;
b0d623f7 3161 wreq = NULL;
2d21ac55 3162 goto finish;
fa4905b1 3163 }
6d2010ae 3164 nfs_request_rele(req);
2d21ac55
A
3165 /*
3166 * We're done here.
3167 * Outstanding RPC count is unchanged.
3168 * Callback will be called when RPC is done.
3169 */
3170 return;
fa4905b1 3171 }
2d21ac55
A
3172 SET(bp->nb_flags, NB_ERROR);
3173 bp->nb_error = error;
3174 }
55e303ae 3175
2d21ac55 3176out:
6d2010ae 3177 if (cb.rcb_func) {
2d21ac55 3178 nfs_async_write_done(nmp);
6d2010ae
A
3179 nfs_request_rele(req);
3180 }
2d21ac55
A
3181 /*
3182 * Decrement outstanding RPC count on buffer
3183 * and call nfs_buf_write_finish on last RPC.
3184 *
3185 * (Note: when there are multiple async RPCs issued for a
3186 * buffer we need nfs_buffer_mutex to avoid problems when
3187 * aborting a partially-initiated set of RPCs)
3188 */
3189 multasyncrpc = ISSET(bp->nb_flags, NB_MULTASYNCRPC);
0a7de745 3190 if (multasyncrpc) {
2d21ac55 3191 lck_mtx_lock(nfs_buf_mutex);
0a7de745 3192 }
2d21ac55
A
3193
3194 bp->nb_rpcs--;
3195 finished = (bp->nb_rpcs == 0);
55e303ae 3196
0a7de745 3197 if (multasyncrpc) {
2d21ac55 3198 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 3199 }
2d21ac55
A
3200
3201 if (finished) {
0a7de745 3202 if (multasyncrpc) {
2d21ac55 3203 wakeme = &bp->nb_rpcs;
0a7de745 3204 }
2d21ac55 3205 nfs_buf_write_finish(bp, thd, cred);
0a7de745 3206 if (wakeme) {
2d21ac55 3207 wakeup(wakeme);
0a7de745 3208 }
2d21ac55
A
3209 }
3210
0a7de745 3211 if (IS_VALID_CRED(cred)) {
2d21ac55 3212 kauth_cred_unref(&cred);
0a7de745 3213 }
2d21ac55
A
3214}
3215
3216/*
0a7de745 3217 * Send commit(s) for the given node's "needcommit" buffers
2d21ac55
A
3218 */
3219int
3220nfs_flushcommits(nfsnode_t np, int nowait)
3221{
3222 struct nfsmount *nmp;
b0d623f7 3223 struct nfsbuf *bp, *prevlbp, *lbp;
2d21ac55
A
3224 struct nfsbuflists blist, commitlist;
3225 int error = 0, retv, wcred_set, flags, dirty;
3226 u_quad_t off, endoff, toff;
6d2010ae 3227 uint64_t wverf;
2d21ac55
A
3228 u_int32_t count;
3229 kauth_cred_t wcred = NULL;
3230
3231 FSDBG_TOP(557, np, 0, 0, 0);
3232
3233 /*
3234 * A nb_flags == (NB_DELWRI | NB_NEEDCOMMIT) block has been written to the
3235 * server, but nas not been committed to stable storage on the server
3236 * yet. The byte range is worked out for as many nfsbufs as we can handle
3237 * and the commit rpc is done.
3238 */
3239 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
b0d623f7 3240 error = nfs_node_lock(np);
0a7de745 3241 if (error) {
2d21ac55 3242 goto done;
0a7de745 3243 }
1c79356b 3244 np->n_flag |= NMODIFIED;
b0d623f7 3245 nfs_node_unlock(np);
2d21ac55 3246 }
1c79356b 3247
2d21ac55
A
3248 off = (u_quad_t)-1;
3249 endoff = 0;
3250 wcred_set = 0;
3251 LIST_INIT(&commitlist);
3252
3253 nmp = NFSTONMP(np);
fe8ab488 3254 if (nfs_mount_gone(nmp)) {
2d21ac55
A
3255 error = ENXIO;
3256 goto done;
3257 }
3258 if (nmp->nm_vers == NFS_VER2) {
3259 error = EINVAL;
3260 goto done;
3261 }
3262
3263 flags = NBI_DIRTY;
0a7de745 3264 if (nowait) {
2d21ac55 3265 flags |= NBI_NOWAIT;
0a7de745 3266 }
2d21ac55 3267 lck_mtx_lock(nfs_buf_mutex);
6d2010ae 3268 wverf = nmp->nm_verf;
2d21ac55
A
3269 if (!nfs_buf_iterprepare(np, &blist, flags)) {
3270 while ((bp = LIST_FIRST(&blist))) {
3271 LIST_REMOVE(bp, nb_vnbufs);
3272 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3273 error = nfs_buf_acquire(bp, NBAC_NOWAIT, 0, 0);
0a7de745 3274 if (error) {
2d21ac55 3275 continue;
0a7de745
A
3276 }
3277 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2d21ac55 3278 nfs_buf_check_write_verifier(np, bp);
0a7de745 3279 }
6d2010ae
A
3280 if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) ||
3281 (bp->nb_verf != wverf)) {
2d21ac55
A
3282 nfs_buf_drop(bp);
3283 continue;
3284 }
3285 nfs_buf_remfree(bp);
b0d623f7
A
3286
3287 /* buffer UPLs will be grabbed *in order* below */
2d21ac55
A
3288
3289 FSDBG(557, bp, bp->nb_flags, bp->nb_valid, bp->nb_dirty);
3290 FSDBG(557, bp->nb_validoff, bp->nb_validend,
0a7de745 3291 bp->nb_dirtyoff, bp->nb_dirtyend);
55e303ae 3292
2d21ac55
A
3293 /*
3294 * Work out if all buffers are using the same cred
3295 * so we can deal with them all with one commit.
3296 *
3297 * Note: creds in bp's must be obtained by kauth_cred_ref
3298 * on the same original cred in order for them to be equal.
3299 */
3300 if (wcred_set == 0) {
3301 wcred = bp->nb_wcred;
0a7de745 3302 if (!IS_VALID_CRED(wcred)) {
2d21ac55 3303 panic("nfs: needcommit w/out wcred");
0a7de745 3304 }
2d21ac55
A
3305 wcred_set = 1;
3306 } else if ((wcred_set == 1) && wcred != bp->nb_wcred) {
3307 wcred_set = -1;
3308 }
3309 SET(bp->nb_flags, NB_WRITEINPROG);
3310
3311 /*
b0d623f7
A
3312 * Add this buffer to the list of buffers we are committing.
3313 * Buffers are inserted into the list in ascending order so that
3314 * we can take the UPLs in order after the list is complete.
2d21ac55 3315 */
b0d623f7
A
3316 prevlbp = NULL;
3317 LIST_FOREACH(lbp, &commitlist, nb_vnbufs) {
0a7de745 3318 if (bp->nb_lblkno < lbp->nb_lblkno) {
b0d623f7 3319 break;
0a7de745 3320 }
b0d623f7
A
3321 prevlbp = lbp;
3322 }
2d21ac55 3323 LIST_REMOVE(bp, nb_vnbufs);
0a7de745 3324 if (prevlbp) {
b0d623f7 3325 LIST_INSERT_AFTER(prevlbp, bp, nb_vnbufs);
0a7de745 3326 } else {
b0d623f7 3327 LIST_INSERT_HEAD(&commitlist, bp, nb_vnbufs);
0a7de745 3328 }
b0d623f7
A
3329
3330 /* update commit range start, end */
2d21ac55 3331 toff = NBOFF(bp) + bp->nb_dirtyoff;
0a7de745 3332 if (toff < off) {
2d21ac55 3333 off = toff;
0a7de745 3334 }
2d21ac55 3335 toff += (u_quad_t)(bp->nb_dirtyend - bp->nb_dirtyoff);
0a7de745 3336 if (toff > endoff) {
2d21ac55 3337 endoff = toff;
0a7de745 3338 }
2d21ac55
A
3339 }
3340 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3341 }
3342 lck_mtx_unlock(nfs_buf_mutex);
3343
3344 if (LIST_EMPTY(&commitlist)) {
3345 error = ENOBUFS;
3346 goto done;
3347 }
3348
b0d623f7
A
3349 /*
3350 * We need a UPL to prevent others from accessing the buffers during
3351 * our commit RPC(s).
3352 *
3353 * We used to also check for dirty pages here; if there were any we'd
3354 * abort the commit and force the entire buffer to be written again.
3355 * Instead of doing that, we just go ahead and commit the dirty range,
3356 * and then leave the buffer around with dirty pages that will be
3357 * written out later.
3358 */
3359 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3360 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3361 retv = nfs_buf_upl_setup(bp);
3362 if (retv) {
3363 /* Unable to create the UPL, the VM object probably no longer exists. */
3364 printf("nfs_flushcommits: upl create failed %d\n", retv);
3365 bp->nb_valid = bp->nb_dirty = 0;
3366 }
3367 }
3368 nfs_buf_upl_check(bp);
3369 }
3370
2d21ac55
A
3371 /*
3372 * Commit data on the server, as required.
3373 * If all bufs are using the same wcred, then use that with
3374 * one call for all of them, otherwise commit each one
3375 * separately.
3376 */
3377 if (wcred_set == 1) {
3378 /*
3379 * Note, it's possible the commit range could be >2^32-1.
3380 * If it is, we'll send one commit that covers the whole file.
3381 */
0a7de745 3382 if ((endoff - off) > 0xffffffff) {
2d21ac55 3383 count = 0;
0a7de745 3384 } else {
2d21ac55 3385 count = (endoff - off);
0a7de745 3386 }
6d2010ae 3387 retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf);
2d21ac55
A
3388 } else {
3389 retv = 0;
3390 LIST_FOREACH(bp, &commitlist, nb_vnbufs) {
3391 toff = NBOFF(bp) + bp->nb_dirtyoff;
3392 count = bp->nb_dirtyend - bp->nb_dirtyoff;
6d2010ae 3393 retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf);
0a7de745 3394 if (retv) {
2d21ac55 3395 break;
0a7de745 3396 }
55e303ae 3397 }
2d21ac55
A
3398 }
3399
3400 /*
3401 * Now, either mark the blocks I/O done or mark the
3402 * blocks dirty, depending on whether the commit
3403 * succeeded.
3404 */
3405 while ((bp = LIST_FIRST(&commitlist))) {
3406 LIST_REMOVE(bp, nb_vnbufs);
3407 FSDBG(557, bp, retv, bp->nb_flags, bp->nb_dirty);
b0d623f7 3408 nfs_node_lock_force(np);
2d21ac55
A
3409 CLR(bp->nb_flags, (NB_NEEDCOMMIT | NB_WRITEINPROG));
3410 np->n_needcommitcnt--;
3411 CHECK_NEEDCOMMITCNT(np);
b0d623f7 3412 nfs_node_unlock(np);
2d21ac55
A
3413
3414 if (retv) {
3415 /* move back to dirty list */
3416 lck_mtx_lock(nfs_buf_mutex);
3417 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3418 lck_mtx_unlock(nfs_buf_mutex);
3419 nfs_buf_release(bp, 1);
3420 continue;
1c79356b 3421 }
2d21ac55 3422
b0d623f7
A
3423 nfs_node_lock_force(np);
3424 np->n_numoutput++;
3425 nfs_node_unlock(np);
2d21ac55
A
3426 vnode_startwrite(NFSTOV(np));
3427 if (ISSET(bp->nb_flags, NB_DELWRI)) {
3428 lck_mtx_lock(nfs_buf_mutex);
3429 nfs_nbdwrite--;
3430 NFSBUFCNTCHK();
3431 lck_mtx_unlock(nfs_buf_mutex);
3432 wakeup(&nfs_nbdwrite);
1c79356b 3433 }
0a7de745 3434 CLR(bp->nb_flags, (NB_READ | NB_DONE | NB_ERROR | NB_DELWRI));
2d21ac55
A
3435 /* if block still has dirty pages, we don't want it to */
3436 /* be released in nfs_buf_iodone(). So, don't set NB_ASYNC. */
0a7de745 3437 if (!(dirty = bp->nb_dirty)) {
2d21ac55 3438 SET(bp->nb_flags, NB_ASYNC);
0a7de745 3439 } else {
2d21ac55 3440 CLR(bp->nb_flags, NB_ASYNC);
0a7de745 3441 }
1c79356b 3442
2d21ac55
A
3443 /* move to clean list */
3444 lck_mtx_lock(nfs_buf_mutex);
3445 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3446 lck_mtx_unlock(nfs_buf_mutex);
3447
3448 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3449
3450 nfs_buf_iodone(bp);
3451 if (dirty) {
3452 /* throw it back in as a delayed write buffer */
3453 CLR(bp->nb_flags, NB_DONE);
3454 nfs_buf_write_delayed(bp);
55e303ae 3455 }
2d21ac55 3456 }
1c79356b 3457
2d21ac55
A
3458done:
3459 FSDBG_BOT(557, np, 0, 0, error);
0a7de745 3460 return error;
2d21ac55
A
3461}
3462
3463/*
3464 * Flush all the blocks associated with a vnode.
0a7de745 3465 * Walk through the buffer pool and push any dirty pages
2d21ac55
A
3466 * associated with the vnode.
3467 */
3468int
3469nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr)
3470{
3471 struct nfsbuf *bp;
3472 struct nfsbuflists blist;
3473 struct nfsmount *nmp = NFSTONMP(np);
3474 int error = 0, error2, slptimeo = 0, slpflag = 0;
3475 int nfsvers, flags, passone = 1;
3476
3477 FSDBG_TOP(517, np, waitfor, ignore_writeerr, 0);
3478
fe8ab488 3479 if (nfs_mount_gone(nmp)) {
2d21ac55
A
3480 error = ENXIO;
3481 goto out;
3482 }
3483 nfsvers = nmp->nm_vers;
0a7de745 3484 if (NMFLAG(nmp, INTR)) {
2d21ac55 3485 slpflag = PCATCH;
0a7de745 3486 }
2d21ac55
A
3487
3488 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
b0d623f7 3489 nfs_node_lock_force(np);
2d21ac55 3490 np->n_flag |= NMODIFIED;
b0d623f7 3491 nfs_node_unlock(np);
2d21ac55
A
3492 }
3493
3494 lck_mtx_lock(nfs_buf_mutex);
3495 while (np->n_bflag & NBFLUSHINPROG) {
3496 np->n_bflag |= NBFLUSHWANT;
6d2010ae
A
3497 error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL);
3498 if ((error && (error != EWOULDBLOCK)) ||
3499 ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) {
2d21ac55
A
3500 lck_mtx_unlock(nfs_buf_mutex);
3501 goto out;
3502 }
3503 }
3504 np->n_bflag |= NBFLUSHINPROG;
3505
3506 /*
3507 * On the first pass, start async/unstable writes on all
3508 * delayed write buffers. Then wait for all writes to complete
3509 * and call nfs_flushcommits() to commit any uncommitted buffers.
3510 * On all subsequent passes, start STABLE writes on any remaining
3511 * dirty buffers. Then wait for all writes to complete.
3512 */
3513again:
3514 FSDBG(518, LIST_FIRST(&np->n_dirtyblkhd), np->n_flag, 0, 0);
3515 if (!NFSTONMP(np)) {
3516 lck_mtx_unlock(nfs_buf_mutex);
3517 error = ENXIO;
3518 goto done;
3519 }
3520
3521 /* Start/do any write(s) that are required. */
3522 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3523 while ((bp = LIST_FIRST(&blist))) {
3524 LIST_REMOVE(bp, nb_vnbufs);
3525 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
b0d623f7 3526 flags = (passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) ? NBAC_NOWAIT : 0;
0a7de745 3527 if (flags != NBAC_NOWAIT) {
2d21ac55 3528 nfs_buf_refget(bp);
0a7de745 3529 }
2d21ac55
A
3530 while ((error = nfs_buf_acquire(bp, flags, slpflag, slptimeo))) {
3531 FSDBG(524, bp, flags, bp->nb_lflags, bp->nb_flags);
0a7de745 3532 if (error == EBUSY) {
2d21ac55 3533 break;
0a7de745 3534 }
2d21ac55
A
3535 if (error) {
3536 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
3537 if (error2) {
0a7de745 3538 if (flags != NBAC_NOWAIT) {
2d21ac55 3539 nfs_buf_refrele(bp);
0a7de745 3540 }
2d21ac55
A
3541 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3542 lck_mtx_unlock(nfs_buf_mutex);
3543 error = error2;
3544 goto done;
3545 }
3546 if (slpflag == PCATCH) {
3547 slpflag = 0;
3548 slptimeo = 2 * hz;
3549 }
3550 }
3551 }
0a7de745 3552 if (flags != NBAC_NOWAIT) {
2d21ac55 3553 nfs_buf_refrele(bp);
0a7de745
A
3554 }
3555 if (error == EBUSY) {
2d21ac55 3556 continue;
0a7de745 3557 }
2d21ac55
A
3558 if (!bp->nb_np) {
3559 /* buffer is no longer valid */
3560 nfs_buf_drop(bp);
3561 continue;
3562 }
0a7de745 3563 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2d21ac55 3564 nfs_buf_check_write_verifier(np, bp);
0a7de745 3565 }
2d21ac55
A
3566 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3567 /* buffer is no longer dirty */
3568 nfs_buf_drop(bp);
3569 continue;
3570 }
3571 FSDBG(525, bp, passone, bp->nb_lflags, bp->nb_flags);
b0d623f7 3572 if ((passone || !(waitfor == MNT_WAIT || waitfor == MNT_DWAIT)) &&
2d21ac55
A
3573 ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3574 nfs_buf_drop(bp);
3575 continue;
3576 }
3577 nfs_buf_remfree(bp);
3578 lck_mtx_unlock(nfs_buf_mutex);
3579 if (ISSET(bp->nb_flags, NB_ERROR)) {
b0d623f7 3580 nfs_node_lock_force(np);
2d21ac55
A
3581 np->n_error = bp->nb_error ? bp->nb_error : EIO;
3582 np->n_flag |= NWRITEERR;
b0d623f7 3583 nfs_node_unlock(np);
2d21ac55
A
3584 nfs_buf_release(bp, 1);
3585 lck_mtx_lock(nfs_buf_mutex);
3586 continue;
3587 }
3588 SET(bp->nb_flags, NB_ASYNC);
3589 if (!passone) {
3590 /* NB_STABLE forces this to be written FILESYNC */
3591 SET(bp->nb_flags, NB_STABLE);
3592 }
3593 nfs_buf_write(bp);
3594 lck_mtx_lock(nfs_buf_mutex);
3595 }
3596 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3597 }
3598 lck_mtx_unlock(nfs_buf_mutex);
3599
b0d623f7 3600 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
0a7de745
A
3601 while ((error = vnode_waitforwrites(NFSTOV(np), 0, slpflag, slptimeo, "nfsflush"))) {
3602 error2 = nfs_sigintr(NFSTONMP(np), NULL, thd, 0);
2d21ac55 3603 if (error2) {
0a7de745 3604 error = error2;
2d21ac55
A
3605 goto done;
3606 }
3607 if (slpflag == PCATCH) {
3608 slpflag = 0;
3609 slptimeo = 2 * hz;
55e303ae 3610 }
2d21ac55
A
3611 }
3612 }
55e303ae 3613
2d21ac55
A
3614 if (nfsvers != NFS_VER2) {
3615 /* loop while it looks like there are still buffers to be */
3616 /* commited and nfs_flushcommits() seems to be handling them. */
0a7de745
A
3617 while (np->n_needcommitcnt) {
3618 if (nfs_flushcommits(np, 0)) {
2d21ac55 3619 break;
0a7de745
A
3620 }
3621 }
2d21ac55 3622 }
55e303ae 3623
2d21ac55
A
3624 if (passone) {
3625 passone = 0;
3626 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
b0d623f7 3627 nfs_node_lock_force(np);
2d21ac55 3628 np->n_flag |= NMODIFIED;
b0d623f7 3629 nfs_node_unlock(np);
2d21ac55
A
3630 }
3631 lck_mtx_lock(nfs_buf_mutex);
3632 goto again;
3633 }
55e303ae 3634
b0d623f7 3635 if (waitfor == MNT_WAIT || waitfor == MNT_DWAIT) {
2d21ac55 3636 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
b0d623f7 3637 nfs_node_lock_force(np);
2d21ac55 3638 np->n_flag |= NMODIFIED;
b0d623f7 3639 nfs_node_unlock(np);
2d21ac55
A
3640 }
3641 lck_mtx_lock(nfs_buf_mutex);
0a7de745 3642 if (!LIST_EMPTY(&np->n_dirtyblkhd)) {
2d21ac55 3643 goto again;
0a7de745 3644 }
2d21ac55 3645 lck_mtx_unlock(nfs_buf_mutex);
b0d623f7
A
3646 nfs_node_lock_force(np);
3647 /*
3648 * OK, it looks like there are no dirty blocks. If we have no
3649 * writes in flight and no one in the write code, we can clear
3650 * the modified flag. In order to make sure we see the latest
3651 * attributes and size, we also invalidate the attributes and
3652 * advance the attribute cache XID to guarantee that attributes
3653 * newer than our clearing of NMODIFIED will get loaded next.
3654 * (If we don't do this, it's possible for the flush's final
3655 * write/commit (xid1) to be executed in parallel with a subsequent
3656 * getattr request (xid2). The getattr could return attributes
3657 * from *before* the write/commit completed but the stale attributes
3658 * would be preferred because of the xid ordering.)
3659 */
3660 if (!np->n_wrbusy && !np->n_numoutput) {
2d21ac55 3661 np->n_flag &= ~NMODIFIED;
b0d623f7
A
3662 NATTRINVALIDATE(np);
3663 nfs_get_xid(&np->n_xid);
3664 }
2d21ac55 3665 } else {
b0d623f7 3666 nfs_node_lock_force(np);
0c530ab8
A
3667 }
3668
2d21ac55
A
3669 FSDBG(526, np->n_flag, np->n_error, 0, 0);
3670 if (!ignore_writeerr && (np->n_flag & NWRITEERR)) {
3671 error = np->n_error;
3672 np->n_flag &= ~NWRITEERR;
3673 }
b0d623f7 3674 nfs_node_unlock(np);
2d21ac55
A
3675done:
3676 lck_mtx_lock(nfs_buf_mutex);
3677 flags = np->n_bflag;
0a7de745 3678 np->n_bflag &= ~(NBFLUSHINPROG | NBFLUSHWANT);
2d21ac55 3679 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 3680 if (flags & NBFLUSHWANT) {
2d21ac55 3681 wakeup(&np->n_bflag);
0a7de745 3682 }
2d21ac55
A
3683out:
3684 FSDBG_BOT(517, np, error, ignore_writeerr, 0);
0a7de745 3685 return error;
1c79356b
A
3686}
3687
1c79356b 3688/*
55e303ae
A
3689 * Flush out and invalidate all buffers associated with a vnode.
3690 * Called with the underlying object locked.
1c79356b 3691 */
b0d623f7 3692int
91447636 3693nfs_vinvalbuf_internal(
2d21ac55 3694 nfsnode_t np,
91447636 3695 int flags,
2d21ac55 3696 thread_t thd,
91447636 3697 kauth_cred_t cred,
91447636
A
3698 int slpflag,
3699 int slptimeo)
1c79356b 3700{
55e303ae 3701 struct nfsbuf *bp;
91447636
A
3702 struct nfsbuflists blist;
3703 int list, error = 0;
9bccf70c 3704
55e303ae 3705 if (flags & V_SAVE) {
0a7de745
A
3706 if ((error = nfs_flush(np, MNT_WAIT, thd, (flags & V_IGNORE_WRITEERR)))) {
3707 return error;
3708 }
9bccf70c
A
3709 }
3710
91447636 3711 lck_mtx_lock(nfs_buf_mutex);
55e303ae 3712 for (;;) {
91447636
A
3713 list = NBI_CLEAN;
3714 if (nfs_buf_iterprepare(np, &blist, list)) {
3715 list = NBI_DIRTY;
0a7de745 3716 if (nfs_buf_iterprepare(np, &blist, list)) {
91447636 3717 break;
0a7de745 3718 }
91447636
A
3719 }
3720 while ((bp = LIST_FIRST(&blist))) {
3721 LIST_REMOVE(bp, nb_vnbufs);
0a7de745 3722 if (list == NBI_CLEAN) {
91447636 3723 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
0a7de745 3724 } else {
91447636 3725 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
0a7de745 3726 }
91447636
A
3727 nfs_buf_refget(bp);
3728 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2d21ac55 3729 FSDBG(556, np, bp, NBOFF(bp), bp->nb_flags);
91447636 3730 if (error != EAGAIN) {
2d21ac55 3731 FSDBG(554, np, bp, -1, error);
91447636
A
3732 nfs_buf_refrele(bp);
3733 nfs_buf_itercomplete(np, &blist, list);
3734 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 3735 return error;
55e303ae 3736 }
55e303ae 3737 }
91447636 3738 nfs_buf_refrele(bp);
2d21ac55 3739 FSDBG(554, np, bp, NBOFF(bp), bp->nb_flags);
91447636 3740 lck_mtx_unlock(nfs_buf_mutex);
2d21ac55 3741 if ((flags & V_SAVE) && UBCINFOEXISTS(NFSTOV(np)) && bp->nb_np &&
91447636 3742 (NBOFF(bp) < (off_t)np->n_size)) {
2d21ac55 3743 /* extra paranoia: make sure we're not */
55e303ae
A
3744 /* somehow leaving any dirty data around */
3745 int mustwrite = 0;
91447636
A
3746 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
3747 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
55e303ae
A
3748 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3749 error = nfs_buf_upl_setup(bp);
3750 if (error == EINVAL) {
3751 /* vm object must no longer exist */
3752 /* hopefully we don't need to do */
3753 /* anything for this buffer */
0a7de745 3754 } else if (error) {
91447636 3755 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
0a7de745 3756 }
55e303ae
A
3757 bp->nb_valid = bp->nb_dirty = 0;
3758 }
3759 nfs_buf_upl_check(bp);
3760 /* check for any dirty data before the EOF */
2d21ac55 3761 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
55e303ae 3762 /* clip dirty range to EOF */
2d21ac55 3763 if (bp->nb_dirtyend > end) {
55e303ae 3764 bp->nb_dirtyend = end;
0a7de745 3765 if (bp->nb_dirtyoff >= bp->nb_dirtyend) {
2d21ac55 3766 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
0a7de745 3767 }
2d21ac55 3768 }
0a7de745 3769 if ((bp->nb_dirtyend > 0) && (bp->nb_dirtyoff < end)) {
2d21ac55 3770 mustwrite++;
0a7de745 3771 }
55e303ae 3772 }
0a7de745
A
3773 bp->nb_dirty &= (1 << (round_page_32(end) / PAGE_SIZE)) - 1;
3774 if (bp->nb_dirty) {
2d21ac55 3775 mustwrite++;
0a7de745 3776 }
91447636 3777 /* also make sure we'll have a credential to do the write */
0c530ab8 3778 if (mustwrite && !IS_VALID_CRED(bp->nb_wcred) && !IS_VALID_CRED(cred)) {
91447636
A
3779 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
3780 mustwrite = 0;
3781 }
55e303ae 3782 if (mustwrite) {
2d21ac55 3783 FSDBG(554, np, bp, 0xd00dee, bp->nb_flags);
0a7de745 3784 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
55e303ae 3785 panic("nfs_vinvalbuf: dirty buffer without upl");
0a7de745 3786 }
55e303ae
A
3787 /* gotta write out dirty data before invalidating */
3788 /* (NB_STABLE indicates that data writes should be FILESYNC) */
3789 /* (NB_NOCACHE indicates buffer should be discarded) */
3790 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
3791 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
0c530ab8 3792 if (!IS_VALID_CRED(bp->nb_wcred)) {
91447636
A
3793 kauth_cred_ref(cred);
3794 bp->nb_wcred = cred;
3795 }
55e303ae
A
3796 error = nfs_buf_write(bp);
3797 // Note: bp has been released
3798 if (error) {
3799 FSDBG(554, bp, 0xd00dee, 0xbad, error);
b0d623f7 3800 nfs_node_lock_force(np);
6d2010ae
A
3801 if ((error != EINTR) && (error != ERESTART)) {
3802 np->n_error = error;
3803 np->n_flag |= NWRITEERR;
3804 }
91447636
A
3805 /*
3806 * There was a write error and we need to
3807 * invalidate attrs to sync with server.
3808 * (if this write was extending the file,
3809 * we may no longer know the correct size)
3810 */
3811 NATTRINVALIDATE(np);
b0d623f7 3812 nfs_node_unlock(np);
6d2010ae 3813 if ((error == EINTR) || (error == ERESTART)) {
b0d623f7
A
3814 /*
3815 * Abort on EINTR. If we don't, we could
3816 * be stuck in this loop forever because
3817 * the buffer will continue to stay dirty.
3818 */
3819 lck_mtx_lock(nfs_buf_mutex);
3820 nfs_buf_itercomplete(np, &blist, list);
3821 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 3822 return error;
b0d623f7 3823 }
55e303ae
A
3824 error = 0;
3825 }
91447636
A
3826 lck_mtx_lock(nfs_buf_mutex);
3827 continue;
55e303ae
A
3828 }
3829 }
3830 SET(bp->nb_flags, NB_INVAL);
91447636 3831 // hold off on FREEUPs until we're done here
483a1d10 3832 nfs_buf_release(bp, 0);
91447636 3833 lck_mtx_lock(nfs_buf_mutex);
55e303ae 3834 }
91447636 3835 nfs_buf_itercomplete(np, &blist, list);
55e303ae 3836 }
0a7de745 3837 if (!LIST_EMPTY(&(np)->n_dirtyblkhd) || !LIST_EMPTY(&(np)->n_cleanblkhd)) {
2d21ac55 3838 panic("nfs_vinvalbuf: flush/inval failed");
0a7de745 3839 }
91447636 3840 lck_mtx_unlock(nfs_buf_mutex);
b0d623f7 3841 nfs_node_lock_force(np);
0a7de745 3842 if (!(flags & V_SAVE)) {
2d21ac55 3843 np->n_flag &= ~NMODIFIED;
0a7de745
A
3844 }
3845 if (vnode_vtype(NFSTOV(np)) == VREG) {
b0d623f7 3846 np->n_lastrahead = -1;
0a7de745 3847 }
b0d623f7 3848 nfs_node_unlock(np);
483a1d10 3849 NFS_BUF_FREEUP();
0a7de745 3850 return 0;
1c79356b
A
3851}
3852
55e303ae 3853
1c79356b
A
3854/*
3855 * Flush and invalidate all dirty buffers. If another process is already
3856 * doing the flush, just wait for completion.
3857 */
3858int
2d21ac55
A
3859nfs_vinvalbuf(vnode_t vp, int flags, vfs_context_t ctx, int intrflg)
3860{
3861 return nfs_vinvalbuf2(vp, flags, vfs_context_thread(ctx), vfs_context_ucred(ctx), intrflg);
3862}
3863
3864int
3865nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrflg)
1c79356b 3866{
2d21ac55
A
3867 nfsnode_t np = VTONFS(vp);
3868 struct nfsmount *nmp = VTONMP(vp);
6d2010ae 3869 int error, slpflag, slptimeo, nflags, retry = 0;
fe8ab488 3870 int ubcflags = UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE;
cb323159 3871 struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
91447636 3872 off_t size;
1c79356b 3873
2d21ac55 3874 FSDBG_TOP(554, np, flags, intrflg, 0);
55e303ae 3875
fe8ab488
A
3876 /*
3877 * If the mount is gone no sense to try and write anything.
3878 * and hang trying to do IO.
3879 */
3880 if (nfs_mount_gone(nmp)) {
3881 flags &= ~V_SAVE;
3882 ubcflags &= ~UBC_PUSHALL;
3883 }
0a7de745
A
3884
3885 if (nmp && !NMFLAG(nmp, INTR)) {
1c79356b 3886 intrflg = 0;
0a7de745 3887 }
1c79356b
A
3888 if (intrflg) {
3889 slpflag = PCATCH;
3890 slptimeo = 2 * hz;
3891 } else {
3892 slpflag = 0;
3893 slptimeo = 0;
3894 }
1c79356b 3895
2d21ac55
A
3896 /* First wait for any other process doing a flush to complete. */
3897 lck_mtx_lock(nfs_buf_mutex);
3898 while (np->n_bflag & NBINVALINPROG) {
3899 np->n_bflag |= NBINVALWANT;
6d2010ae 3900 msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts);
b0d623f7 3901 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
2d21ac55 3902 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 3903 return error;
1c79356b 3904 }
0a7de745 3905 if (np->n_bflag & NBINVALINPROG) {
6d2010ae 3906 slpflag = 0;
0a7de745 3907 }
1c79356b 3908 }
2d21ac55
A
3909 np->n_bflag |= NBINVALINPROG;
3910 lck_mtx_unlock(nfs_buf_mutex);
3911
3912 /* Now, flush as required. */
6d2010ae 3913again:
2d21ac55
A
3914 error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0);
3915 while (error) {
3916 FSDBG(554, np, 0, 0, error);
0a7de745 3917 if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) {
2d21ac55 3918 goto done;
0a7de745 3919 }
2d21ac55 3920 error = nfs_vinvalbuf_internal(np, flags, thd, cred, 0, slptimeo);
1c79356b 3921 }
2d21ac55
A
3922
3923 /* get the pages out of vm also */
0a7de745 3924 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
fe8ab488 3925 if ((error = ubc_msync(vp, 0, size, NULL, ubcflags))) {
0a7de745 3926 if (error == EINVAL) {
6d2010ae 3927 panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error);
0a7de745 3928 }
fe8ab488 3929 if (retry++ < 10) { /* retry invalidating a few times */
0a7de745 3930 if (retry > 1 || error == ENXIO) {
fe8ab488 3931 ubcflags &= ~UBC_PUSHALL;
0a7de745 3932 }
6d2010ae 3933 goto again;
fe8ab488 3934 }
6d2010ae 3935 /* give up */
fe8ab488 3936 printf("nfs_vinvalbuf(): ubc_msync failed!, error %d\n", error);
6d2010ae 3937 }
0a7de745 3938 }
2d21ac55
A
3939done:
3940 lck_mtx_lock(nfs_buf_mutex);
3941 nflags = np->n_bflag;
0a7de745 3942 np->n_bflag &= ~(NBINVALINPROG | NBINVALWANT);
2d21ac55 3943 lck_mtx_unlock(nfs_buf_mutex);
0a7de745 3944 if (nflags & NBINVALWANT) {
2d21ac55 3945 wakeup(&np->n_bflag);
0a7de745 3946 }
91447636 3947
2d21ac55 3948 FSDBG_BOT(554, np, flags, intrflg, error);
0a7de745 3949 return error;
1c79356b
A
3950}
3951
6d2010ae
A
3952/*
3953 * Wait for any busy buffers to complete.
3954 */
3955void
3956nfs_wait_bufs(nfsnode_t np)
3957{
3958 struct nfsbuf *bp;
3959 struct nfsbuflists blist;
3960 int error = 0;
3961
3962 lck_mtx_lock(nfs_buf_mutex);
3963 if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) {
3964 while ((bp = LIST_FIRST(&blist))) {
3965 LIST_REMOVE(bp, nb_vnbufs);
3966 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
3967 nfs_buf_refget(bp);
3968 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3969 if (error != EAGAIN) {
3970 nfs_buf_refrele(bp);
3971 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3972 lck_mtx_unlock(nfs_buf_mutex);
3973 return;
3974 }
3975 }
3976 nfs_buf_refrele(bp);
3977 nfs_buf_drop(bp);
3978 }
3979 nfs_buf_itercomplete(np, &blist, NBI_CLEAN);
3980 }
3981 if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
3982 while ((bp = LIST_FIRST(&blist))) {
3983 LIST_REMOVE(bp, nb_vnbufs);
3984 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3985 nfs_buf_refget(bp);
3986 while ((error = nfs_buf_acquire(bp, 0, 0, 0))) {
3987 if (error != EAGAIN) {
3988 nfs_buf_refrele(bp);
3989 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3990 lck_mtx_unlock(nfs_buf_mutex);
3991 return;
3992 }
3993 }
3994 nfs_buf_refrele(bp);
3995 nfs_buf_drop(bp);
3996 }
3997 nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
3998 }
3999 lck_mtx_unlock(nfs_buf_mutex);
4000}
4001
4002
1c79356b 4003/*
2d21ac55
A
4004 * Add an async I/O request to the mount's async I/O queue and make
4005 * sure that an nfsiod will service it.
1c79356b 4006 */
2d21ac55
A
4007void
4008nfs_asyncio_finish(struct nfsreq *req)
1c79356b
A
4009{
4010 struct nfsmount *nmp;
2d21ac55
A
4011 struct nfsiod *niod;
4012 int started = 0;
1c79356b 4013
2d21ac55 4014 FSDBG_TOP(552, nmp, 0, 0, 0);
1c79356b 4015again:
fe8ab488
A
4016 nmp = req->r_nmp;
4017
0a7de745 4018 if (nmp == NULL) {
2d21ac55 4019 return;
0a7de745 4020 }
fe8ab488 4021
2d21ac55
A
4022 lck_mtx_lock(nfsiod_mutex);
4023 niod = nmp->nm_niod;
4024
4025 /* grab an nfsiod if we don't have one already */
4026 if (!niod) {
4027 niod = TAILQ_FIRST(&nfsiodfree);
4028 if (niod) {
4029 TAILQ_REMOVE(&nfsiodfree, niod, niod_link);
4030 TAILQ_INSERT_TAIL(&nfsiodwork, niod, niod_link);
4031 niod->niod_nmp = nmp;
4032 } else if (((nfsiod_thread_count < NFSIOD_MAX) || (nfsiod_thread_count <= 0)) && (started < 4)) {
1c79356b 4033 /*
2d21ac55
A
4034 * Try starting a new thread.
4035 * We may try a couple times if other callers
4036 * get the new threads before we do.
1c79356b 4037 */
2d21ac55
A
4038 lck_mtx_unlock(nfsiod_mutex);
4039 started++;
0a7de745 4040 if (!nfsiod_start()) {
2d21ac55 4041 goto again;
0a7de745 4042 }
2d21ac55 4043 lck_mtx_lock(nfsiod_mutex);
1c79356b 4044 }
91447636 4045 }
55e303ae 4046
39037602
A
4047 /*
4048 * If we got here while being on the resendq we need to get off. This
4049 * happens when the timer fires and errors out requests from nfs_sigintr
4050 * or we receive a reply (UDP case) while being on the resend queue so
4051 * we're just finishing up and are not going to be resent.
4052 */
4053 lck_mtx_lock(&req->r_mtx);
4054 if (req->r_flags & R_RESENDQ) {
4055 lck_mtx_lock(&nmp->nm_lock);
4056 if (req->r_rchain.tqe_next != NFSREQNOLIST) {
4057 NFS_BIO_DBG("Proccessing async request on resendq. Removing");
4058 TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain);
4059 req->r_rchain.tqe_next = NFSREQNOLIST;
4060 assert(req->r_refs > 1);
4061 /* Remove resendq reference */
4062 req->r_refs--;
4063 }
4064 lck_mtx_unlock(&nmp->nm_lock);
4065 req->r_flags &= ~R_RESENDQ;
4066 }
4067 lck_mtx_unlock(&req->r_mtx);
4068
0a7de745 4069 if (req->r_achain.tqe_next == NFSREQNOLIST) {
2d21ac55 4070 TAILQ_INSERT_TAIL(&nmp->nm_iodq, req, r_achain);
0a7de745 4071 }
2d21ac55
A
4072
4073 /* If this mount doesn't already have an nfsiod working on it... */
4074 if (!nmp->nm_niod) {
4075 if (niod) { /* give it the nfsiod we just grabbed */
4076 nmp->nm_niod = niod;
4077 lck_mtx_unlock(nfsiod_mutex);
4078 wakeup(niod);
4079 } else if (nfsiod_thread_count > 0) {
fe8ab488 4080 /* just queue it up on nfsiod mounts queue if needed */
0a7de745 4081 if (nmp->nm_iodlink.tqe_next == NFSNOLIST) {
fe8ab488 4082 TAILQ_INSERT_TAIL(&nfsiodmounts, nmp, nm_iodlink);
0a7de745 4083 }
2d21ac55
A
4084 lck_mtx_unlock(nfsiod_mutex);
4085 } else {
4086 printf("nfs_asyncio(): no nfsiods? %d %d (%d)\n", nfsiod_thread_count, NFSIOD_MAX, started);
4087 lck_mtx_unlock(nfsiod_mutex);
4088 /* we have no other option but to be persistent */
4089 started = 0;
4090 goto again;
1c79356b 4091 }
2d21ac55
A
4092 } else {
4093 lck_mtx_unlock(nfsiod_mutex);
1c79356b
A
4094 }
4095
2d21ac55
A
4096 FSDBG_BOT(552, nmp, 0, 0, 0);
4097}
1c79356b 4098
2d21ac55
A
4099/*
4100 * queue up async I/O request for resend
4101 */
4102void
4103nfs_asyncio_resend(struct nfsreq *req)
4104{
4105 struct nfsmount *nmp = req->r_nmp;
1c79356b 4106
0a7de745 4107 if (nfs_mount_gone(nmp)) {
2d21ac55 4108 return;
0a7de745 4109 }
3e170ce0 4110
cb323159 4111#if CONFIG_NFS_GSS
2d21ac55 4112 nfs_gss_clnt_rpcdone(req);
cb323159 4113#endif
2d21ac55 4114 lck_mtx_lock(&nmp->nm_lock);
b0d623f7 4115 if (!(req->r_flags & R_RESENDQ)) {
2d21ac55
A
4116 TAILQ_INSERT_TAIL(&nmp->nm_resendq, req, r_rchain);
4117 req->r_flags |= R_RESENDQ;
3e170ce0
A
4118 /*
4119 * We take a reference on this request so that it can't be
4120 * destroyed while a resend is queued or in progress.
4121 */
4122 nfs_request_ref(req, 1);
1c79356b 4123 }
2d21ac55
A
4124 nfs_mount_sock_thread_wake(nmp);
4125 lck_mtx_unlock(&nmp->nm_lock);
1c79356b
A
4126}
4127
4128/*
b0d623f7
A
4129 * Read directory data into a buffer.
4130 *
4131 * Buffer will be filled (unless EOF is hit).
4132 * Buffers after this one may also be completely/partially filled.
1c79356b
A
4133 */
4134int
2d21ac55 4135nfs_buf_readdir(struct nfsbuf *bp, vfs_context_t ctx)
1c79356b 4136{
b0d623f7
A
4137 nfsnode_t np = bp->nb_np;
4138 struct nfsmount *nmp = NFSTONMP(np);
4139 int error = 0;
1c79356b 4140
0a7de745
A
4141 if (nfs_mount_gone(nmp)) {
4142 return ENXIO;
4143 }
55e303ae 4144
0a7de745 4145 if (nmp->nm_vers < NFS_VER4) {
b0d623f7 4146 error = nfs3_readdir_rpc(np, bp, ctx);
cb323159
A
4147 }
4148#if CONFIG_NFS4
4149 else {
b0d623f7 4150 error = nfs4_readdir_rpc(np, bp, ctx);
0a7de745 4151 }
cb323159 4152#endif
b0d623f7 4153 if (error && (error != NFSERR_DIRBUFDROPPED)) {
55e303ae
A
4154 SET(bp->nb_flags, NB_ERROR);
4155 bp->nb_error = error;
1c79356b 4156 }
0a7de745 4157 return error;
1c79356b 4158}
ea3f0419
A
4159
4160#endif /* CONFIG_NFS_CLIENT */