/*
 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/mount_internal.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ubc_internal.h>
#include <sys/uio_internal.h>

#include <sys/vm.h>
#include <sys/vmparam.h>

#include <sys/time.h>
#include <kern/clock.h>
#include <libkern/OSAtomic.h>
#include <kern/kalloc.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nfsnode.h>
#include <sys/buf_internal.h>

#include <sys/kdebug.h>

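/* kdebug tracing shims: wrap KERNEL_DEBUG with DBG_FSRW codes for one-shot, start, and end events */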
#define FSDBG(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_TOP(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)
#define FSDBG_BOT(A, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)

extern int nfs_numasync;
extern int nfs_ioddelwri;
extern struct nfsstats nfsstats;

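/* hash an (nfsnode pointer, logical block number) pair to a chain head in the nfsbuf hash table */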
#define NFSBUFHASH(np, lbn) \
	(&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
u_long nfsbufhash;
int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
int nfs_nbdwrite;
time_t nfsbuffreeuptimestamp;

lck_grp_t *nfs_buf_lck_grp;
lck_grp_attr_t *nfs_buf_lck_grp_attr;
lck_attr_t *nfs_buf_lck_attr;
lck_mtx_t *nfs_buf_mutex;

#define NFSBUFWRITE_THROTTLE	9
#define NFSBUF_LRU_STALE	120
#define NFSBUF_META_STALE	240

/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
#define LRU_TO_FREEUP	6
/* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
#define META_TO_FREEUP	3
/* total number of nfsbufs nfs_buf_freeup() should attempt to free */
#define TOTAL_TO_FREEUP	(LRU_TO_FREEUP+META_TO_FREEUP)
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
#define LRU_FREEUP_FRAC_ON_TIMER	8
/* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
#define META_FREEUP_FRAC_ON_TIMER	16
/* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
#define LRU_FREEUP_MIN_FRAC	4
/* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
#define META_FREEUP_MIN_FRAC	2

#define NFS_BUF_FREEUP() \
	do { \
		/* only call nfs_buf_freeup() if it has work to do: */ \
		if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
		     (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
		    ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
			nfs_buf_freeup(0); \
	} while (0)

/*
 * Initialize nfsbuf lists
 */
void
nfs_nbinit(void)
{
	nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init();
	lck_grp_attr_setstat(nfs_buf_lck_grp_attr);
	nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr);

	nfs_buf_lck_attr = lck_attr_alloc_init();

	nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr);

	nfsbufcnt = nfsbufmetacnt =
	nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
	nfsbufmin = 128;
	nfsbufmax = (sane_size >> PAGE_SHIFT) / 4;
	nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16;
	nfsneedbuffer = 0;
	nfs_nbdwrite = 0;
	nfsbuffreeuptimestamp = 0;

	nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
	TAILQ_INIT(&nfsbuffree);
	TAILQ_INIT(&nfsbuffreemeta);
	TAILQ_INIT(&nfsbufdelwri);

}

/*
 * try to free up some excess, unused nfsbufs
 */
void
nfs_buf_freeup(int timer)
{
	struct nfsbuf *fbp;
	struct timeval now;
	int count;
	struct nfsbuffreehead nfsbuffreeup;

	TAILQ_INIT(&nfsbuffreeup);

	lck_mtx_lock(nfs_buf_mutex);

	microuptime(&now);
	nfsbuffreeuptimestamp = now.tv_sec;

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);

	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffree);
		if (!fbp)
			break;
		if (fbp->nb_refs)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any vnode */
		if (fbp->nb_vp) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_vp = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
	}

	count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
	while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
		fbp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!fbp)
			break;
		if (fbp->nb_refs)
			break;
		if (NBUFSTAMPVALID(fbp) &&
		    (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
			break;
		nfs_buf_remfree(fbp);
		/* disassociate buffer from any vnode */
		if (fbp->nb_vp) {
			if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(fbp, nb_vnbufs);
				fbp->nb_vnbufs.le_next = NFSNOLIST;
			}
			fbp->nb_vp = NULL;
		}
		LIST_REMOVE(fbp, nb_hash);
		TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
		nfsbufcnt--;
		nfsbufmetacnt--;
	}

	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
	NFSBUFCNTCHK(1);

	lck_mtx_unlock(nfs_buf_mutex);

	while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
		TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
		/* nuke any creds */
		if (fbp->nb_rcred != NOCRED) {
			kauth_cred_rele(fbp->nb_rcred);
			fbp->nb_rcred = NOCRED;
		}
		if (fbp->nb_wcred != NOCRED) {
			kauth_cred_rele(fbp->nb_wcred);
			fbp->nb_wcred = NOCRED;
		}
		/* if buf was NB_META, dump buffer */
		if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
			kfree(fbp->nb_data, fbp->nb_bufsize);
		FREE(fbp, M_TEMP);
	}

}

/*
 * remove a buffer from the freelist
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_remfree(struct nfsbuf *bp)
{
	if (bp->nb_free.tqe_next == NFSNOLIST)
		panic("nfsbuf not on free list");
	if (ISSET(bp->nb_flags, NB_DELWRI)) {
		nfsbufdelwricnt--;
		TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
	} else if (ISSET(bp->nb_flags, NB_META)) {
		nfsbuffreemetacnt--;
		TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
	} else {
		nfsbuffreecnt--;
		TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
	}
	bp->nb_free.tqe_next = NFSNOLIST;
	NFSBUFCNTCHK(1);
}

/*
 * check for existence of nfsbuf in cache
 */
boolean_t
nfs_buf_is_incore(vnode_t vp, daddr64_t blkno)
{
	boolean_t rv;
	lck_mtx_lock(nfs_buf_mutex);
	if (nfs_buf_incore(vp, blkno))
		rv = TRUE;
	else
		rv = FALSE;
	lck_mtx_unlock(nfs_buf_mutex);
	return (rv);
}

/*
 * return incore buffer (must be called with nfs_buf_mutex held)
 */
struct nfsbuf *
nfs_buf_incore(vnode_t vp, daddr64_t blkno)
{
	/* Search hash chain */
	struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first;
	for (; bp != NULL; bp = bp->nb_hash.le_next)
		if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
			if (!ISSET(bp->nb_flags, NB_INVAL)) {
				FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
				return (bp);
			}
		}
	return (NULL);
}

/*
 * Check if it's OK to drop a page.
 *
 * Called by vnode_pager() on pageout request of non-dirty page.
 * We need to make sure that it's not part of a delayed write.
 * If it is, we can't let the VM drop it because we may need it
 * later when/if we need to write the data (again).
 */
int
nfs_buf_page_inval(vnode_t vp, off_t offset)
{
	struct nfsbuf *bp;
	int error = 0;

	lck_mtx_lock(nfs_buf_mutex);
	bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
	if (!bp)
		goto out;
	FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		error = EBUSY;
		goto out;
	}
	/*
	 * If there's a dirty range in the buffer, check to
	 * see if this page intersects with the dirty range.
	 * If it does, we can't let the pager drop the page.
	 */
	if (bp->nb_dirtyend > 0) {
		int start = offset - NBOFF(bp);
		if (bp->nb_dirtyend <= start ||
		    bp->nb_dirtyoff >= (start + PAGE_SIZE))
			error = 0;
		else
			error = EBUSY;
	}
out:
	lck_mtx_unlock(nfs_buf_mutex);
	return (error);
}

/*
 * set up the UPL for a buffer
 * (must NOT be called with nfs_buf_mutex held)
 */
int
nfs_buf_upl_setup(struct nfsbuf *bp)
{
	kern_return_t kret;
	upl_t upl;
	int upl_flags;

	if (ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);

	upl_flags = UPL_PRECIOUS;
	if (! ISSET(bp->nb_flags, NB_READ)) {
		/*
		 * We're doing a "write", so we intend to modify
		 * the pages we're gathering.
		 */
		upl_flags |= UPL_WILL_MODIFY;
	}
	kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
				&upl, NULL, upl_flags);
	if (kret == KERN_INVALID_ARGUMENT) {
		/* vm object probably doesn't exist any more */
		bp->nb_pagelist = NULL;
		return (EINVAL);
	}
	if (kret != KERN_SUCCESS) {
		printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
		bp->nb_pagelist = NULL;
		return (EIO);
	}

	FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);

	bp->nb_pagelist = upl;
	SET(bp->nb_flags, NB_PAGELIST);
	return (0);
}

/*
 * update buffer's valid/dirty info from UBC
 * (must NOT be called with nfs_buf_mutex held)
 */
void
nfs_buf_upl_check(struct nfsbuf *bp)
{
	upl_page_info_t *pl;
	off_t filesize, fileoffset;
	int i, npages;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return;

	npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
	filesize = ubc_getsize(bp->nb_vp);
	fileoffset = NBOFF(bp);
	if (fileoffset < filesize)
		SET(bp->nb_flags, NB_CACHE);
	else
		CLR(bp->nb_flags, NB_CACHE);

	pl = ubc_upl_pageinfo(bp->nb_pagelist);
	bp->nb_valid = bp->nb_dirty = 0;

	for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
		/* anything beyond the end of the file is not valid or dirty */
		if (fileoffset >= filesize)
			break;
		if (!upl_valid_page(pl, i)) {
			CLR(bp->nb_flags, NB_CACHE);
			continue;
		}
		NBPGVALID_SET(bp,i);
		if (upl_dirty_page(pl, i)) {
			NBPGDIRTY_SET(bp, i);
			if (!ISSET(bp->nb_flags, NB_WASDIRTY))
				SET(bp->nb_flags, NB_WASDIRTY);
		}
	}
	fileoffset = NBOFF(bp);
	if (ISSET(bp->nb_flags, NB_CACHE)) {
		bp->nb_validoff = 0;
		bp->nb_validend = bp->nb_bufsize;
		if (fileoffset + bp->nb_validend > filesize)
			bp->nb_validend = filesize - fileoffset;
	} else {
		bp->nb_validoff = bp->nb_validend = -1;
	}
	FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
	FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
}

/*
 * make sure that a buffer is mapped
 * (must NOT be called with nfs_buf_mutex held)
 */
static int
nfs_buf_map(struct nfsbuf *bp)
{
	kern_return_t kret;

	if (bp->nb_data)
		return (0);
	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (EINVAL);

	kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
	if (kret != KERN_SUCCESS)
		panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
	if (bp->nb_data == 0)
		panic("ubc_upl_map mapped 0");
	FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
	return (0);
}

/*
 * check range of pages in nfsbuf's UPL for validity
 */
static int
nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
{
	off_t fileoffset, filesize;
	int pg, lastpg;
	upl_page_info_t *pl;

	if (!ISSET(bp->nb_flags, NB_PAGELIST))
		return (0);
	pl = ubc_upl_pageinfo(bp->nb_pagelist);

	size += off & PAGE_MASK;
	off &= ~PAGE_MASK;
	fileoffset = NBOFF(bp);
	filesize = VTONFS(bp->nb_vp)->n_size;
	if ((fileoffset + off + size) > filesize)
		size = filesize - (fileoffset + off);

	pg = off/PAGE_SIZE;
	lastpg = (off + size - 1)/PAGE_SIZE;
	while (pg <= lastpg) {
		if (!upl_valid_page(pl, pg))
			return (0);
		pg++;
	}
	return (1);
}

/*
 * normalize an nfsbuf's valid range
 *
 * the read/write code guarantees that we'll always have a valid
 * region that is an integral number of pages.  If either end
 * of the valid range isn't page-aligned, it gets corrected
 * here as we extend the valid range through all of the
 * contiguous valid pages.
 */
static void
nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
{
	int pg, npg;
	/* pull validoff back to start of contiguous valid page range */
	pg = bp->nb_validoff/PAGE_SIZE;
	while (pg >= 0 && NBPGVALID(bp,pg))
		pg--;
	bp->nb_validoff = (pg+1) * PAGE_SIZE;
	/* push validend forward to end of contiguous valid page range */
	npg = bp->nb_bufsize/PAGE_SIZE;
	pg = bp->nb_validend/PAGE_SIZE;
	while (pg < npg && NBPGVALID(bp,pg))
		pg++;
	bp->nb_validend = pg * PAGE_SIZE;
	/* clip to EOF */
	if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
		bp->nb_validend = np->n_size % bp->nb_bufsize;
}

/*
 * try to push out some delayed/uncommitted writes
 * ("locked" indicates whether nfs_buf_mutex is already held)
 */
static void
nfs_buf_delwri_push(int locked)
{
	struct nfsbuf *bp;
	int i, error;

	if (TAILQ_EMPTY(&nfsbufdelwri))
		return;

	/* first try to tell the nfsiods to do it */
	if (nfs_asyncio(NULL, NULL) == 0)
		return;

	/* otherwise, try to do some of the work ourselves */
	i = 0;
	if (!locked)
		lck_mtx_lock(nfs_buf_mutex);
	while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
		struct nfsnode *np = VTONFS(bp->nb_vp);
		nfs_buf_remfree(bp);
		nfs_buf_refget(bp);
		while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
		nfs_buf_refrele(bp);
		if (error)
			break;
		if (!bp->nb_vp) {
			/* buffer is no longer valid */
			nfs_buf_drop(bp);
			continue;
		}
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
			nfs_buf_check_write_verifier(np, bp);
		if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
			/* put buffer at end of delwri list */
			TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
			nfsbufdelwricnt++;
			nfs_buf_drop(bp);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_flushcommits(np->n_vnode, NULL, 1);
		} else {
			SET(bp->nb_flags, NB_ASYNC);
			lck_mtx_unlock(nfs_buf_mutex);
			nfs_buf_write(bp);
		}
		i++;
		lck_mtx_lock(nfs_buf_mutex);
	}
	if (!locked)
		lck_mtx_unlock(nfs_buf_mutex);
}

/*
 * Get an nfs buffer.
 *
 * Returns errno on error, 0 otherwise.
 * Any buffer is returned in *bpp.
 *
 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
 *
 * Check for existence of buffer in cache.
 * Or attempt to reuse a buffer from one of the free lists.
 * Or allocate a new buffer if we haven't already hit max allocation.
 * Or wait for a free buffer.
 *
 * If available buffer found, prepare it, and return it.
 *
 * If the calling process is interrupted by a signal for
 * an interruptible mount point, return EINTR.
 */
int
nfs_buf_get(
	vnode_t vp,
	daddr64_t blkno,
	int size,
	proc_t p,
	int flags,
	struct nfsbuf **bpp)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsbuf *bp;
	int biosize, bufsize;
	kauth_cred_t cred;
	int slpflag = PCATCH;
	int operation = (flags & NBLK_OPMASK);
	int error = 0;
	struct timespec ts;

	FSDBG_TOP(541, vp, blkno, size, flags);
	*bpp = NULL;

	bufsize = size;
	if (bufsize > MAXBSIZE)
		panic("nfs_buf_get: buffer larger than MAXBSIZE requested");

	biosize = vfs_statfs(vnode_mount(vp))->f_iosize;

	if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) {
		operation = NBLK_META;
	} else if (bufsize < biosize) {
		/* reg files should always have biosize blocks */
		bufsize = biosize;
	}

	/* if NBLK_WRITE, check for too many delayed/uncommitted writes */
	if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
		FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));

		/* poke the delwri list */
		nfs_buf_delwri_push(0);

		/* sleep to let other threads run... */
		tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
		FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
	}

loop:
	lck_mtx_lock(nfs_buf_mutex);

	/* check for existence of nfsbuf in cache */
	if ((bp = nfs_buf_incore(vp, blkno))) {
		/* if busy, set wanted and wait */
		if (ISSET(bp->nb_lflags, NBL_BUSY)) {
			if (flags & NBLK_NOWAIT) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc);
				return (0);
			}
			FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
			SET(bp->nb_lflags, NBL_WANTED);

			ts.tv_sec = 2;
			ts.tv_nsec = 0;
			msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
				"nfsbufget", (slpflag == PCATCH) ? 0 : &ts);
			slpflag = 0;
			FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
			if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
				FSDBG_BOT(541, vp, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
		if (bp->nb_bufsize != bufsize)
			panic("nfsbuf size mismatch");
		SET(bp->nb_lflags, NBL_BUSY);
		SET(bp->nb_flags, NB_CACHE);
		nfs_buf_remfree(bp);
		/* additional paranoia: */
		if (ISSET(bp->nb_flags, NB_PAGELIST))
			panic("pagelist buffer was not busy");
		goto buffer_setup;
	}

	if (flags & NBLK_ONLYVALID) {
		lck_mtx_unlock(nfs_buf_mutex);
		FSDBG_BOT(541, vp, blkno, 0, 0x0000cace);
		return (0);
	}

	/*
	 * where to get a free buffer:
	 * - if meta and maxmeta reached, must reuse meta
	 * - alloc new if we haven't reached min bufs
	 * - if free lists are NOT empty
	 *   - if free list is stale, use it
	 *   - else if freemeta list is stale, use it
	 *   - else if max bufs allocated, use least-time-to-stale
	 * - alloc new if we haven't reached max allowed
	 * - start clearing out delwri list and try again
	 */

	if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
		/* if we've hit max meta buffers, must reuse a meta buffer */
		bp = TAILQ_FIRST(&nfsbuffreemeta);
	} else if ((nfsbufcnt > nfsbufmin) &&
	    (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
		/* try to pull an nfsbuf off a free list */
		struct nfsbuf *lrubp, *metabp;
		struct timeval now;
		microuptime(&now);

		/* if the next LRU or META buffer is invalid or stale, use it */
		lrubp = TAILQ_FIRST(&nfsbuffree);
		if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
		    ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
			bp = lrubp;
		metabp = TAILQ_FIRST(&nfsbuffreemeta);
		if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
		    ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
			bp = metabp;

		if (!bp && (nfsbufcnt >= nfsbufmax)) {
			/* we've already allocated all bufs, so */
			/* choose the buffer that'll go stale first */
			if (!metabp)
				bp = lrubp;
			else if (!lrubp)
				bp = metabp;
			else {
				int32_t lru_stale_time, meta_stale_time;
				lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
				meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
				if (lru_stale_time <= meta_stale_time)
					bp = lrubp;
				else
					bp = metabp;
			}
		}
	}

	if (bp) {
		/* we have a buffer to reuse */
		FSDBG(544, vp, blkno, bp, bp->nb_flags);
		nfs_buf_remfree(bp);
		if (ISSET(bp->nb_flags, NB_DELWRI))
			panic("nfs_buf_get: delwri");
		SET(bp->nb_lflags, NBL_BUSY);
		/* disassociate buffer from previous vnode */
		if (bp->nb_vp) {
			if (bp->nb_vnbufs.le_next != NFSNOLIST) {
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
			}
			bp->nb_vp = NULL;
		}
		LIST_REMOVE(bp, nb_hash);
		/* nuke any creds we're holding */
		cred = bp->nb_rcred;
		if (cred != NOCRED) {
			bp->nb_rcred = NOCRED;
			kauth_cred_rele(cred);
		}
		cred = bp->nb_wcred;
		if (cred != NOCRED) {
			bp->nb_wcred = NOCRED;
			kauth_cred_rele(cred);
		}
		/* if buf will no longer be NB_META, dump old buffer */
		if (operation == NBLK_META) {
			if (!ISSET(bp->nb_flags, NB_META))
				nfsbufmetacnt++;
		} else if (ISSET(bp->nb_flags, NB_META)) {
			if (bp->nb_data) {
				kfree(bp->nb_data, bp->nb_bufsize);
				bp->nb_data = NULL;
			}
			nfsbufmetacnt--;
		}
		/* re-init buf fields */
		bp->nb_error = 0;
		bp->nb_validoff = bp->nb_validend = -1;
		bp->nb_dirtyoff = bp->nb_dirtyend = 0;
		bp->nb_valid = 0;
		bp->nb_dirty = 0;
		bp->nb_verf = 0;
	} else {
		/* no buffer to reuse */
		if ((nfsbufcnt < nfsbufmax) &&
		    ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
			/* just alloc a new one */
			MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
			if (!bp) {
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, vp, blkno, 0, error);
				return (ENOMEM);
			}
			nfsbufcnt++;
			if (operation == NBLK_META)
				nfsbufmetacnt++;
			NFSBUFCNTCHK(1);
			/* init nfsbuf */
			bzero(bp, sizeof(*bp));
			bp->nb_free.tqe_next = NFSNOLIST;
			bp->nb_validoff = bp->nb_validend = -1;
			FSDBG(545, vp, blkno, bp, 0);
		} else {
			/* too many bufs... wait for buffers to free up */
			FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);

			/* poke the delwri list */
			nfs_buf_delwri_push(1);

			nfsneedbuffer = 1;
			msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP,
				"nfsbufget", 0);
			FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
			if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
				FSDBG_BOT(541, vp, blkno, 0, error);
				return (error);
			}
			goto loop;
		}
	}

	/* setup nfsbuf */
	bp->nb_lflags = NBL_BUSY;
	bp->nb_flags = 0;
	bp->nb_lblkno = blkno;
	/* insert buf in hash */
	LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
	/* associate buffer with new vnode */
	bp->nb_vp = vp;
	LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);

buffer_setup:

	/* unlock hash */
	lck_mtx_unlock(nfs_buf_mutex);

	switch (operation) {
	case NBLK_META:
		SET(bp->nb_flags, NB_META);
		if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
			kfree(bp->nb_data, bp->nb_bufsize);
			bp->nb_data = NULL;
			bp->nb_validoff = bp->nb_validend = -1;
			bp->nb_dirtyoff = bp->nb_dirtyend = 0;
			bp->nb_valid = 0;
			bp->nb_dirty = 0;
			CLR(bp->nb_flags, NB_CACHE);
		}
		if (!bp->nb_data)
			bp->nb_data = kalloc(bufsize);
		if (!bp->nb_data) {
			/* Ack! couldn't allocate the data buffer! */
			/* cleanup buffer and return error */
			lck_mtx_lock(nfs_buf_mutex);
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
			bp->nb_vp = NULL;
			/* invalidate usage timestamp to allow immediate freeing */
			NBUFSTAMPINVALIDATE(bp);
			if (bp->nb_free.tqe_next != NFSNOLIST)
				panic("nfsbuf on freelist");
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
			lck_mtx_unlock(nfs_buf_mutex);
			FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM);
			return (ENOMEM);
		}
		bp->nb_bufsize = bufsize;
		break;

	case NBLK_READ:
	case NBLK_WRITE:
		/*
		 * Set or clear NB_READ now to let the UPL subsystem know
		 * if we intend to modify the pages or not.
		 */
		if (operation == NBLK_READ) {
			SET(bp->nb_flags, NB_READ);
		} else {
			CLR(bp->nb_flags, NB_READ);
		}
		if (bufsize < PAGE_SIZE)
			bufsize = PAGE_SIZE;
		bp->nb_bufsize = bufsize;
		bp->nb_validoff = bp->nb_validend = -1;

		if (UBCINFOEXISTS(vp)) {
			/* setup upl */
			if (nfs_buf_upl_setup(bp)) {
				/* unable to create upl */
				/* vm object must no longer exist */
				/* cleanup buffer and return error */
				lck_mtx_lock(nfs_buf_mutex);
				LIST_REMOVE(bp, nb_vnbufs);
				bp->nb_vnbufs.le_next = NFSNOLIST;
				bp->nb_vp = NULL;
				/* invalidate usage timestamp to allow immediate freeing */
				NBUFSTAMPINVALIDATE(bp);
				if (bp->nb_free.tqe_next != NFSNOLIST)
					panic("nfsbuf on freelist");
				TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
				nfsbuffreecnt++;
				lck_mtx_unlock(nfs_buf_mutex);
				FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
				return (EIO);
			}
			nfs_buf_upl_check(bp);
		}
		break;

	default:
		panic("nfs_buf_get: %d unknown operation", operation);
	}

	*bpp = bp;

	FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);

	return (0);
}

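/*
 * Release a buffer back to the pool:
 * commit or abort its UPL pages as appropriate, requeue it on the
 * free, freemeta, or delwri list, and wake up any waiters.
 */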
void
nfs_buf_release(struct nfsbuf *bp, int freeup)
{
	vnode_t vp = bp->nb_vp;
	struct timeval now;
	int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;

	FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
	FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
	FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);

	if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
		int upl_flags;
		upl_t upl;
		int i, rv;

		if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
			rv = nfs_buf_upl_setup(bp);
			if (rv)
				printf("nfs_buf_release: upl create failed %d\n", rv);
			else
				nfs_buf_upl_check(bp);
		}
		upl = bp->nb_pagelist;
		if (!upl)
			goto pagelist_cleanup_done;
		if (bp->nb_data) {
			if (ubc_upl_unmap(upl) != KERN_SUCCESS)
				panic("ubc_upl_unmap failed");
			bp->nb_data = NULL;
		}
		if (bp->nb_flags & (NB_ERROR | NB_INVAL | NB_NOCACHE)) {
			if (bp->nb_flags & (NB_READ | NB_INVAL | NB_NOCACHE))
				upl_flags = UPL_ABORT_DUMP_PAGES;
			else
				upl_flags = 0;
			ubc_upl_abort(upl, upl_flags);
			goto pagelist_cleanup_done;
		}
		for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
			if (!NBPGVALID(bp,i))
				ubc_upl_abort_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					UPL_ABORT_DUMP_PAGES |
					UPL_ABORT_FREE_ON_EMPTY);
			else {
				if (NBPGDIRTY(bp,i))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				ubc_upl_commit_range(upl,
					i*PAGE_SIZE, PAGE_SIZE,
					upl_flags |
					UPL_COMMIT_INACTIVATE |
					UPL_COMMIT_FREE_ON_EMPTY);
			}
		}
pagelist_cleanup_done:
		/* was this the last buffer in the file? */
		if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) {
			/* if so, invalidate all pages of last buffer past EOF */
			int biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
			off_t start, end;
			start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
			end = trunc_page_64(NBOFF(bp) + biosize);
			if (end > start) {
				if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
					printf("nfs_buf_release(): ubc_sync_range failed!\n");
			}
		}
		CLR(bp->nb_flags, NB_PAGELIST);
		bp->nb_pagelist = NULL;
	}

	lck_mtx_lock(nfs_buf_mutex);

	wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;

	/* Wake up any processes waiting for any buffer to become free. */
	if (nfsneedbuffer) {
		nfsneedbuffer = 0;
		wakeup_needbuffer = 1;
	}
	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		CLR(bp->nb_lflags, NBL_WANTED);
		wakeup_buffer = 1;
	}

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
		SET(bp->nb_flags, NB_INVAL);

	if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
		/* If it's invalid or empty, dissociate it from its vnode */
		if (bp->nb_vnbufs.le_next != NFSNOLIST) {
			LIST_REMOVE(bp, nb_vnbufs);
			bp->nb_vnbufs.le_next = NFSNOLIST;
		}
		bp->nb_vp = NULL;
		/* if this was a delayed write, wakeup anyone */
		/* waiting for delayed writes to complete */
		if (ISSET(bp->nb_flags, NB_DELWRI)) {
			CLR(bp->nb_flags, NB_DELWRI);
			OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite);
			NFSBUFCNTCHK(1);
			wakeup_nbdwrite = 1;
		}
		/* invalidate usage timestamp to allow immediate freeing */
		NBUFSTAMPINVALIDATE(bp);
		/* put buffer at head of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		SET(bp->nb_flags, NB_INVAL);
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	} else if (ISSET(bp->nb_flags, NB_DELWRI)) {
		/* put buffer at end of delwri list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
		nfsbufdelwricnt++;
		freeup = 0;
	} else {
		/* update usage timestamp */
		microuptime(&now);
		bp->nb_timestamp = now.tv_sec;
		/* put buffer at end of free list */
		if (bp->nb_free.tqe_next != NFSNOLIST)
			panic("nfsbuf on freelist");
		if (ISSET(bp->nb_flags, NB_META)) {
			TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
			nfsbuffreemetacnt++;
		} else {
			TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
			nfsbuffreecnt++;
		}
	}

	NFSBUFCNTCHK(1);

	/* Unlock the buffer. */
	CLR(bp->nb_flags, (NB_ASYNC | NB_NOCACHE | NB_STABLE | NB_IOD));
	CLR(bp->nb_lflags, NBL_BUSY);

	FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);

	lck_mtx_unlock(nfs_buf_mutex);

	if (wakeup_needbuffer)
		wakeup(&nfsneedbuffer);
	if (wakeup_buffer)
		wakeup(bp);
	if (wakeup_nbdwrite)
		wakeup(&nfs_nbdwrite);
	if (freeup)
		NFS_BUF_FREEUP();
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
nfs_buf_iowait(struct nfsbuf *bp)
{
	FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	lck_mtx_lock(nfs_buf_mutex);

	while (!ISSET(bp->nb_flags, NB_DONE))
		msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0);

	lck_mtx_unlock(nfs_buf_mutex);

	FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	/* check for interruption of I/O, then errors. */
	if (ISSET(bp->nb_flags, NB_EINTR)) {
		CLR(bp->nb_flags, NB_EINTR);
		return (EINTR);
	} else if (ISSET(bp->nb_flags, NB_ERROR))
		return (bp->nb_error ? bp->nb_error : EIO);
	return (0);
}

/*
 * Mark I/O complete on a buffer.
 */
void
nfs_buf_iodone(struct nfsbuf *bp)
{

	FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);

	if (ISSET(bp->nb_flags, NB_DONE))
		panic("nfs_buf_iodone already");
	/*
	 * I/O was done, so don't believe
	 * the DIRTY state from VM anymore
	 */
	CLR(bp->nb_flags, NB_WASDIRTY);

	if (!ISSET(bp->nb_flags, NB_READ)) {
		CLR(bp->nb_flags, NB_WRITEINPROG);
		/*
		 * vnode_writedone() takes care of waking up
		 * any throttled write operations
		 */
		vnode_writedone(bp->nb_vp);
	}
	if (ISSET(bp->nb_flags, NB_ASYNC)) {	/* if async, release it */
		SET(bp->nb_flags, NB_DONE);	/* note that it's done */
		nfs_buf_release(bp, 1);
	} else {				/* or just wakeup the buffer */
		lck_mtx_lock(nfs_buf_mutex);
		SET(bp->nb_flags, NB_DONE);	/* note that it's done */
		CLR(bp->nb_lflags, NBL_WANTED);
		lck_mtx_unlock(nfs_buf_mutex);
		wakeup(bp);
	}

	FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
}

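/*
 * Mark a buffer as a delayed write: charge the process for the write,
 * move the buffer to the vnode's dirty list, and either finish
 * immediately or fall back to an async write if there are already
 * too many delayed writes outstanding.
 */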
void
nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p)
{
	vnode_t vp = bp->nb_vp;

	FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
	FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 */
	if (!ISSET(bp->nb_flags, NB_DELWRI)) {
		SET(bp->nb_flags, NB_DELWRI);
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;		/* XXX */
		OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
		NFSBUFCNTCHK(0);
		/* move to dirty list */
		lck_mtx_lock(nfs_buf_mutex);
		if (bp->nb_vnbufs.le_next != NFSNOLIST)
			LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
		lck_mtx_unlock(nfs_buf_mutex);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");

	/*
	 * If we have too many delayed write buffers,
	 * more than we can "safely" handle, just fall back to
	 * doing the async write
	 */
	if (nfs_nbdwrite < 0)
		panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");

	if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
		/* issue async write */
		SET(bp->nb_flags, NB_ASYNC);
		nfs_buf_write(bp);
		FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->nb_flags, NB_DONE);
	nfs_buf_release(bp, 1);
	FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
	return;
}

/*
 * Check that a "needcommit" buffer can still be committed.
 * If the write verifier has changed, we need to clear the
 * needcommit flag.
 */
void
nfs_buf_check_write_verifier(struct nfsnode *np, struct nfsbuf *bp)
{
	struct nfsmount *nmp;

	if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
		return;

	nmp = VFSTONFS(vnode_mount(NFSTOV(np)));
	if (!nmp || (bp->nb_verf == nmp->nm_verf))
		return;

	/* write verifier changed, clear commit flag */
	bp->nb_flags &= ~NB_NEEDCOMMIT;
	np->n_needcommitcnt--;
	CHECK_NEEDCOMMITCNT(np);
}

/*
 * add a reference to a buffer so it doesn't disappear while being used
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refget(struct nfsbuf *bp)
{
	bp->nb_refs++;
}
/*
 * release a reference on a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_refrele(struct nfsbuf *bp)
{
	bp->nb_refs--;
}

/*
 * mark a particular buffer as BUSY
 * (must be called with nfs_buf_mutex held)
 */
errno_t
nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
{
	errno_t error;
	struct timespec ts;

	if (ISSET(bp->nb_lflags, NBL_BUSY)) {
		/*
		 * since the mutex_lock may block, the buffer
		 * may become BUSY, so we need to recheck for
		 * a NOWAIT request
		 */
		if (flags & NBAC_NOWAIT)
			return (EBUSY);
		SET(bp->nb_lflags, NBL_WANTED);

		ts.tv_sec = (slptimeo/100);
		/* the hz value is 100; which leads to 10ms */
		ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;

		error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
			"nfs_buf_acquire", &ts);
		if (error)
			return (error);
		return (EAGAIN);
	}
	if (flags & NBAC_REMOVE)
		nfs_buf_remfree(bp);
	SET(bp->nb_lflags, NBL_BUSY);

	return (0);
}

/*
 * simply drop the BUSY status of a buffer
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_drop(struct nfsbuf *bp)
{
	int need_wakeup = 0;

	if (!ISSET(bp->nb_lflags, NBL_BUSY))
		panic("nfs_buf_drop: buffer not busy!");
	if (ISSET(bp->nb_lflags, NBL_WANTED)) {
		/*
		 * delay the actual wakeup until after we
		 * clear NBL_BUSY and we've dropped nfs_buf_mutex
		 */
		need_wakeup = 1;
	}
	/* Unlock the buffer. */
	CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));

	if (need_wakeup)
		wakeup(bp);
}

/*
 * prepare for iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
int
nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists *listheadp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
		LIST_INIT(iterheadp);
		return(EWOULDBLOCK);
	}

	while (np->n_bufiterflags & NBI_ITER)	{
		np->n_bufiterflags |= NBI_ITERWANT;
		msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0);
	}
	if (LIST_EMPTY(listheadp)) {
		LIST_INIT(iterheadp);
		return(EINVAL);
	}
	np->n_bufiterflags |= NBI_ITER;

	iterheadp->lh_first = listheadp->lh_first;
	listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
	LIST_INIT(listheadp);

	return(0);
}

/*
 * cleanup after iterating over an nfsnode's buffer list
 * this lock protects the queue manipulation
 * (must be called with nfs_buf_mutex held)
 */
void
nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
{
	struct nfsbuflists * listheadp;
	struct nfsbuf *bp;

	if (flags & NBI_DIRTY)
		listheadp = &np->n_dirtyblkhd;
	else
		listheadp = &np->n_cleanblkhd;

	while (!LIST_EMPTY(iterheadp)) {
		bp = LIST_FIRST(iterheadp);
		LIST_REMOVE(bp, nb_vnbufs);
		LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
	}

	np->n_bufiterflags &= ~NBI_ITER;
	if (np->n_bufiterflags & NBI_ITERWANT) {
		np->n_bufiterflags &= ~NBI_ITERWANT;
		wakeup(&np->n_bufiterflags);
	}
}


/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(
	vnode_t vp,
	struct uio *uio,
	__unused int ioflag,
	kauth_cred_t cred,
	proc_t p)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize;
	off_t diff;
	struct nfsbuf *bp = NULL, *rabp;
	struct nfs_vattr nvattr;
	struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
	daddr64_t lbn, rabn, lastrabn = -1, tlbn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0;
	caddr_t dp;
	struct dirent *direntp = NULL;
	enum vtype vtype;
	int nocachereadahead = 0;

	FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio_uio_resid(uio) == 0) {
		FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
		return (0);
	}
	if (uio->uio_offset < 0) {
		FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
		return (EINVAL);
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
		nfs_fsinfo(nmp, vp, cred, p);
	biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
	vtype = vnode_vtype(vp);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
	 * current attributes, this could be forced by calling
	 * NATTRINVALIDATE() before the nfs_getattr() call.
	 */
91447636
A
1473 if (np->n_flag & NNEEDINVALIDATE) {
1474 np->n_flag &= ~NNEEDINVALIDATE;
1475 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1476 }
1477 if (np->n_flag & NMODIFIED) {
1478 if (vtype != VREG) {
1479 if (vtype != VDIR)
1480 panic("nfs: bioread, not dir");
1481 nfs_invaldir(vp);
1482 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae 1483 if (error) {
91447636 1484 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1c79356b 1485 return (error);
55e303ae 1486 }
91447636
A
1487 }
1488 NATTRINVALIDATE(np);
1489 error = nfs_getattr(vp, &nvattr, cred, p);
1490 if (error) {
1491 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1492 return (error);
1493 }
1494 if (vtype == VDIR) {
1495 /* if directory changed, purge any name cache entries */
1496 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1497 cache_purge(vp);
1498 np->n_ncmtime = nvattr.nva_mtime;
1499 }
1500 np->n_mtime = nvattr.nva_mtime;
1501 } else {
1502 error = nfs_getattr(vp, &nvattr, cred, p);
1503 if (error) {
1504 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1505 return (error);
1506 }
1507 if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) {
1508 if (vtype == VDIR) {
1509 nfs_invaldir(vp);
1510 /* purge name cache entries */
1511 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
483a1d10 1512 cache_purge(vp);
483a1d10 1513 }
91447636 1514 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae 1515 if (error) {
91447636 1516 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1c79356b 1517 return (error);
55e303ae 1518 }
91447636
A
1519 if (vtype == VDIR)
1520 np->n_ncmtime = nvattr.nva_mtime;
1521 np->n_mtime = nvattr.nva_mtime;
1522 }
1523 }
1524
1525 if (vnode_isnocache(vp)) {
1526 if (!(np->n_flag & NNOCACHE)) {
1527 if (NVALIDBUFS(np)) {
1c79356b 1528 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae 1529 if (error) {
91447636 1530 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1c79356b 1531 return (error);
55e303ae 1532 }
1c79356b 1533 }
91447636 1534 np->n_flag |= NNOCACHE;
1c79356b 1535 }
91447636
A
1536 } else if (np->n_flag & NNOCACHE) {
1537 np->n_flag &= ~NNOCACHE;
1c79356b 1538 }
1c79356b 1539
91447636
A
1540 do {
1541 if (np->n_flag & NNOCACHE) {
1542 switch (vtype) {
1543 case VREG:
1544 /*
1545 * If we have only a block or so to read,
1546 * just do the rpc directly.
1547 * If we have a couple blocks or more to read,
1548 * then we'll take advantage of readahead within
1549 * this loop to try to fetch all the data in parallel
1550 */
1551 if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) {
1552 error = nfs_readrpc(vp, uio, cred, p);
1553 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
55e303ae
A
1554 return (error);
1555 }
91447636
A
1556 nocachereadahead = 1;
1557 break;
1c79356b 1558 case VLNK:
91447636
A
1559 error = nfs_readlinkrpc(vp, uio, cred, p);
1560 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
55e303ae 1561 return (error);
1c79356b
A
1562 case VDIR:
1563 break;
1564 default:
91447636 1565 printf(" NFSNOCACHE: type %x unexpected\n", vtype);
1c79356b
A
1566 };
1567 }
91447636 1568 switch (vtype) {
1c79356b 1569 case VREG:
1c79356b 1570 lbn = uio->uio_offset / biosize;
55e303ae
A
1571
1572 /*
1573 * Copy directly from any cached pages without grabbing the bufs.
91447636
A
1574 *
1575 * Note: for "nocache" reads, we don't copy directly from UBC
1576 * because any cached pages will be for readahead buffers that
1577 * need to be invalidated anyway before we finish this request.
55e303ae 1578 */
91447636
A
1579 if (!(np->n_flag & NNOCACHE) &&
1580 (uio->uio_segflg == UIO_USERSPACE32 ||
1581 uio->uio_segflg == UIO_USERSPACE64 ||
1582 uio->uio_segflg == UIO_USERSPACE)) {
1583 // LP64todo - fix this!
1584 int io_resid = uio_uio_resid(uio);
55e303ae
A
1585 diff = np->n_size - uio->uio_offset;
1586 if (diff < io_resid)
1587 io_resid = diff;
1588 if (io_resid > 0) {
1589 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1590 if (error) {
1591 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1592 return (error);
1593 }
1594 }
1595 /* count any biocache reads that we just copied directly */
1596 if (lbn != uio->uio_offset / biosize) {
91447636 1597 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
55e303ae
A
1598 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1599 }
1600 }
1601
1602 lbn = uio->uio_offset / biosize;
1603 on = uio->uio_offset % biosize;
1c79356b
A
1604
1605 /*
1606 * Start the read ahead(s), as required.
1607 */
1608 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
55e303ae 1609 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1c79356b 1610 rabn = lbn + 1 + nra;
55e303ae
A
1611 if (rabn <= lastrabn) {
1612 /* we've already (tried to) read this block */
1613 /* no need to try it again... */
1614 continue;
1c79356b 1615 }
55e303ae 1616 lastrabn = rabn;
91447636
A
1617 if ((off_t)rabn * biosize >= (off_t)np->n_size)
1618 break;
1619 if ((np->n_flag & NNOCACHE) &&
1620 (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio))))
1621 /* for uncached readahead, don't go beyond end of request */
55e303ae
A
1622 break;
1623 /* check if block exists and is valid. */
91447636
A
1624 error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1625 if (error) {
1626 FSDBG_BOT(514, vp, 0xd1e000b, 1, error);
1627 return (error);
1628 }
1629 if (!rabp)
1630 continue;
1631 if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) {
1632 nfs_buf_release(rabp, 1);
55e303ae 1633 continue;
55e303ae
A
1634 }
1635 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1636 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1637 if (nfs_asyncio(rabp, cred)) {
1638 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1639 rabp->nb_error = EIO;
483a1d10 1640 nfs_buf_release(rabp, 1);
55e303ae
A
1641 }
1642 } else
483a1d10 1643 nfs_buf_release(rabp, 1);
55e303ae 1644 }
1c79356b
A
1645 }
1646
91447636
A
1647 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
1648 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
55e303ae
A
1649 return (0);
1650 }
1651
91447636 1652 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
55e303ae 1653
1c79356b
A
1654 /*
1655 * If the block is in the cache and has the required data
1656 * in a valid region, just copy it out.
1657 * Otherwise, get the block and write back/read in,
1658 * as required.
1659 */
1660again:
1661 bufsize = biosize;
91447636
A
1662 // LP64todo - fix this!
1663 n = min((unsigned)(bufsize - on), uio_uio_resid(uio));
1c79356b
A
1664 diff = np->n_size - uio->uio_offset;
1665 if (diff < n)
1666 n = diff;
55e303ae 1667
91447636
A
1668 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp);
1669 if (error) {
55e303ae
A
1670 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1671 return (EINTR);
1672 }
1673
1674 /* if any pages are valid... */
1675 if (bp->nb_valid) {
1676 /* ...check for any invalid pages in the read range */
1677 int pg, firstpg, lastpg, dirtypg;
1678 dirtypg = firstpg = lastpg = -1;
1679 pg = on/PAGE_SIZE;
1680 while (pg <= (on + n - 1)/PAGE_SIZE) {
1681 if (!NBPGVALID(bp,pg)) {
1682 if (firstpg < 0)
1683 firstpg = pg;
1684 lastpg = pg;
1685 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1686 dirtypg = pg;
1687 pg++;
1688 }
1689
1690 /* if there are no invalid pages, we're all set */
1691 if (firstpg < 0) {
1692 if (bp->nb_validoff < 0) {
1693 /* valid range isn't set up, so */
1694 /* set it to what we know is valid */
91447636
A
1695 bp->nb_validoff = trunc_page(on);
1696 bp->nb_validend = round_page(on+n);
55e303ae
A
1697 nfs_buf_normalize_valid_range(np, bp);
1698 }
1699 goto buffer_ready;
1700 }
1701
1702 /* there are invalid pages in the read range */
1703 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1704 /* there are also dirty page(s) in the range, */
1705 /* so write the buffer out and try again */
1706 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1707 SET(bp->nb_flags, NB_ASYNC);
91447636
A
1708 if (bp->nb_wcred == NOCRED) {
1709 kauth_cred_ref(cred);
1710 bp->nb_wcred = cred;
1711 }
55e303ae
A
1712 error = nfs_buf_write(bp);
1713 if (error) {
1714 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1715 return (error);
1716 }
1c79356b
A
1717 goto again;
1718 }
55e303ae
A
1719 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1720 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1721 /* we need to read in more than half the buffer and the */
1722 /* buffer's not dirty, so just fetch the whole buffer */
1723 bp->nb_valid = 0;
1724 } else {
1725 /* read the page range in */
91447636
A
1726 uio_t auio;
1727 char uio_buf[ UIO_SIZEOF(1) ];
1728
55e303ae 1729 NFS_BUF_MAP(bp);
91447636
A
1730 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
1731 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
1732 if (!auio) {
1733 error = ENOMEM;
1734 } else {
1735 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
1736 ((lastpg - firstpg + 1) * PAGE_SIZE));
1737 error = nfs_readrpc(vp, auio, cred, p);
1738 }
55e303ae 1739 if (error) {
91447636
A
1740 if (np->n_flag & NNOCACHE)
1741 SET(bp->nb_flags, NB_NOCACHE);
483a1d10 1742 nfs_buf_release(bp, 1);
55e303ae
A
1743 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1744 return (error);
1745 }
1746 /* Make sure that the valid range is set to cover this read. */
1747 bp->nb_validoff = trunc_page_32(on);
1748 bp->nb_validend = round_page_32(on+n);
1749 nfs_buf_normalize_valid_range(np, bp);
91447636 1750 if (uio_resid(auio) > 0) {
55e303ae
A
1751 /* if short read, must have hit EOF, */
1752 /* so zero the rest of the range */
91447636 1753 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
55e303ae
A
1754 }
1755 /* mark the pages (successfully read) as valid */
1756 for (pg=firstpg; pg <= lastpg; pg++)
1757 NBPGVALID_SET(bp,pg);
1758 }
1c79356b 1759 }
55e303ae
A
1760 /* if no pages are valid, read the whole block */
1761 if (!bp->nb_valid) {
1762 SET(bp->nb_flags, NB_READ);
1763 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1764 error = nfs_doio(bp, cred, p);
1765 if (error) {
91447636
A
1766 if (np->n_flag & NNOCACHE)
1767 SET(bp->nb_flags, NB_NOCACHE);
483a1d10 1768 nfs_buf_release(bp, 1);
55e303ae
A
1769 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1770 return (error);
1771 }
1772 }
1773buffer_ready:
55e303ae
A
1774 /* validate read range against valid range and clip */
1775 if (bp->nb_validend > 0) {
1776 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1777 if (diff < n)
1778 n = diff;
1779 }
1780 if (n > 0)
1781 NFS_BUF_MAP(bp);
1c79356b
A
1782 break;
1783 case VLNK:
91447636
A
1784 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks);
1785 error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp);
1786 if (error) {
1787 FSDBG_BOT(514, vp, 0xd1e0010, 0, error);
1788 return (error);
55e303ae
A
1789 }
1790 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1791 SET(bp->nb_flags, NB_READ);
1c79356b
A
1792 error = nfs_doio(bp, cred, p);
1793 if (error) {
55e303ae 1794 SET(bp->nb_flags, NB_ERROR);
483a1d10 1795 nfs_buf_release(bp, 1);
55e303ae 1796 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1c79356b
A
1797 return (error);
1798 }
1799 }
91447636
A
1800 // LP64todo - fix this!
1801 n = min(uio_uio_resid(uio), bp->nb_validend);
1c79356b
A
1802 on = 0;
1803 break;
1804 case VDIR:
91447636 1805 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
55e303ae
A
1806 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1807 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1808 return (0);
1c79356b
A
1809 }
1810 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1811 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
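/* example (illustrative values): if NFS_DIRBLKSIZ were 4096 and uio_offset 10000, */
/* lbn = 2 and on = 1808; the AND works because NFS_DIRBLKSIZ is a power of two */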
91447636
A
1812 error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1813 if (error) {
1814 FSDBG_BOT(514, vp, 0xd1e0012, 0, error);
1815 return (error);
55e303ae
A
1816 }
1817 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1818 SET(bp->nb_flags, NB_READ);
1c79356b
A
1819 error = nfs_doio(bp, cred, p);
1820 if (error) {
483a1d10 1821 nfs_buf_release(bp, 1);
1c79356b 1822 }
fa4905b1
A
1823 while (error == NFSERR_BAD_COOKIE) {
1824 nfs_invaldir(vp);
1825 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1826 /*
1827 * Yuck! The directory has been modified on the
1828 * server. The only way to get the block is by
1829 * reading from the beginning to get all the
1830 * offset cookies.
1831 */
91447636 1832 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
fa4905b1 1833 if (np->n_direofoffset
91447636 1834 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
55e303ae 1835 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
fa4905b1 1836 return (0);
55e303ae 1837 }
91447636
A
1838 error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1839 if (error) {
1840 FSDBG_BOT(514, vp, 0xd1e0013, 0, error);
1841 return (error);
55e303ae
A
1842 }
1843 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1844 SET(bp->nb_flags, NB_READ);
fa4905b1
A
1845 error = nfs_doio(bp, cred, p);
1846 /*
55e303ae 1847 * no error + NB_INVAL == directory EOF,
fa4905b1
A
1848 * use the block.
1849 */
55e303ae 1850 if (error == 0 && (bp->nb_flags & NB_INVAL))
fa4905b1
A
1851 break;
1852 }
1853 /*
1854 * An error will throw away the block and the
1855 * for loop will break out. If no error and this
1856 * is not the block we want, we throw away the
1857 * block and go for the next one via the for loop.
1858 */
91447636 1859 if (error || tlbn < lbn)
483a1d10 1860 nfs_buf_release(bp, 1);
fa4905b1
A
1861 }
1862 }
1863 /*
1864 * The above while is repeated if we hit another cookie
1865 * error. If we hit an error and it wasn't a cookie error,
1866 * we give up.
1867 */
55e303ae
A
1868 if (error) {
1869 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
fa4905b1 1870 return (error);
55e303ae 1871 }
1c79356b
A
1872 }
1873
1874 /*
1875 * If not eof and read aheads are enabled, start one.
1876 * (You need the current block first, so that you have the
1877 * directory offset cookie of the next block.)
1878 */
1879 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1880 (np->n_direofoffset == 0 ||
1881 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
91447636
A
1882 !nfs_buf_is_incore(vp, lbn + 1)) {
1883 error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1884 if (error) {
1885 FSDBG_BOT(514, vp, 0xd1e0015, 0, error);
1886 return (error);
1887 }
1c79356b 1888 if (rabp) {
55e303ae
A
1889 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1890 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
fa4905b1 1891 if (nfs_asyncio(rabp, cred)) {
55e303ae
A
1892 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1893 rabp->nb_error = EIO;
483a1d10 1894 nfs_buf_release(rabp, 1);
fa4905b1 1895 }
1c79356b 1896 } else {
483a1d10 1897 nfs_buf_release(rabp, 1);
1c79356b
A
1898 }
1899 }
1900 }
1901 /*
1902 * Make sure we use a signed variant of min() since
1903 * the second term may be negative.
1904 */
91447636
A
1905 // LP64todo - fix this!
1906 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
fa4905b1 1907 /*
55e303ae
A
1908 * We keep track of the directory eof in
1909 * np->n_direofoffset and chop it off as an
1910 * extra step right here.
fa4905b1
A
1911 */
1912 if (np->n_direofoffset &&
1913 n > np->n_direofoffset - uio->uio_offset)
1914 n = np->n_direofoffset - uio->uio_offset;
55e303ae
A
1915 /*
1916 * Make sure that we return an integral number of entries so
1917 * that any subsequent calls will start copying from the start
1918 * of the next entry.
1919 *
1920 * If the current value of n has the last entry cut short,
1921 * set n to copy everything up to the last entry instead.
1922 */
1923 if (n > 0) {
1924 dp = bp->nb_data + on;
1925 while (dp < (bp->nb_data + on + n)) {
1926 direntp = (struct dirent *)dp;
1927 dp += direntp->d_reclen;
1928 }
1929 if (dp > (bp->nb_data + on + n))
1930 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1931 }
1c79356b
A
1932 break;
1933 default:
91447636
A
1934 printf("nfs_bioread: type %x unexpected\n", vtype);
1935 FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL);
55e303ae 1936 return (EINVAL);
1c79356b
A
1937 };
1938
1939 if (n > 0) {
55e303ae 1940 error = uiomove(bp->nb_data + on, (int)n, uio);
1c79356b 1941 }
91447636 1942 switch (vtype) {
1c79356b 1943 case VREG:
91447636
A
1944 if (np->n_flag & NNOCACHE)
1945 SET(bp->nb_flags, NB_NOCACHE);
1c79356b
A
1946 break;
1947 case VLNK:
1948 n = 0;
1949 break;
1950 case VDIR:
91447636
A
1951 break;
1952 default:
1c79356b 1953 break;
1c79356b 1954 }
91447636
A
1955 nfs_buf_release(bp, 1);
1956 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
1957 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1c79356b
A
1958 return (error);
1959}
1960
fa4905b1 1961
1c79356b
A
1962/*
1963 * Vnode op for write using bio
1964 */
1965int
1966nfs_write(ap)
91447636
A
1967 struct vnop_write_args /* {
1968 struct vnodeop_desc *a_desc;
1969 vnode_t a_vp;
1c79356b 1970 struct uio *a_uio;
91447636
A
1971 int a_ioflag;
1972 vfs_context_t a_context;
1c79356b
A
1973 } */ *ap;
1974{
55e303ae 1975 struct uio *uio = ap->a_uio;
91447636 1976 vnode_t vp = ap->a_vp;
1c79356b 1977 struct nfsnode *np = VTONFS(vp);
91447636
A
1978 proc_t p;
1979 kauth_cred_t cred;
1c79356b 1980 int ioflag = ap->a_ioflag;
55e303ae 1981 struct nfsbuf *bp;
91447636
A
1982 struct nfs_vattr nvattr;
1983 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1984 daddr64_t lbn;
1985 int biosize, bufsize;
1986 int n, on, error = 0;
483a1d10 1987 off_t boff, start, end, cureof;
91447636 1988 struct iovec_32 iov;
fa4905b1 1989 struct uio auio;
1c79356b 1990
91447636 1991 FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
55e303ae 1992
1c79356b
A
1993#if DIAGNOSTIC
1994 if (uio->uio_rw != UIO_WRITE)
1995 panic("nfs_write mode");
91447636 1996 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
1c79356b
A
1997 panic("nfs_write proc");
1998#endif
91447636
A
1999
2000 p = vfs_context_proc(ap->a_context);
2001 cred = vfs_context_ucred(ap->a_context);
2002
2003 if (vnode_vtype(vp) != VREG)
1c79356b 2004 return (EIO);
91447636
A
2005
2006 np->n_flag |= NWRBUSY;
2007
2008 if (np->n_flag & NNEEDINVALIDATE) {
2009 np->n_flag &= ~NNEEDINVALIDATE;
2010 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
2011 }
1c79356b 2012 if (np->n_flag & NWRITEERR) {
91447636
A
2013 np->n_flag &= ~(NWRITEERR | NWRBUSY);
2014 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error);
1c79356b
A
2015 return (np->n_error);
2016 }
8f6c56a5
A
2017 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
2018 !(nmp->nm_state & NFSSTA_GOTFSINFO))
2019 (void)nfs_fsinfo(nmp, vp, cred, p);
1c79356b
A
2020 if (ioflag & (IO_APPEND | IO_SYNC)) {
2021 if (np->n_flag & NMODIFIED) {
91447636 2022 NATTRINVALIDATE(np);
1c79356b 2023 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae 2024 if (error) {
91447636 2025 np->n_flag &= ~NWRBUSY;
55e303ae 2026 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
1c79356b 2027 return (error);
55e303ae 2028 }
1c79356b
A
2029 }
2030 if (ioflag & IO_APPEND) {
91447636
A
2031 NATTRINVALIDATE(np);
2032 error = nfs_getattr(vp, &nvattr, cred, p);
55e303ae 2033 if (error) {
91447636 2034 np->n_flag &= ~NWRBUSY;
55e303ae 2035 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
1c79356b 2036 return (error);
55e303ae 2037 }
1c79356b
A
2038 uio->uio_offset = np->n_size;
2039 }
2040 }
55e303ae 2041 if (uio->uio_offset < 0) {
91447636 2042 np->n_flag &= ~NWRBUSY;
55e303ae 2043 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
1c79356b 2044 return (EINVAL);
55e303ae 2045 }
91447636
A
2046 if (uio_uio_resid(uio) == 0) {
2047 np->n_flag &= ~NWRBUSY;
2048 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
1c79356b 2049 return (0);
55e303ae 2050 }
55e303ae 2051
8f6c56a5
A
2052 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
2053
91447636
A
2054 if (vnode_isnocache(vp)) {
2055 if (!(np->n_flag & NNOCACHE)) {
2056 if (NVALIDBUFS(np)) {
1c79356b 2057 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae 2058 if (error) {
91447636
A
2059 np->n_flag &= ~NWRBUSY;
2060 FSDBG_BOT(515, vp, 0, 0, error);
1c79356b 2061 return (error);
55e303ae 2062 }
55e303ae 2063 }
91447636 2064 np->n_flag |= NNOCACHE;
55e303ae 2065 }
91447636
A
2066 } else if (np->n_flag & NNOCACHE) {
2067 np->n_flag &= ~NNOCACHE;
2068 }
2069
2070 do {
2071 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes);
1c79356b 2072 lbn = uio->uio_offset / biosize;
55e303ae 2073 on = uio->uio_offset % biosize;
91447636
A
2074 // LP64todo - fix this
2075 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
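/* example (illustrative values): with biosize 32768 and uio_offset 40000, */
/* lbn = 1, on = 7232, and n is at most the 25536 bytes left in that block */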
1c79356b 2076again:
1c79356b 2077 bufsize = biosize;
fa4905b1
A
2078 /*
2079 * Get a cache block for writing. The range to be written is
55e303ae 2080 * (on..on+n) within the block. We ensure that the block
fa4905b1
A
2081 * either has no dirty region or that the given range is
2082 * contiguous with the existing dirty region.
2083 */
91447636
A
2084 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp);
2085 if (error) {
2086 np->n_flag &= ~NWRBUSY;
2087 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2088 return (error);
55e303ae
A
2089 }
2090 /* map the block because we know we're going to write to it */
2091 NFS_BUF_MAP(bp);
2092
91447636 2093 if (np->n_flag & NNOCACHE)
8f6c56a5 2094 SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE));
55e303ae 2095
91447636
A
2096 if (bp->nb_wcred == NOCRED) {
2097 kauth_cred_ref(cred);
2098 bp->nb_wcred = cred;
2099 }
55e303ae
A
2100
2101 /*
2102 * If there's already a dirty range AND dirty pages in this block we
2103 * need to send a commit AND write the dirty pages before continuing.
2104 *
2105 * If there's already a dirty range OR dirty pages in this block
2106 * and the new write range is not contiguous with the existing range,
2107 * then force the buffer to be written out now.
2108 * (We used to just extend the dirty range to cover the valid,
2109 * but unwritten, data in between also. But writing ranges
2110 * of data that weren't actually written by an application
2111 * risks overwriting some other client's data with stale data
2112 * that's just masquerading as new written data.)
2113 */
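/* example (illustrative): an existing dirty range 0..4096 and a new write at */
/* on = 8192 are not contiguous (on > nb_dirtyend), so the buffer is pushed first */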
2114 if (bp->nb_dirtyend > 0) {
2115 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
2116 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
2117 /* write/commit buffer "synchronously" */
2118 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2119 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2120 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2121 error = nfs_buf_write(bp);
2122 if (error) {
91447636
A
2123 np->n_flag &= ~NWRBUSY;
2124 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
55e303ae
A
2125 return (error);
2126 }
2127 goto again;
2128 }
2129 } else if (bp->nb_dirty) {
2130 int firstpg, lastpg;
2131 u_int32_t pagemask;
2132 /* calculate write range pagemask */
2133 firstpg = on/PAGE_SIZE;
2134 lastpg = (on+n-1)/PAGE_SIZE;
2135 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
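/* example: firstpg = 1, lastpg = 2 gives pagemask = 0x6, i.e. bits set */
/* only for the pages touched by this write */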
2136 /* check if there are dirty pages outside the write range */
2137 if (bp->nb_dirty & ~pagemask) {
2138 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
2139 /* write/commit buffer "synchronously" */
2140 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2141 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2142 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2143 error = nfs_buf_write(bp);
2144 if (error) {
91447636
A
2145 np->n_flag &= ~NWRBUSY;
2146 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
55e303ae
A
2147 return (error);
2148 }
2149 goto again;
2150 }
2151 /* if the first or last pages are already dirty */
2152 /* make sure that the dirty range encompasses those pages */
2153 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
2154 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
2155 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
2156 if (NBPGDIRTY(bp,lastpg)) {
2157 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
2158 /* clip to EOF */
91447636 2159 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
55e303ae
A
2160 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2161 } else
2162 bp->nb_dirtyend = on+n;
2163 }
2164 }
2165
fa4905b1 2166 /*
55e303ae
A
2167 * Are we extending the size of the file with this write?
2168 * If so, update file size now that we have the block.
fa4905b1
A
2169 * If there was a partial buf at the old eof, validate
2170 * and zero the new bytes.
2171 */
483a1d10 2172 cureof = (off_t)np->n_size;
91447636 2173 if (uio->uio_offset + n > (off_t)np->n_size) {
55e303ae 2174 struct nfsbuf *eofbp = NULL;
91447636 2175 daddr64_t eofbn = np->n_size / biosize;
55e303ae
A
2176 int eofoff = np->n_size % biosize;
2177 int neweofoff = (uio->uio_offset + n) % biosize;
2178
2179 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
fa4905b1 2180
91447636
A
2181 if (eofoff && (eofbn < lbn)) {
2182 error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp);
2183 if (error) {
2184 np->n_flag &= ~NWRBUSY;
2185 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2186 return (error);
2187 }
2188 }
55e303ae
A
2189
2190 /* if we're extending within the same last block */
2191 /* and the block is flagged as being cached... */
2192 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2193 /* ...check that all pages in buffer are valid */
2194 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2195 u_int32_t pagemask;
2196 /* pagemask only has to extend to last page being written to */
2197 pagemask = (1 << (endpg+1)) - 1;
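/* example: endpg = 2 gives pagemask = 0x7, i.e. pages 0 through 2 must all be valid */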
2198 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2199 if ((bp->nb_valid & pagemask) != pagemask) {
2200 /* zerofill any hole */
2201 if (on > bp->nb_validend) {
2202 int i;
2203 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2204 NBPGVALID_SET(bp, i);
2205 NFS_BUF_MAP(bp);
2206 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2207 bzero((char *)bp->nb_data + bp->nb_validend,
2208 on - bp->nb_validend);
2209 }
2210 /* zerofill any trailing data in the last page */
2211 if (neweofoff) {
2212 NFS_BUF_MAP(bp);
2213 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2214 bzero((char *)bp->nb_data + neweofoff,
2215 PAGE_SIZE - (neweofoff & PAGE_MASK));
2216 }
2217 }
2218 }
fa4905b1
A
2219 np->n_flag |= NMODIFIED;
2220 np->n_size = uio->uio_offset + n;
2221 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
55e303ae
A
2222 if (eofbp) {
2223 /*
2224 * We may need to zero any previously invalid data
2225 * after the old EOF in the previous EOF buffer.
2226 *
2227 * For the old last page, don't zero bytes if there
2228 * are invalid bytes in that page (i.e. the page isn't
2229 * currently valid).
2230 * For pages after the old last page, zero them and
2231 * mark them as valid.
2232 */
2233 char *d;
2234 int i;
91447636 2235 if (np->n_flag & NNOCACHE)
8f6c56a5 2236 SET(eofbp->nb_flags, (NB_NOCACHE|NB_STABLE));
55e303ae
A
2237 NFS_BUF_MAP(eofbp);
2238 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2239 d = eofbp->nb_data;
2240 i = eofoff/PAGE_SIZE;
2241 while (eofoff < biosize) {
2242 int poff = eofoff & PAGE_MASK;
2243 if (!poff || NBPGVALID(eofbp,i)) {
2244 bzero(d + eofoff, PAGE_SIZE - poff);
2245 NBPGVALID_SET(eofbp, i);
2246 }
2247 if (bp->nb_validend == eofoff)
2248 bp->nb_validend += PAGE_SIZE - poff;
2249 eofoff += PAGE_SIZE - poff;
2250 i++;
2251 }
483a1d10 2252 nfs_buf_release(eofbp, 1);
fa4905b1
A
2253 }
2254 }
fa4905b1
A
2255 /*
2256 * If dirtyend exceeds file size, chop it down. This should
2257 * not occur unless there is a race.
2258 */
91447636 2259 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
55e303ae 2260 bp->nb_dirtyend = np->n_size - NBOFF(bp);
fa4905b1 2261 /*
55e303ae
A
2262 * UBC doesn't handle partial pages, so we need to make sure
2263 * that any pages left in the page cache are completely valid.
2264 *
2265 * Writes that are smaller than a block are delayed if they
2266 * don't extend to the end of the block.
fa4905b1 2267 *
55e303ae
A
2268 * If the block isn't (completely) cached, we may need to read
2269 * in some parts of pages that aren't covered by the write.
2270 * If the write offset (on) isn't page aligned, we'll need to
2271 * read the start of the first page being written to. Likewise,
2272 * if the offset of the end of the write (on+n) isn't page aligned,
2273 * we'll need to read the end of the last page being written to.
2274 *
2275 * Notes:
2276 * We don't want to read anything we're just going to write over.
2277 * We don't want to issue multiple I/Os if we don't have to
2278 * (because they're synchronous rpcs).
2279 * We don't want to read anything we already have modified in the
2280 * page cache.
fa4905b1 2281 */
55e303ae
A
2282 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
2283 int firstpg, lastpg, dirtypg;
2284 int firstpgoff, lastpgoff;
2285 start = end = -1;
2286 firstpg = on/PAGE_SIZE;
2287 firstpgoff = on & PAGE_MASK;
2288 lastpg = (on+n-1)/PAGE_SIZE;
2289 lastpgoff = (on+n) & PAGE_MASK;
2290 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2291 /* need to read start of first page */
2292 start = firstpg * PAGE_SIZE;
2293 end = start + firstpgoff;
fa4905b1 2294 }
55e303ae
A
2295 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2296 /* need to read end of last page */
2297 if (start < 0)
2298 start = (lastpg * PAGE_SIZE) + lastpgoff;
2299 end = (lastpg + 1) * PAGE_SIZE;
fa4905b1 2300 }
fa4905b1 2301 if (end > start) {
55e303ae
A
2302 /* need to read the data in range: start...end-1 */
2303
55e303ae
A
2304 /* first, check for dirty pages in between */
2305 /* if there are, we'll have to do two reads because */
2306 /* we don't want to overwrite the dirty pages. */
2307 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2308 if (NBPGDIRTY(bp,dirtypg))
2309 break;
2310
2311 /* if start is at beginning of page, try */
2312 /* to get any preceding pages as well. */
2313 if (!(start & PAGE_MASK)) {
2314 /* stop at next dirty/valid page or start of block */
2315 for (; start > 0; start-=PAGE_SIZE)
2316 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2317 break;
2318 }
2319
2320 NFS_BUF_MAP(bp);
2321 /* setup uio for read(s) */
2322 boff = NBOFF(bp);
91447636 2323 auio.uio_iovs.iov32p = &iov;
fa4905b1 2324 auio.uio_iovcnt = 1;
91447636 2325#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
fa4905b1 2326 auio.uio_segflg = UIO_SYSSPACE;
91447636
A
2327#else
2328 auio.uio_segflg = UIO_SYSSPACE32;
2329#endif
fa4905b1 2330 auio.uio_rw = UIO_READ;
55e303ae
A
2331
2332 if (dirtypg <= (end-1)/PAGE_SIZE) {
2333 /* there's a dirty page in the way, so just do two reads */
2334 /* we'll read the preceding data here */
2335 auio.uio_offset = boff + start;
91447636
A
2336 iov.iov_len = on - start;
2337 uio_uio_resid_set(&auio, iov.iov_len);
2338 iov.iov_base = (uintptr_t) bp->nb_data + start;
2339 error = nfs_readrpc(vp, &auio, cred, p);
55e303ae
A
2340 if (error) {
2341 bp->nb_error = error;
2342 SET(bp->nb_flags, NB_ERROR);
2343 printf("nfs_write: readrpc %d", error);
2344 }
91447636
A
2345 if (uio_uio_resid(&auio) > 0) {
2346 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01);
2347 // LP64todo - fix this
2348 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
55e303ae
A
2349 }
2350 /* update validoff/validend if necessary */
2351 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2352 bp->nb_validoff = start;
2353 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2354 bp->nb_validend = on;
91447636 2355 if ((off_t)np->n_size > boff + bp->nb_validend)
55e303ae
A
2356 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2357 /* validate any pages before the write offset */
2358 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2359 NBPGVALID_SET(bp, start/PAGE_SIZE);
2360 /* adjust start to read any trailing data */
2361 start = on+n;
2362 }
2363
2364 /* if end is at end of page, try to */
2365 /* get any following pages as well. */
2366 if (!(end & PAGE_MASK)) {
2367 /* stop at next valid page or end of block */
2368 for (; end < bufsize; end+=PAGE_SIZE)
2369 if (NBPGVALID(bp,end/PAGE_SIZE))
2370 break;
2371 }
2372
483a1d10
A
2373 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2374 /*
2375 * Either this entire read is beyond the current EOF
2376 * or the range that we won't be modifying (on+n...end)
2377 * is all beyond the current EOF.
2378 * No need to make a trip across the network to
2379 * read nothing. So, just zero the buffer instead.
2380 */
2381 FSDBG(516, bp, start, end - start, 0xd00dee00);
2382 bzero(bp->nb_data + start, end - start);
2383 } else {
2384 /* now we'll read the (rest of the) data */
2385 auio.uio_offset = boff + start;
91447636
A
2386 iov.iov_len = end - start;
2387 uio_uio_resid_set(&auio, iov.iov_len);
2388 iov.iov_base = (uintptr_t) (bp->nb_data + start);
2389 error = nfs_readrpc(vp, &auio, cred, p);
483a1d10
A
2390 if (error) {
2391 bp->nb_error = error;
2392 SET(bp->nb_flags, NB_ERROR);
2393 printf("nfs_write: readrpc %d", error);
2394 }
91447636
A
2395 if (uio_uio_resid(&auio) > 0) {
2396 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02);
2397 // LP64todo - fix this
2398 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
483a1d10 2399 }
55e303ae
A
2400 }
2401 /* update validoff/validend if necessary */
2402 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2403 bp->nb_validoff = start;
2404 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2405 bp->nb_validend = end;
91447636 2406 if ((off_t)np->n_size > boff + bp->nb_validend)
55e303ae
A
2407 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2408 /* validate any pages before the write offset's page */
2409 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2410 NBPGVALID_SET(bp, start/PAGE_SIZE);
2411 /* validate any pages after the range of pages being written to */
2412 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2413 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2414 /* Note: pages being written to will be validated when written */
fa4905b1 2415 }
fa4905b1 2416 }
55e303ae
A
2417
2418 if (ISSET(bp->nb_flags, NB_ERROR)) {
2419 error = bp->nb_error;
483a1d10 2420 nfs_buf_release(bp, 1);
91447636
A
2421 np->n_flag &= ~NWRBUSY;
2422 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
1c79356b
A
2423 return (error);
2424 }
55e303ae 2425
1c79356b
A
2426 np->n_flag |= NMODIFIED;
2427
55e303ae
A
2428 NFS_BUF_MAP(bp);
2429 error = uiomove((char *)bp->nb_data + on, n, uio);
1c79356b 2430 if (error) {
55e303ae 2431 SET(bp->nb_flags, NB_ERROR);
483a1d10 2432 nfs_buf_release(bp, 1);
91447636
A
2433 np->n_flag &= ~NWRBUSY;
2434 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
1c79356b
A
2435 return (error);
2436 }
55e303ae
A
2437
2438 /* validate any pages written to */
2439 start = on & ~PAGE_MASK;
2440 for (; start < on+n; start += PAGE_SIZE) {
2441 NBPGVALID_SET(bp, start/PAGE_SIZE);
2442 /*
2443 * This may seem a little weird, but we don't actually set the
2444 * dirty bits for writes. This is because we keep the dirty range
2445 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2446 * delayed writes, when we give the pages back to the VM we don't
2447 * want to keep them marked dirty, because when we later write the
2448 * buffer we won't be able to tell which pages were written dirty
2449 * and which pages were mmapped and dirtied.
2450 */
2451 }
2452 if (bp->nb_dirtyend > 0) {
2453 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2454 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
1c79356b 2455 } else {
55e303ae
A
2456 bp->nb_dirtyoff = on;
2457 bp->nb_dirtyend = on + n;
1c79356b 2458 }
55e303ae
A
2459 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2460 bp->nb_validoff > bp->nb_dirtyend) {
2461 bp->nb_validoff = bp->nb_dirtyoff;
2462 bp->nb_validend = bp->nb_dirtyend;
1c79356b 2463 } else {
55e303ae
A
2464 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2465 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
1c79356b 2466 }
55e303ae
A
2467 if (!ISSET(bp->nb_flags, NB_CACHE))
2468 nfs_buf_normalize_valid_range(np, bp);
1c79356b
A
2469
2470 /*
2471 * Since this block is being modified, it must be written
2472 * again and not just committed.
2473 */
55e303ae
A
2474 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2475 np->n_needcommitcnt--;
2476 CHECK_NEEDCOMMITCNT(np);
2477 }
2478 CLR(bp->nb_flags, NB_NEEDCOMMIT);
1c79356b 2479
91447636 2480 if (ioflag & IO_SYNC) {
55e303ae
A
2481 bp->nb_proc = p;
2482 error = nfs_buf_write(bp);
2483 if (error) {
91447636 2484 np->n_flag &= ~NWRBUSY;
55e303ae 2485 FSDBG_BOT(515, vp, uio->uio_offset,
91447636 2486 uio_uio_resid(uio), error);
1c79356b 2487 return (error);
55e303ae 2488 }
91447636
A
2489 } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) {
2490 bp->nb_proc = NULL;
55e303ae
A
2491 SET(bp->nb_flags, NB_ASYNC);
2492 nfs_buf_write(bp);
1c79356b 2493 } else
91447636 2494 nfs_buf_write_delayed(bp, p);
55e303ae 2495
91447636
A
2496 if (np->n_needcommitcnt > (nfsbufcnt/16))
2497 nfs_flushcommits(vp, p, 1);
55e303ae 2498
91447636 2499 } while (uio_uio_resid(uio) > 0 && n > 0);
55e303ae 2500
91447636 2501 np->n_flag &= ~NWRBUSY;
8f6c56a5
A
2502 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2503 return (0);
1c79356b
A
2504}
2505
1c79356b 2506/*
55e303ae
A
2507 * Flush out and invalidate all buffers associated with a vnode.
2508 * Called with the underlying object locked.
1c79356b 2509 */
55e303ae 2510static int
91447636
A
2511nfs_vinvalbuf_internal(
2512 vnode_t vp,
2513 int flags,
2514 kauth_cred_t cred,
2515 proc_t p,
2516 int slpflag,
2517 int slptimeo)
1c79356b 2518{
55e303ae 2519 struct nfsbuf *bp;
91447636
A
2520 struct nfsbuflists blist;
2521 int list, error = 0;
55e303ae 2522 struct nfsnode *np = VTONFS(vp);
9bccf70c 2523
55e303ae 2524 if (flags & V_SAVE) {
91447636
A
2525 if ((error = nfs_flush(vp, MNT_WAIT, cred, p,
2526 (flags & V_IGNORE_WRITEERR))))
55e303ae 2527 return (error);
91447636 2528 if (!LIST_EMPTY(&np->n_dirtyblkhd))
55e303ae 2529 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
91447636 2530 vp, LIST_FIRST(&np->n_dirtyblkhd));
9bccf70c
A
2531 }
2532
91447636 2533 lck_mtx_lock(nfs_buf_mutex);
55e303ae 2534 for (;;) {
91447636
A
2535 list = NBI_CLEAN;
2536 if (nfs_buf_iterprepare(np, &blist, list)) {
2537 list = NBI_DIRTY;
2538 if (nfs_buf_iterprepare(np, &blist, list))
2539 break;
2540 }
2541 while ((bp = LIST_FIRST(&blist))) {
2542 LIST_REMOVE(bp, nb_vnbufs);
2543 if (list == NBI_CLEAN)
2544 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2545 else
2546 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2547 nfs_buf_refget(bp);
2548 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2549 FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags);
2550 if (error != EAGAIN) {
55e303ae 2551 FSDBG(554, vp, bp, -1, error);
91447636
A
2552 nfs_buf_refrele(bp);
2553 nfs_buf_itercomplete(np, &blist, list);
2554 lck_mtx_unlock(nfs_buf_mutex);
55e303ae
A
2555 return (error);
2556 }
55e303ae 2557 }
91447636 2558 nfs_buf_refrele(bp);
55e303ae 2559 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
91447636
A
2560 lck_mtx_unlock(nfs_buf_mutex);
2561 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp &&
2562 (NBOFF(bp) < (off_t)np->n_size)) {
55e303ae
A
2563 /* XXX extra paranoia: make sure we're not */
2564 /* somehow leaving any dirty data around */
2565 int mustwrite = 0;
91447636
A
2566 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
2567 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
55e303ae
A
2568 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2569 error = nfs_buf_upl_setup(bp);
2570 if (error == EINVAL) {
2571 /* vm object must no longer exist */
2572 /* hopefully we don't need to do */
2573 /* anything for this buffer */
2574 } else if (error)
91447636 2575 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
55e303ae
A
2576 bp->nb_valid = bp->nb_dirty = 0;
2577 }
2578 nfs_buf_upl_check(bp);
2579 /* check for any dirty data before the EOF */
2580 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2581 /* clip dirty range to EOF */
2582 if (bp->nb_dirtyend > end)
2583 bp->nb_dirtyend = end;
2584 mustwrite++;
2585 }
2586 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
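/* example: with 4K pages and end = 6000, round_page_32(end) covers 2 pages, */
/* so the mask 0x3 keeps pages 0..1 and drops dirty bits for pages entirely past EOF */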
91447636
A
2587 /* also make sure we'll have a credential to do the write */
2588 if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) {
2589 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
2590 mustwrite = 0;
2591 }
55e303ae
A
2592 if (mustwrite) {
2593 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2594 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2595 panic("nfs_vinvalbuf: dirty buffer without upl");
2596 /* gotta write out dirty data before invalidating */
2597 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2598 /* (NB_NOCACHE indicates buffer should be discarded) */
2599 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2600 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
91447636
A
2601 if (bp->nb_wcred == NOCRED) {
2602 kauth_cred_ref(cred);
2603 bp->nb_wcred = cred;
2604 }
55e303ae
A
2605 error = nfs_buf_write(bp);
2606 // Note: bp has been released
2607 if (error) {
2608 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2609 np->n_error = error;
2610 np->n_flag |= NWRITEERR;
91447636
A
2611 /*
2612 * There was a write error and we need to
2613 * invalidate attrs to sync with server.
2614 * (if this write was extending the file,
2615 * we may no longer know the correct size)
2616 */
2617 NATTRINVALIDATE(np);
55e303ae
A
2618 error = 0;
2619 }
91447636
A
2620 lck_mtx_lock(nfs_buf_mutex);
2621 continue;
55e303ae
A
2622 }
2623 }
2624 SET(bp->nb_flags, NB_INVAL);
91447636 2625 // hold off on FREEUPs until we're done here
483a1d10 2626 nfs_buf_release(bp, 0);
91447636 2627 lck_mtx_lock(nfs_buf_mutex);
55e303ae 2628 }
91447636 2629 nfs_buf_itercomplete(np, &blist, list);
55e303ae 2630 }
91447636 2631 lck_mtx_unlock(nfs_buf_mutex);
483a1d10 2632 NFS_BUF_FREEUP();
91447636 2633 if (NVALIDBUFS(np))
55e303ae
A
2634 panic("nfs_vinvalbuf: flush failed");
2635 return (0);
1c79356b
A
2636}
2637
55e303ae 2638
1c79356b
A
2639/*
2640 * Flush and invalidate all dirty buffers. If another process is already
2641 * doing the flush, just wait for completion.
2642 */
2643int
91447636
A
2644nfs_vinvalbuf(
2645 vnode_t vp,
2646 int flags,
2647 kauth_cred_t cred,
2648 proc_t p,
2649 int intrflg)
1c79356b 2650{
91447636
A
2651 struct nfsnode *np = VTONFS(vp);
2652 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1c79356b 2653 int error = 0, slpflag, slptimeo;
91447636 2654 off_t size;
1c79356b 2655
55e303ae
A
2656 FSDBG_TOP(554, vp, flags, intrflg, 0);
2657
2658 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
1c79356b
A
2659 intrflg = 0;
2660 if (intrflg) {
2661 slpflag = PCATCH;
2662 slptimeo = 2 * hz;
2663 } else {
2664 slpflag = 0;
2665 slptimeo = 0;
2666 }
2667 /*
2668 * First wait for any other process doing a flush to complete.
2669 */
2670 while (np->n_flag & NFLUSHINPROG) {
2671 np->n_flag |= NFLUSHWANT;
55e303ae
A
2672 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2673 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2674 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
91447636 2675 if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
55e303ae
A
2676 FSDBG_BOT(554, vp, flags, intrflg, error);
2677 return (error);
2678 }
1c79356b
A
2679 }
2680
2681 /*
2682 * Now, flush as required.
2683 */
2684 np->n_flag |= NFLUSHINPROG;
55e303ae 2685 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
1c79356b 2686 while (error) {
55e303ae 2687 FSDBG(554, vp, 0, 0, error);
91447636 2688 error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p);
55e303ae 2689 if (error) {
1c79356b
A
2690 np->n_flag &= ~NFLUSHINPROG;
2691 if (np->n_flag & NFLUSHWANT) {
2692 np->n_flag &= ~NFLUSHWANT;
2693 wakeup((caddr_t)&np->n_flag);
2694 }
55e303ae
A
2695 FSDBG_BOT(554, vp, flags, intrflg, error);
2696 return (error);
1c79356b 2697 }
55e303ae 2698 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
1c79356b
A
2699 }
2700 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2701 if (np->n_flag & NFLUSHWANT) {
2702 np->n_flag &= ~NFLUSHWANT;
2703 wakeup((caddr_t)&np->n_flag);
2704 }
91447636
A
2705 /*
2706 * get the pages out of vm also
2707 */
2708 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
2709 int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE);
55e303ae 2710 if (!rv)
91447636 2711 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
0b4e3aa0 2712 }
91447636 2713
55e303ae 2714 FSDBG_BOT(554, vp, flags, intrflg, 0);
1c79356b
A
2715 return (0);
2716}
2717
2718/*
2719 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2720 * This is mainly to avoid queueing async I/O requests when the nfsiods
2721 * are all hung on a dead server.
2722 */
2723int
2724nfs_asyncio(bp, cred)
55e303ae 2725 struct nfsbuf *bp;
91447636 2726 kauth_cred_t cred;
1c79356b
A
2727{
2728 struct nfsmount *nmp;
2729 int i;
2730 int gotiod;
2731 int slpflag = 0;
2732 int slptimeo = 0;
55e303ae 2733 int error, error2;
91447636
A
2734 void *wakeme = NULL;
2735 struct timespec ts;
1c79356b
A
2736
2737 if (nfs_numasync == 0)
2738 return (EIO);
55e303ae
A
2739
2740 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2741
91447636 2742 nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL);
1c79356b 2743again:
55e303ae 2744 if (nmp && nmp->nm_flag & NFSMNT_INT)
1c79356b
A
2745 slpflag = PCATCH;
2746 gotiod = FALSE;
2747
91447636
A
2748 lck_mtx_lock(nfs_iod_mutex);
2749
55e303ae
A
2750 /* no nfsbuf means tell nfsiod to process delwri list */
2751 if (!bp)
2752 nfs_ioddelwri = 1;
2753
1c79356b
A
2754 /*
2755 * Find a free iod to process this request.
2756 */
2757 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2758 if (nfs_iodwant[i]) {
2759 /*
2760 * Found one, so wake it up and tell it which
2761 * mount to process.
2762 */
91447636 2763 nfs_iodwant[i] = NULL;
1c79356b 2764 nfs_iodmount[i] = nmp;
55e303ae
A
2765 if (nmp)
2766 nmp->nm_bufqiods++;
91447636 2767 wakeme = &nfs_iodwant[i];
1c79356b
A
2768 gotiod = TRUE;
2769 break;
2770 }
2771
55e303ae 2772 /* if we're just poking the delwri list, we're done */
91447636
A
2773 if (!bp) {
2774 lck_mtx_unlock(nfs_iod_mutex);
2775 if (wakeme)
2776 wakeup(wakeme);
2777 FSDBG_BOT(552, bp, 0x10101010, wakeme, 0);
55e303ae 2778 return (0);
91447636 2779 }
55e303ae 2780
1c79356b
A
2781 /*
2782 * If none are free, we may already have an iod working on this mount
2783 * point. If so, it will process our request.
2784 */
2785 if (!gotiod) {
2786 if (nmp->nm_bufqiods > 0) {
1c79356b
A
2787 gotiod = TRUE;
2788 }
2789 }
2790
2791 /*
2792 * If we have an iod which can process the request, then queue
2793 * the buffer.
2794 */
55e303ae 2795 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
1c79356b
A
2796 if (gotiod) {
2797 /*
2798 * Ensure that the queue never grows too large.
2799 */
2800 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
55e303ae
A
2801 if (ISSET(bp->nb_flags, NB_IOD)) {
2802 /* An nfsiod is attempting this async operation so */
2803 /* we must not fall asleep on the bufq because we */
2804 /* could be waiting on ourselves. Just return error */
2805 /* and we'll do this operation synchronously. */
2806 goto out;
2807 }
2808 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
1c79356b 2809 nmp->nm_bufqwant = TRUE;
91447636
A
2810
2811 ts.tv_sec = (slptimeo/100);
2812 /* the hz value is 100, so each remaining tick is 10ms */
2813 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
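/* example: slptimeo = 250 ticks at hz = 100 yields ts = { 2 sec, 500000000 ns } */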
2814
2815 error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO,
2816 "nfsaio", &ts);
1c79356b 2817 if (error) {
55e303ae
A
2818 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2819 if (error2) {
91447636 2820 lck_mtx_unlock(nfs_iod_mutex);
55e303ae
A
2821 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2822 return (error2);
2823 }
1c79356b
A
2824 if (slpflag == PCATCH) {
2825 slpflag = 0;
2826 slptimeo = 2 * hz;
2827 }
2828 }
2829 /*
2830 * We might have lost our iod while sleeping,
2831 * so check and loop if necessary.
2832 */
2833 if (nmp->nm_bufqiods == 0) {
91447636 2834 lck_mtx_unlock(nfs_iod_mutex);
1c79356b
A
2835 goto again;
2836 }
2837 }
2838
55e303ae
A
2839 if (ISSET(bp->nb_flags, NB_READ)) {
2840 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
91447636
A
2841 kauth_cred_ref(cred);
2842 bp->nb_rcred = cred;
1c79356b
A
2843 }
2844 } else {
55e303ae
A
2845 SET(bp->nb_flags, NB_WRITEINPROG);
2846 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
91447636
A
2847 kauth_cred_ref(cred);
2848 bp->nb_wcred = cred;
1c79356b
A
2849 }
2850 }
2851
55e303ae 2852 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
1c79356b 2853 nmp->nm_bufqlen++;
91447636
A
2854 lck_mtx_unlock(nfs_iod_mutex);
2855 if (wakeme)
2856 wakeup(wakeme);
55e303ae 2857 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
1c79356b
A
2858 return (0);
2859 }
2860
55e303ae 2861out:
91447636 2862 lck_mtx_unlock(nfs_iod_mutex);
1c79356b
A
2863 /*
2864 * All the iods are busy on other mounts, so return EIO to
2865 * force the caller to process the i/o synchronously.
2866 */
55e303ae 2867 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
1c79356b
A
2868 return (EIO);
2869}
2870
2871/*
2872 * Do an I/O operation to/from a cache block. This may be called
2873 * synchronously or from an nfsiod.
2874 */
2875int
91447636 2876nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p)
1c79356b 2877{
91447636
A
2878 struct uio *uiop;
2879 vnode_t vp;
1c79356b
A
2880 struct nfsnode *np;
2881 struct nfsmount *nmp;
8f6c56a5 2882 int error = 0, diff, len, iomode, invalidate = 0;
1c79356b 2883 struct uio uio;
91447636
A
2884 struct iovec_32 io;
2885 enum vtype vtype;
1c79356b 2886
55e303ae 2887 vp = bp->nb_vp;
91447636 2888 vtype = vnode_vtype(vp);
1c79356b 2889 np = VTONFS(vp);
91447636 2890 nmp = VFSTONFS(vnode_mount(vp));
1c79356b 2891 uiop = &uio;
91447636 2892 uiop->uio_iovs.iov32p = &io;
1c79356b 2893 uiop->uio_iovcnt = 1;
91447636 2894#if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
1c79356b 2895 uiop->uio_segflg = UIO_SYSSPACE;
91447636
A
2896#else
2897 uiop->uio_segflg = UIO_SYSSPACE32;
2898#endif
1c79356b 2899
55e303ae
A
2900 /*
2901 * we've decided to perform I/O for this block,
2902 * so it can't possibly still be marked NB_DONE. So, clear it.
1c79356b 2903 */
55e303ae
A
2904 if (ISSET(bp->nb_flags, NB_DONE)) {
2905 if (!ISSET(bp->nb_flags, NB_ASYNC))
1c79356b 2906 panic("nfs_doio: done and not async");
55e303ae 2907 CLR(bp->nb_flags, NB_DONE);
1c79356b 2908 }
55e303ae
A
2909 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2910 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2911 bp->nb_dirtyend);
2912
2913 if (ISSET(bp->nb_flags, NB_READ)) {
91447636 2914 if (vtype == VREG)
55e303ae 2915 NFS_BUF_MAP(bp);
91447636
A
2916 io.iov_len = bp->nb_bufsize;
2917 uio_uio_resid_set(uiop, io.iov_len);
2918 io.iov_base = (uintptr_t) bp->nb_data;
1c79356b 2919 uiop->uio_rw = UIO_READ;
91447636 2920 switch (vtype) {
1c79356b 2921 case VREG:
55e303ae 2922 uiop->uio_offset = NBOFF(bp);
91447636
A
2923 OSAddAtomic(1, (SInt32*)&nfsstats.read_bios);
2924 error = nfs_readrpc(vp, uiop, cr, p);
2925 FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error);
1c79356b 2926 if (!error) {
55e303ae
A
2927 /* update valid range */
2928 bp->nb_validoff = 0;
91447636 2929 if (uio_uio_resid(uiop) != 0) {
1c79356b
A
2930 /*
2931 * If len > 0, there is a hole in the file and
2932 * no writes after the hole have been pushed to
2933 * the server yet.
2934 * Just zero fill the rest of the valid area.
2935 */
91447636
A
2936 // LP64todo - fix this
2937 diff = bp->nb_bufsize - uio_uio_resid(uiop);
55e303ae 2938 len = np->n_size - (NBOFF(bp) + diff);
fa4905b1 2939 if (len > 0) {
91447636
A
2940 // LP64todo - fix this
2941 len = min(len, uio_uio_resid(uiop));
55e303ae
A
2942 bzero((char *)bp->nb_data + diff, len);
2943 bp->nb_validend = diff + len;
fa4905b1
A
2944 FSDBG(258, diff, len, 0, 1);
2945 } else
55e303ae 2946 bp->nb_validend = diff;
1c79356b 2947 } else
55e303ae
A
2948 bp->nb_validend = bp->nb_bufsize;
2949 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2950 if (bp->nb_validend & PAGE_MASK) {
2951 /* valid range ends in the middle of a page so we */
2952 /* need to zero-fill any invalid data at the end */
2953 /* of the last page */
2954 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2955 bp->nb_bufsize - bp->nb_validend);
2956 FSDBG(258, bp->nb_validend,
2957 bp->nb_bufsize - bp->nb_validend, 0, 2);
1c79356b 2958 }
1c79356b 2959 }
1c79356b
A
2960 break;
2961 case VLNK:
2962 uiop->uio_offset = (off_t)0;
91447636
A
2963 OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios);
2964 error = nfs_readlinkrpc(vp, uiop, cr, p);
55e303ae
A
2965 if (!error) {
2966 bp->nb_validoff = 0;
2967 bp->nb_validend = uiop->uio_offset;
2968 }
1c79356b
A
2969 break;
2970 case VDIR:
91447636 2971 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
55e303ae 2972 uiop->uio_offset = NBOFF(bp);
1c79356b
A
2973 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2974 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2975 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
91447636 2976 error = nfs_readdirplusrpc(vp, uiop, cr, p);
1c79356b
A
2977 if (error == NFSERR_NOTSUPP)
2978 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2979 }
2980 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
91447636 2981 error = nfs_readdirrpc(vp, uiop, cr, p);
55e303ae
A
2982 if (!error) {
2983 bp->nb_validoff = 0;
2984 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2985 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2986 }
1c79356b
A
2987 break;
2988 default:
91447636 2989 printf("nfs_doio: type %x unexpected\n", vtype);
1c79356b
A
2990 break;
2991 };
2992 if (error) {
55e303ae
A
2993 SET(bp->nb_flags, NB_ERROR);
2994 bp->nb_error = error;
1c79356b 2995 }
55e303ae 2996
1c79356b 2997 } else {
55e303ae
A
2998 /* we're doing a write */
2999 int doff, dend = 0;
3000
3001 /* We need to make sure the pages are locked before doing I/O. */
91447636 3002 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) {
55e303ae
A
3003 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
3004 error = nfs_buf_upl_setup(bp);
3005 if (error) {
3006 printf("nfs_doio: upl create failed %d\n", error);
3007 SET(bp->nb_flags, NB_ERROR);
3008 bp->nb_error = EIO;
3009 return (EIO);
3010 }
3011 nfs_buf_upl_check(bp);
3012 }
3013 }
3014
3015 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
3016 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
3017 /*
3018 * There are pages marked dirty that need to be written out.
3019 *
3020 * We don't want to just combine the write range with the
3021 * range of pages that are dirty because that could cause us
3022 * to write out bytes that were never actually written.
3023 * We also don't want to write data more than once.
3024 *
3025 * If the dirty range just needs to be committed, we do that.
3026 * Otherwise, we write the dirty range and clear the dirty bits
3027 * for any COMPLETE pages covered by that range.
3028 * If there are dirty pages left after that, we write out the
3029 * parts that we haven't written yet.
3030 */
3031 }
3032
fa4905b1 3033 /*
55e303ae
A
3034 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
3035 * an actual write will have to be done.
3036 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
fa4905b1 3037 */
8f6c56a5
A
3038 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3039 nfs_buf_check_write_verifier(np, bp);
55e303ae
A
3040 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
3041 doff = NBOFF(bp) + bp->nb_dirtyoff;
3042 SET(bp->nb_flags, NB_WRITEINPROG);
3043 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
3044 bp->nb_wcred, bp->nb_proc);
3045 CLR(bp->nb_flags, NB_WRITEINPROG);
3046 if (!error) {
3047 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3048 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3049 np->n_needcommitcnt--;
3050 CHECK_NEEDCOMMITCNT(np);
8f6c56a5 3051 }
fa4905b1 3052 }
1c79356b 3053
55e303ae
A
3054 if (!error && bp->nb_dirtyend > 0) {
3055 /* there's a dirty range that needs to be written out */
3056 u_int32_t pagemask;
3057 int firstpg, lastpg;
3058
91447636 3059 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
55e303ae
A
3060 bp->nb_dirtyend = np->n_size - NBOFF(bp);
3061
3062 NFS_BUF_MAP(bp);
3063
3064 doff = bp->nb_dirtyoff;
3065 dend = bp->nb_dirtyend;
3066
3067 /* if doff page is dirty, move doff to start of page */
3068 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
3069 doff -= doff & PAGE_MASK;
3070 /* try to expand write range to include preceding dirty pages */
3071 if (!(doff & PAGE_MASK))
3072 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
3073 doff -= PAGE_SIZE;
3074 /* if dend page is dirty, move dend to start of next page */
3075 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
3076 dend = round_page_32(dend);
3077 /* try to expand write range to include trailing dirty pages */
3078 if (!(dend & PAGE_MASK))
3079 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
3080 dend += PAGE_SIZE;
3081 /* make sure to keep dend clipped to EOF */
91447636 3082 if (NBOFF(bp) + dend > (off_t)np->n_size)
55e303ae
A
3083 dend = np->n_size - NBOFF(bp);
3084 /* calculate range of complete pages being written */
3085 firstpg = round_page_32(doff) / PAGE_SIZE;
3086 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
3087 /* calculate mask for that page range */
3088 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
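/* example: with 4K pages, doff = 4096 and dend = 12288 give firstpg = 1, */
/* lastpg = 2, pagemask = 0x6 -- only the pages completely covered by the write */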
3089
3090 /* compare page mask to nb_dirty; if there are other dirty pages */
3091 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
8f6c56a5 3092 /* not needcommit/nocache/call; otherwise write FILESYNC */
55e303ae
A
3093 if (bp->nb_dirty & ~pagemask)
3094 iomode = NFSV3WRITE_FILESYNC;
8f6c56a5 3095 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
1c79356b
A
3096 iomode = NFSV3WRITE_UNSTABLE;
3097 else
3098 iomode = NFSV3WRITE_FILESYNC;
55e303ae
A
3099
3100 /* write the dirty range */
91447636
A
3101 io.iov_len = dend - doff;
3102 uio_uio_resid_set(uiop, io.iov_len);
55e303ae 3103 uiop->uio_offset = NBOFF(bp) + doff;
91447636 3104 io.iov_base = (uintptr_t) bp->nb_data + doff;
55e303ae
A
3105 uiop->uio_rw = UIO_WRITE;
3106
91447636 3107 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
55e303ae
A
3108
3109 SET(bp->nb_flags, NB_WRITEINPROG);
8f6c56a5 3110 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &bp->nb_verf);
55e303ae
A
3111 /* clear dirty bits for pages we've written */
3112 if (!error)
3113 bp->nb_dirty &= ~pagemask;
3114 /* set/clear needcommit flag */
3115 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
3116 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3117 np->n_needcommitcnt++;
3118 SET(bp->nb_flags, NB_NEEDCOMMIT);
3119 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
3120 bp->nb_dirtyoff = doff;
3121 bp->nb_dirtyend = dend;
3122 } else {
3123 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3124 np->n_needcommitcnt--;
3125 CHECK_NEEDCOMMITCNT(np);
3126 }
3127 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3128 }
3129 CLR(bp->nb_flags, NB_WRITEINPROG);
1c79356b 3130 /*
55e303ae
A
3131 * For an interrupted write, the buffer is still valid and the write
3132 * hasn't been pushed to the server yet, so we can't set NB_ERROR and
3133 * report the interruption by setting NB_EINTR. For the NB_ASYNC case,
3134 * NB_EINTR is not relevant.
3135 *
3136 * For the case of a V3 write rpc not being committed to stable
3137 * storage, the block is still dirty and requires either a commit rpc
3138 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
3139 * block is reused. This is indicated by setting the NB_DELWRI and
3140 * NB_NEEDCOMMIT flags.
1c79356b 3141 */
55e303ae 3142 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
8f6c56a5 3143 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
55e303ae
A
3144 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3145 SET(bp->nb_flags, NB_DELWRI);
91447636
A
3146 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
3147 NFSBUFCNTCHK(0);
55e303ae
A
3148 }
3149 FSDBG(261, bp->nb_validoff, bp->nb_validend,
3150 bp->nb_bufsize, 0);
3151 /*
3152 * Since for the NB_ASYNC case, nfs_bwrite() has
3153 * reassigned the buffer to the clean list, we have to
3154 * reassign it back to the dirty one. Ugh.
3155 */
3156 if (ISSET(bp->nb_flags, NB_ASYNC)) {
3157 /* move to dirty list */
91447636 3158 lck_mtx_lock(nfs_buf_mutex);
55e303ae
A
3159 if (bp->nb_vnbufs.le_next != NFSNOLIST)
3160 LIST_REMOVE(bp, nb_vnbufs);
3161 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
91447636 3162 lck_mtx_unlock(nfs_buf_mutex);
55e303ae
A
3163 } else {
3164 SET(bp->nb_flags, NB_EINTR);
3165 }
1c79356b 3166 } else {
55e303ae 3167 /* either there's an error or we don't need to commit */
1c79356b 3168 if (error) {
55e303ae
A
3169 SET(bp->nb_flags, NB_ERROR);
3170 bp->nb_error = np->n_error = error;
3171 np->n_flag |= NWRITEERR;
91447636
A
3172 /*
3173 * There was a write error and we need to
3174 * invalidate attrs and flush buffers in
3175 * order to sync up with the server.
3176 * (if this write was extending the file,
3177 * we may no longer know the correct size)
3178 *
3179 * But we can't call vinvalbuf while holding
3180 * this buffer busy. Set a flag to do it after
3181 * releasing the buffer.
3182 *
3183 * Note we can only invalidate in this function
3184 * if this is an async write and so the iodone
3185 * below will release the buffer. Also, we
3186 * shouldn't call vinvalbuf from nfsiod because
3187 * that may deadlock waiting for the completion
3188 * of writes that are queued up behind this one.
3189 */
3190 if (ISSET(bp->nb_flags, NB_ASYNC) &&
3191 !ISSET(bp->nb_flags, NB_IOD)) {
3192 invalidate = 1;
3193 } else {
3194 /* invalidate later */
3195 np->n_flag |= NNEEDINVALIDATE;
3196 }
3197 NATTRINVALIDATE(np);
1c79356b 3198 }
55e303ae
A
3199 /* clear the dirty range */
3200 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
1c79356b 3201 }
55e303ae
A
3202 }
3203
3204 if (!error && bp->nb_dirty) {
3205 /* there are pages marked dirty that need to be written out */
91447636 3206 int pg, count, npages, off;
55e303ae 3207
91447636 3208 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
1c79356b 3209
55e303ae
A
3210 NFS_BUF_MAP(bp);
3211
3212 /*
3213 * we do these writes synchronously because we can't really
3214 * support the unstable/needcommit method. We could write
3215 * them unstable, clear the dirty bits, and then commit the
3216 * whole block later, but if we need to rewrite the data, we
3217 * won't have any idea which pages were written because that
3218 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
3219 * also can't leave the dirty bits set because then we wouldn't
3220 * be able to tell if the pages were re-dirtied between the end
3221 * of the write and the commit.
3222 */
3223 iomode = NFSV3WRITE_FILESYNC;
3224 uiop->uio_rw = UIO_WRITE;
3225
3226 SET(bp->nb_flags, NB_WRITEINPROG);
3227 npages = bp->nb_bufsize/PAGE_SIZE;
3228 for (pg=0; pg < npages; pg++) {
3229 if (!NBPGDIRTY(bp,pg))
3230 continue;
91447636
A
3231 count = 1;
3232 while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count))
3233 count++;
3234 /* write count pages starting with page pg */
55e303ae 3235 off = pg * PAGE_SIZE;
91447636 3236 len = count * PAGE_SIZE;
55e303ae
A
3237
3238 /* clip writes to EOF */
91447636 3239 if (NBOFF(bp) + off + len > (off_t)np->n_size)
55e303ae
A
3240 len -= (NBOFF(bp) + off + len) - np->n_size;
3241 if (len > 0) {
91447636
A
3242 io.iov_len = len;
3243 uio_uio_resid_set(uiop, io.iov_len);
55e303ae 3244 uiop->uio_offset = NBOFF(bp) + off;
91447636 3245 io.iov_base = (uintptr_t) bp->nb_data + off;
8f6c56a5 3246 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &bp->nb_verf);
55e303ae
A
3247 if (error)
3248 break;
3249 }
3250 /* clear dirty bits */
91447636 3251 while (count--) {
55e303ae
A
3252 bp->nb_dirty &= ~(1 << pg);
3253 /* leave pg on last page */
91447636 3254 if (count) pg++;
55e303ae 3255 }
fa4905b1 3256 }
55e303ae
A
3257 if (!error) {
3258 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3259 np->n_needcommitcnt--;
3260 CHECK_NEEDCOMMITCNT(np);
3261 }
3262 CLR(bp->nb_flags, NB_NEEDCOMMIT);
fa4905b1 3263 }
55e303ae
A
3264 CLR(bp->nb_flags, NB_WRITEINPROG);
3265 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
fa4905b1 3266 np->n_size);
1c79356b 3267 }
1c79356b 3268
55e303ae
A
3269 if (error) {
3270 SET(bp->nb_flags, NB_ERROR);
3271 bp->nb_error = error;
3272 }
1c79356b 3273 }
1c79356b 3274
55e303ae
A
3275 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
3276
3277 nfs_buf_iodone(bp);
91447636
A
3278
3279 if (invalidate) {
3280 /*
3281 * There was a write error and we need to
3282 * invalidate attrs and flush buffers in
3283 * order to sync up with the server.
3284 * (if this write was extending the file,
3285 * we may no longer know the correct size)
3286 *
3287 * But we couldn't call vinvalbuf while holding
3288 * the buffer busy. So we call vinvalbuf() after
3289 * releasing the buffer.
3290 *
3291 * Note: we don't bother calling nfs_vinvalbuf() if
3292 * there's already a flush in progress.
3293 */
3294 if (!(np->n_flag & NFLUSHINPROG))
3295 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1);
3296 }
3297
1c79356b
A
3298 return (error);
3299}