]> git.saurik.com Git - apple/xnu.git/blob - bsd/nfs/nfs_bio.c
xnu-792.tar.gz
[apple/xnu.git] / bsd / nfs / nfs_bio.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
60 */
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/resourcevar.h>
64 #include <sys/signalvar.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/malloc.h>
68 #include <sys/vnode.h>
69 #include <sys/dirent.h>
70 #include <sys/mount_internal.h>
71 #include <sys/kernel.h>
72 #include <sys/sysctl.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/uio_internal.h>
75
76 #include <sys/vm.h>
77 #include <sys/vmparam.h>
78
79 #include <sys/time.h>
80 #include <kern/clock.h>
81 #include <libkern/OSAtomic.h>
82 #include <kern/kalloc.h>
83
84 #include <nfs/rpcv2.h>
85 #include <nfs/nfsproto.h>
86 #include <nfs/nfs.h>
87 #include <nfs/nfsmount.h>
88 #include <nfs/nfsnode.h>
89 #include <sys/buf_internal.h>
90
91 #include <sys/kdebug.h>
92
/*
 * Trace helpers.  Each one emits a single KERNEL_DEBUG record whose code
 * is FSDBG_CODE(DBG_FSRW, A) or'ed with a function qualifier; the four
 * payload arguments are cast to int and a fifth argument of 0 is passed.
 *
 *   FSDBG      - qualifier DBG_FUNC_NONE
 *   FSDBG_TOP  - qualifier DBG_FUNC_START
 *   FSDBG_BOT  - qualifier DBG_FUNC_END
 */
#define NFS_FSDBG_EMIT(A, QUAL, B, C, D, E) \
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | QUAL, \
		(int)(B), (int)(C), (int)(D), (int)(E), 0)

#define FSDBG(A, B, C, D, E) \
	NFS_FSDBG_EMIT(A, DBG_FUNC_NONE, B, C, D, E)
#define FSDBG_TOP(A, B, C, D, E) \
	NFS_FSDBG_EMIT(A, DBG_FUNC_START, B, C, D, E)
#define FSDBG_BOT(A, B, C, D, E) \
	NFS_FSDBG_EMIT(A, DBG_FUNC_END, B, C, D, E)
102
103 extern int nfs_numasync;
104 extern int nfs_ioddelwri;
105 extern struct nfsstats nfsstats;
106
107 #define NFSBUFHASH(np, lbn) \
108 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
109 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
110 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
111 u_long nfsbufhash;
112 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
113 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
114 int nfs_nbdwrite;
115 time_t nfsbuffreeuptimestamp;
116
117 lck_grp_t *nfs_buf_lck_grp;
118 lck_grp_attr_t *nfs_buf_lck_grp_attr;
119 lck_attr_t *nfs_buf_lck_attr;
120 lck_mtx_t *nfs_buf_mutex;
121
122 #define NFSBUFWRITE_THROTTLE 9
123 #define NFSBUF_LRU_STALE 120
124 #define NFSBUF_META_STALE 240
125
126 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
127 #define LRU_TO_FREEUP 6
128 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
129 #define META_TO_FREEUP 3
130 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
131 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
132 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
133 #define LRU_FREEUP_FRAC_ON_TIMER 8
134 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
135 #define META_FREEUP_FRAC_ON_TIMER 16
136 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
137 #define LRU_FREEUP_MIN_FRAC 4
138 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
139 #define META_FREEUP_MIN_FRAC 2
140
141 #define NFS_BUF_FREEUP() \
142 do { \
143 /* only call nfs_buf_freeup() if it has work to do: */ \
144 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
145 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
146 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
147 nfs_buf_freeup(0); \
148 } while (0)
149
150 /*
151 * Initialize nfsbuf lists
152 */
153 void
154 nfs_nbinit(void)
155 {
156 nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init();
157 lck_grp_attr_setstat(nfs_buf_lck_grp_attr);
158 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr);
159
160 nfs_buf_lck_attr = lck_attr_alloc_init();
161
162 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr);
163
164 nfsbufcnt = nfsbufmetacnt =
165 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
166 nfsbufmin = 128;
167 nfsbufmax = (sane_size >> PAGE_SHIFT) / 4;
168 nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16;
169 nfsneedbuffer = 0;
170 nfs_nbdwrite = 0;
171 nfsbuffreeuptimestamp = 0;
172
173 nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
174 TAILQ_INIT(&nfsbuffree);
175 TAILQ_INIT(&nfsbuffreemeta);
176 TAILQ_INIT(&nfsbufdelwri);
177
178 }
179
180 /*
181 * try to free up some excess, unused nfsbufs
182 */
183 void
184 nfs_buf_freeup(int timer)
185 {
186 struct nfsbuf *fbp;
187 struct timeval now;
188 int count;
189 struct nfsbuffreehead nfsbuffreeup;
190
191 TAILQ_INIT(&nfsbuffreeup);
192
193 lck_mtx_lock(nfs_buf_mutex);
194
195 microuptime(&now);
196 nfsbuffreeuptimestamp = now.tv_sec;
197
198 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
199
200 count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
201 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
202 fbp = TAILQ_FIRST(&nfsbuffree);
203 if (!fbp)
204 break;
205 if (fbp->nb_refs)
206 break;
207 if (NBUFSTAMPVALID(fbp) &&
208 (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
209 break;
210 nfs_buf_remfree(fbp);
211 /* disassociate buffer from any vnode */
212 if (fbp->nb_vp) {
213 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
214 LIST_REMOVE(fbp, nb_vnbufs);
215 fbp->nb_vnbufs.le_next = NFSNOLIST;
216 }
217 fbp->nb_vp = NULL;
218 }
219 LIST_REMOVE(fbp, nb_hash);
220 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
221 nfsbufcnt--;
222 }
223
224 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
225 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
226 fbp = TAILQ_FIRST(&nfsbuffreemeta);
227 if (!fbp)
228 break;
229 if (fbp->nb_refs)
230 break;
231 if (NBUFSTAMPVALID(fbp) &&
232 (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
233 break;
234 nfs_buf_remfree(fbp);
235 /* disassociate buffer from any vnode */
236 if (fbp->nb_vp) {
237 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
238 LIST_REMOVE(fbp, nb_vnbufs);
239 fbp->nb_vnbufs.le_next = NFSNOLIST;
240 }
241 fbp->nb_vp = NULL;
242 }
243 LIST_REMOVE(fbp, nb_hash);
244 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
245 nfsbufcnt--;
246 nfsbufmetacnt--;
247 }
248
249 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
250 NFSBUFCNTCHK(1);
251
252 lck_mtx_unlock(nfs_buf_mutex);
253
254 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
255 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
256 /* nuke any creds */
257 if (fbp->nb_rcred != NOCRED) {
258 kauth_cred_rele(fbp->nb_rcred);
259 fbp->nb_rcred = NOCRED;
260 }
261 if (fbp->nb_wcred != NOCRED) {
262 kauth_cred_rele(fbp->nb_wcred);
263 fbp->nb_wcred = NOCRED;
264 }
265 /* if buf was NB_META, dump buffer */
266 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
267 kfree(fbp->nb_data, fbp->nb_bufsize);
268 FREE(fbp, M_TEMP);
269 }
270
271 }
272
273 /*
274 * remove a buffer from the freelist
275 * (must be called with nfs_buf_mutex held)
276 */
277 void
278 nfs_buf_remfree(struct nfsbuf *bp)
279 {
280 if (bp->nb_free.tqe_next == NFSNOLIST)
281 panic("nfsbuf not on free list");
282 if (ISSET(bp->nb_flags, NB_DELWRI)) {
283 nfsbufdelwricnt--;
284 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
285 } else if (ISSET(bp->nb_flags, NB_META)) {
286 nfsbuffreemetacnt--;
287 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
288 } else {
289 nfsbuffreecnt--;
290 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
291 }
292 bp->nb_free.tqe_next = NFSNOLIST;
293 NFSBUFCNTCHK(1);
294 }
295
296 /*
297 * check for existence of nfsbuf in cache
298 */
299 boolean_t
300 nfs_buf_is_incore(vnode_t vp, daddr64_t blkno)
301 {
302 boolean_t rv;
303 lck_mtx_lock(nfs_buf_mutex);
304 if (nfs_buf_incore(vp, blkno))
305 rv = TRUE;
306 else
307 rv = FALSE;
308 lck_mtx_unlock(nfs_buf_mutex);
309 return (rv);
310 }
311
312 /*
313 * return incore buffer (must be called with nfs_buf_mutex held)
314 */
315 struct nfsbuf *
316 nfs_buf_incore(vnode_t vp, daddr64_t blkno)
317 {
318 /* Search hash chain */
319 struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first;
320 for (; bp != NULL; bp = bp->nb_hash.le_next)
321 if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
322 if (!ISSET(bp->nb_flags, NB_INVAL)) {
323 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
324 return (bp);
325 }
326 }
327 return (NULL);
328 }
329
330 /*
331 * Check if it's OK to drop a page.
332 *
333 * Called by vnode_pager() on pageout request of non-dirty page.
334 * We need to make sure that it's not part of a delayed write.
335 * If it is, we can't let the VM drop it because we may need it
336 * later when/if we need to write the data (again).
337 */
338 int
339 nfs_buf_page_inval(vnode_t vp, off_t offset)
340 {
341 struct nfsbuf *bp;
342 int error = 0;
343
344 lck_mtx_lock(nfs_buf_mutex);
345 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
346 if (!bp)
347 goto out;
348 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
349 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
350 error = EBUSY;
351 goto out;
352 }
353 /*
354 * If there's a dirty range in the buffer, check to
355 * see if this page intersects with the dirty range.
356 * If it does, we can't let the pager drop the page.
357 */
358 if (bp->nb_dirtyend > 0) {
359 int start = offset - NBOFF(bp);
360 if (bp->nb_dirtyend <= start ||
361 bp->nb_dirtyoff >= (start + PAGE_SIZE))
362 error = 0;
363 else
364 error = EBUSY;
365 }
366 out:
367 lck_mtx_unlock(nfs_buf_mutex);
368 return (error);
369 }
370
371 /*
372 * set up the UPL for a buffer
373 * (must NOT be called with nfs_buf_mutex held)
374 */
375 int
376 nfs_buf_upl_setup(struct nfsbuf *bp)
377 {
378 kern_return_t kret;
379 upl_t upl;
380 int upl_flags;
381
382 if (ISSET(bp->nb_flags, NB_PAGELIST))
383 return (0);
384
385 upl_flags = UPL_PRECIOUS;
386 if (! ISSET(bp->nb_flags, NB_READ)) {
387 /*
388 * We're doing a "write", so we intend to modify
389 * the pages we're gathering.
390 */
391 upl_flags |= UPL_WILL_MODIFY;
392 }
393 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
394 &upl, NULL, upl_flags);
395 if (kret == KERN_INVALID_ARGUMENT) {
396 /* vm object probably doesn't exist any more */
397 bp->nb_pagelist = NULL;
398 return (EINVAL);
399 }
400 if (kret != KERN_SUCCESS) {
401 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
402 bp->nb_pagelist = NULL;
403 return (EIO);
404 }
405
406 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
407
408 bp->nb_pagelist = upl;
409 SET(bp->nb_flags, NB_PAGELIST);
410 return (0);
411 }
412
413 /*
414 * update buffer's valid/dirty info from UBC
415 * (must NOT be called with nfs_buf_mutex held)
416 */
417 void
418 nfs_buf_upl_check(struct nfsbuf *bp)
419 {
420 upl_page_info_t *pl;
421 off_t filesize, fileoffset;
422 int i, npages;
423
424 if (!ISSET(bp->nb_flags, NB_PAGELIST))
425 return;
426
427 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
428 filesize = ubc_getsize(bp->nb_vp);
429 fileoffset = NBOFF(bp);
430 if (fileoffset < filesize)
431 SET(bp->nb_flags, NB_CACHE);
432 else
433 CLR(bp->nb_flags, NB_CACHE);
434
435 pl = ubc_upl_pageinfo(bp->nb_pagelist);
436 bp->nb_valid = bp->nb_dirty = 0;
437
438 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
439 /* anything beyond the end of the file is not valid or dirty */
440 if (fileoffset >= filesize)
441 break;
442 if (!upl_valid_page(pl, i)) {
443 CLR(bp->nb_flags, NB_CACHE);
444 continue;
445 }
446 NBPGVALID_SET(bp,i);
447 if (upl_dirty_page(pl, i)) {
448 NBPGDIRTY_SET(bp, i);
449 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
450 SET(bp->nb_flags, NB_WASDIRTY);
451 }
452 }
453 fileoffset = NBOFF(bp);
454 if (ISSET(bp->nb_flags, NB_CACHE)) {
455 bp->nb_validoff = 0;
456 bp->nb_validend = bp->nb_bufsize;
457 if (fileoffset + bp->nb_validend > filesize)
458 bp->nb_validend = filesize - fileoffset;
459 } else {
460 bp->nb_validoff = bp->nb_validend = -1;
461 }
462 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
463 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
464 }
465
466 /*
467 * make sure that a buffer is mapped
468 * (must NOT be called with nfs_buf_mutex held)
469 */
470 static int
471 nfs_buf_map(struct nfsbuf *bp)
472 {
473 kern_return_t kret;
474
475 if (bp->nb_data)
476 return (0);
477 if (!ISSET(bp->nb_flags, NB_PAGELIST))
478 return (EINVAL);
479
480 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
481 if (kret != KERN_SUCCESS)
482 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
483 if (bp->nb_data == 0)
484 panic("ubc_upl_map mapped 0");
485 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
486 return (0);
487 }
488
489 /*
490 * check range of pages in nfsbuf's UPL for validity
491 */
492 static int
493 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
494 {
495 off_t fileoffset, filesize;
496 int pg, lastpg;
497 upl_page_info_t *pl;
498
499 if (!ISSET(bp->nb_flags, NB_PAGELIST))
500 return (0);
501 pl = ubc_upl_pageinfo(bp->nb_pagelist);
502
503 size += off & PAGE_MASK;
504 off &= ~PAGE_MASK;
505 fileoffset = NBOFF(bp);
506 filesize = VTONFS(bp->nb_vp)->n_size;
507 if ((fileoffset + off + size) > filesize)
508 size = filesize - (fileoffset + off);
509
510 pg = off/PAGE_SIZE;
511 lastpg = (off + size - 1)/PAGE_SIZE;
512 while (pg <= lastpg) {
513 if (!upl_valid_page(pl, pg))
514 return (0);
515 pg++;
516 }
517 return (1);
518 }
519
520 /*
521 * normalize an nfsbuf's valid range
522 *
523 * the read/write code guarantees that we'll always have a valid
524 * region that is an integral number of pages. If either end
525 * of the valid range isn't page-aligned, it gets corrected
526 * here as we extend the valid range through all of the
527 * contiguous valid pages.
528 */
529 static void
530 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
531 {
532 int pg, npg;
533 /* pull validoff back to start of contiguous valid page range */
534 pg = bp->nb_validoff/PAGE_SIZE;
535 while (pg >= 0 && NBPGVALID(bp,pg))
536 pg--;
537 bp->nb_validoff = (pg+1) * PAGE_SIZE;
538 /* push validend forward to end of contiguous valid page range */
539 npg = bp->nb_bufsize/PAGE_SIZE;
540 pg = bp->nb_validend/PAGE_SIZE;
541 while (pg < npg && NBPGVALID(bp,pg))
542 pg++;
543 bp->nb_validend = pg * PAGE_SIZE;
544 /* clip to EOF */
545 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
546 bp->nb_validend = np->n_size % bp->nb_bufsize;
547 }
548
549 /*
550 * try to push out some delayed/uncommitted writes
551 * ("locked" indicates whether nfs_buf_mutex is already held)
552 */
553 static void
554 nfs_buf_delwri_push(int locked)
555 {
556 struct nfsbuf *bp;
557 int i, error;
558
559 if (TAILQ_EMPTY(&nfsbufdelwri))
560 return;
561
562 /* first try to tell the nfsiods to do it */
563 if (nfs_asyncio(NULL, NULL) == 0)
564 return;
565
566 /* otherwise, try to do some of the work ourselves */
567 i = 0;
568 if (!locked)
569 lck_mtx_lock(nfs_buf_mutex);
570 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
571 struct nfsnode *np = VTONFS(bp->nb_vp);
572 nfs_buf_remfree(bp);
573 nfs_buf_refget(bp);
574 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
575 nfs_buf_refrele(bp);
576 if (error)
577 break;
578 if (!bp->nb_vp) {
579 /* buffer is no longer valid */
580 nfs_buf_drop(bp);
581 continue;
582 }
583 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
584 /* put buffer at end of delwri list */
585 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
586 nfsbufdelwricnt++;
587 nfs_buf_drop(bp);
588 lck_mtx_unlock(nfs_buf_mutex);
589 nfs_flushcommits(np->n_vnode, NULL, 1);
590 } else {
591 SET(bp->nb_flags, NB_ASYNC);
592 lck_mtx_unlock(nfs_buf_mutex);
593 nfs_buf_write(bp);
594 }
595 i++;
596 lck_mtx_lock(nfs_buf_mutex);
597 }
598 if (!locked)
599 lck_mtx_unlock(nfs_buf_mutex);
600 }
601
602 /*
603 * Get an nfs buffer.
604 *
605 * Returns errno on error, 0 otherwise.
606 * Any buffer is returned in *bpp.
607 *
608 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
609 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
610 *
611 * Check for existence of buffer in cache.
612 * Or attempt to reuse a buffer from one of the free lists.
613 * Or allocate a new buffer if we haven't already hit max allocation.
614 * Or wait for a free buffer.
615 *
616 * If available buffer found, prepare it, and return it.
617 *
618 * If the calling process is interrupted by a signal for
619 * an interruptible mount point, return EINTR.
620 */
621 int
622 nfs_buf_get(
623 vnode_t vp,
624 daddr64_t blkno,
625 int size,
626 proc_t p,
627 int flags,
628 struct nfsbuf **bpp)
629 {
630 struct nfsnode *np = VTONFS(vp);
631 struct nfsbuf *bp;
632 int biosize, bufsize;
633 kauth_cred_t cred;
634 int slpflag = PCATCH;
635 int operation = (flags & NBLK_OPMASK);
636 int error = 0;
637 struct timespec ts;
638
639 FSDBG_TOP(541, vp, blkno, size, flags);
640 *bpp = NULL;
641
642 bufsize = size;
643 if (bufsize > MAXBSIZE)
644 panic("nfs_buf_get: buffer larger than MAXBSIZE requested");
645
646 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
647
648 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) {
649 operation = NBLK_META;
650 } else if (bufsize < biosize) {
651 /* reg files should always have biosize blocks */
652 bufsize = biosize;
653 }
654
655 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
656 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
657 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
658
659 /* poke the delwri list */
660 nfs_buf_delwri_push(0);
661
662 /* sleep to let other threads run... */
663 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
664 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
665 }
666
667 loop:
668 lck_mtx_lock(nfs_buf_mutex);
669
670 /* check for existence of nfsbuf in cache */
671 if ((bp = nfs_buf_incore(vp, blkno))) {
672 /* if busy, set wanted and wait */
673 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
674 if (flags & NBLK_NOWAIT) {
675 lck_mtx_unlock(nfs_buf_mutex);
676 FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc);
677 return (0);
678 }
679 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
680 SET(bp->nb_lflags, NBL_WANTED);
681
682 ts.tv_sec = 2;
683 ts.tv_nsec = 0;
684 msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
685 "nfsbufget", (slpflag == PCATCH) ? 0 : &ts);
686 slpflag = 0;
687 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
688 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
689 FSDBG_BOT(541, vp, blkno, 0, error);
690 return (error);
691 }
692 goto loop;
693 }
694 if (bp->nb_bufsize != bufsize)
695 panic("nfsbuf size mismatch");
696 SET(bp->nb_lflags, NBL_BUSY);
697 SET(bp->nb_flags, NB_CACHE);
698 nfs_buf_remfree(bp);
699 /* additional paranoia: */
700 if (ISSET(bp->nb_flags, NB_PAGELIST))
701 panic("pagelist buffer was not busy");
702 goto buffer_setup;
703 }
704
705 if (flags & NBLK_ONLYVALID) {
706 lck_mtx_unlock(nfs_buf_mutex);
707 FSDBG_BOT(541, vp, blkno, 0, 0x0000cace);
708 return (0);
709 }
710
711 /*
712 * where to get a free buffer:
713 * - if meta and maxmeta reached, must reuse meta
714 * - alloc new if we haven't reached min bufs
715 * - if free lists are NOT empty
716 * - if free list is stale, use it
717 * - else if freemeta list is stale, use it
718 * - else if max bufs allocated, use least-time-to-stale
719 * - alloc new if we haven't reached max allowed
720 * - start clearing out delwri list and try again
721 */
722
723 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
724 /* if we've hit max meta buffers, must reuse a meta buffer */
725 bp = TAILQ_FIRST(&nfsbuffreemeta);
726 } else if ((nfsbufcnt > nfsbufmin) &&
727 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
728 /* try to pull an nfsbuf off a free list */
729 struct nfsbuf *lrubp, *metabp;
730 struct timeval now;
731 microuptime(&now);
732
733 /* if the next LRU or META buffer is invalid or stale, use it */
734 lrubp = TAILQ_FIRST(&nfsbuffree);
735 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
736 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
737 bp = lrubp;
738 metabp = TAILQ_FIRST(&nfsbuffreemeta);
739 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
740 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
741 bp = metabp;
742
743 if (!bp && (nfsbufcnt >= nfsbufmax)) {
744 /* we've already allocated all bufs, so */
745 /* choose the buffer that'll go stale first */
746 if (!metabp)
747 bp = lrubp;
748 else if (!lrubp)
749 bp = metabp;
750 else {
751 int32_t lru_stale_time, meta_stale_time;
752 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
753 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
754 if (lru_stale_time <= meta_stale_time)
755 bp = lrubp;
756 else
757 bp = metabp;
758 }
759 }
760 }
761
762 if (bp) {
763 /* we have a buffer to reuse */
764 FSDBG(544, vp, blkno, bp, bp->nb_flags);
765 nfs_buf_remfree(bp);
766 if (ISSET(bp->nb_flags, NB_DELWRI))
767 panic("nfs_buf_get: delwri");
768 SET(bp->nb_lflags, NBL_BUSY);
769 /* disassociate buffer from previous vnode */
770 if (bp->nb_vp) {
771 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
772 LIST_REMOVE(bp, nb_vnbufs);
773 bp->nb_vnbufs.le_next = NFSNOLIST;
774 }
775 bp->nb_vp = NULL;
776 }
777 LIST_REMOVE(bp, nb_hash);
778 /* nuke any creds we're holding */
779 cred = bp->nb_rcred;
780 if (cred != NOCRED) {
781 bp->nb_rcred = NOCRED;
782 kauth_cred_rele(cred);
783 }
784 cred = bp->nb_wcred;
785 if (cred != NOCRED) {
786 bp->nb_wcred = NOCRED;
787 kauth_cred_rele(cred);
788 }
789 /* if buf will no longer be NB_META, dump old buffer */
790 if (operation == NBLK_META) {
791 if (!ISSET(bp->nb_flags, NB_META))
792 nfsbufmetacnt++;
793 } else if (ISSET(bp->nb_flags, NB_META)) {
794 if (bp->nb_data) {
795 kfree(bp->nb_data, bp->nb_bufsize);
796 bp->nb_data = NULL;
797 }
798 nfsbufmetacnt--;
799 }
800 /* re-init buf fields */
801 bp->nb_error = 0;
802 bp->nb_validoff = bp->nb_validend = -1;
803 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
804 bp->nb_valid = 0;
805 bp->nb_dirty = 0;
806 } else {
807 /* no buffer to reuse */
808 if ((nfsbufcnt < nfsbufmax) &&
809 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
810 /* just alloc a new one */
811 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
812 if (!bp) {
813 lck_mtx_unlock(nfs_buf_mutex);
814 FSDBG_BOT(541, vp, blkno, 0, error);
815 return (ENOMEM);
816 }
817 nfsbufcnt++;
818 if (operation == NBLK_META)
819 nfsbufmetacnt++;
820 NFSBUFCNTCHK(1);
821 /* init nfsbuf */
822 bzero(bp, sizeof(*bp));
823 bp->nb_free.tqe_next = NFSNOLIST;
824 bp->nb_validoff = bp->nb_validend = -1;
825 FSDBG(545, vp, blkno, bp, 0);
826 } else {
827 /* too many bufs... wait for buffers to free up */
828 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
829
830 /* poke the delwri list */
831 nfs_buf_delwri_push(1);
832
833 nfsneedbuffer = 1;
834 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP,
835 "nfsbufget", 0);
836 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
837 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
838 FSDBG_BOT(541, vp, blkno, 0, error);
839 return (error);
840 }
841 goto loop;
842 }
843 }
844
845 /* setup nfsbuf */
846 bp->nb_lflags = NBL_BUSY;
847 bp->nb_flags = 0;
848 bp->nb_lblkno = blkno;
849 /* insert buf in hash */
850 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
851 /* associate buffer with new vnode */
852 bp->nb_vp = vp;
853 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
854
855 buffer_setup:
856
857 /* unlock hash */
858 lck_mtx_unlock(nfs_buf_mutex);
859
860 switch (operation) {
861 case NBLK_META:
862 SET(bp->nb_flags, NB_META);
863 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
864 kfree(bp->nb_data, bp->nb_bufsize);
865 bp->nb_data = NULL;
866 bp->nb_validoff = bp->nb_validend = -1;
867 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
868 bp->nb_valid = 0;
869 bp->nb_dirty = 0;
870 CLR(bp->nb_flags, NB_CACHE);
871 }
872 if (!bp->nb_data)
873 bp->nb_data = kalloc(bufsize);
874 if (!bp->nb_data) {
875 /* Ack! couldn't allocate the data buffer! */
876 /* cleanup buffer and return error */
877 lck_mtx_lock(nfs_buf_mutex);
878 LIST_REMOVE(bp, nb_vnbufs);
879 bp->nb_vnbufs.le_next = NFSNOLIST;
880 bp->nb_vp = NULL;
881 /* invalidate usage timestamp to allow immediate freeing */
882 NBUFSTAMPINVALIDATE(bp);
883 if (bp->nb_free.tqe_next != NFSNOLIST)
884 panic("nfsbuf on freelist");
885 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
886 nfsbuffreecnt++;
887 lck_mtx_unlock(nfs_buf_mutex);
888 FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM);
889 return (ENOMEM);
890 }
891 bp->nb_bufsize = bufsize;
892 break;
893
894 case NBLK_READ:
895 case NBLK_WRITE:
896 /*
897 * Set or clear NB_READ now to let the UPL subsystem know
898 * if we intend to modify the pages or not.
899 */
900 if (operation == NBLK_READ) {
901 SET(bp->nb_flags, NB_READ);
902 } else {
903 CLR(bp->nb_flags, NB_READ);
904 }
905 if (bufsize < PAGE_SIZE)
906 bufsize = PAGE_SIZE;
907 bp->nb_bufsize = bufsize;
908 bp->nb_validoff = bp->nb_validend = -1;
909
910 if (UBCINFOEXISTS(vp)) {
911 /* setup upl */
912 if (nfs_buf_upl_setup(bp)) {
913 /* unable to create upl */
914 /* vm object must no longer exist */
915 /* cleanup buffer and return error */
916 lck_mtx_lock(nfs_buf_mutex);
917 LIST_REMOVE(bp, nb_vnbufs);
918 bp->nb_vnbufs.le_next = NFSNOLIST;
919 bp->nb_vp = NULL;
920 /* invalidate usage timestamp to allow immediate freeing */
921 NBUFSTAMPINVALIDATE(bp);
922 if (bp->nb_free.tqe_next != NFSNOLIST)
923 panic("nfsbuf on freelist");
924 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
925 nfsbuffreecnt++;
926 lck_mtx_unlock(nfs_buf_mutex);
927 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
928 return (EIO);
929 }
930 nfs_buf_upl_check(bp);
931 }
932 break;
933
934 default:
935 panic("nfs_buf_get: %d unknown operation", operation);
936 }
937
938 *bpp = bp;
939
940 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
941
942 return (0);
943 }
944
945 void
946 nfs_buf_release(struct nfsbuf *bp, int freeup)
947 {
948 vnode_t vp = bp->nb_vp;
949 struct timeval now;
950 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
951
952 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
953 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
954 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
955
956 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
957 int upl_flags;
958 upl_t upl;
959 int i, rv;
960
961 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
962 rv = nfs_buf_upl_setup(bp);
963 if (rv)
964 printf("nfs_buf_release: upl create failed %d\n", rv);
965 else
966 nfs_buf_upl_check(bp);
967 }
968 upl = bp->nb_pagelist;
969 if (!upl)
970 goto pagelist_cleanup_done;
971 if (bp->nb_data) {
972 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
973 panic("ubc_upl_unmap failed");
974 bp->nb_data = NULL;
975 }
976 if (bp->nb_flags & (NB_ERROR | NB_INVAL | NB_NOCACHE)) {
977 if (bp->nb_flags & (NB_READ | NB_INVAL))
978 upl_flags = UPL_ABORT_DUMP_PAGES;
979 else
980 upl_flags = 0;
981 ubc_upl_abort(upl, upl_flags);
982 goto pagelist_cleanup_done;
983 }
984 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
985 if (!NBPGVALID(bp,i))
986 ubc_upl_abort_range(upl,
987 i*PAGE_SIZE, PAGE_SIZE,
988 UPL_ABORT_DUMP_PAGES |
989 UPL_ABORT_FREE_ON_EMPTY);
990 else {
991 if (NBPGDIRTY(bp,i))
992 upl_flags = UPL_COMMIT_SET_DIRTY;
993 else
994 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
995 ubc_upl_commit_range(upl,
996 i*PAGE_SIZE, PAGE_SIZE,
997 upl_flags |
998 UPL_COMMIT_INACTIVATE |
999 UPL_COMMIT_FREE_ON_EMPTY);
1000 }
1001 }
1002 pagelist_cleanup_done:
1003 /* was this the last buffer in the file? */
1004 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) {
1005 /* if so, invalidate all pages of last buffer past EOF */
1006 int biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
1007 off_t start, end;
1008 start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
1009 end = trunc_page_64(NBOFF(bp) + biosize);
1010 if (end > start) {
1011 if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
1012 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1013 }
1014 }
1015 CLR(bp->nb_flags, NB_PAGELIST);
1016 bp->nb_pagelist = NULL;
1017 }
1018
1019 lck_mtx_lock(nfs_buf_mutex);
1020
1021 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1022
1023 /* Wake up any processes waiting for any buffer to become free. */
1024 if (nfsneedbuffer) {
1025 nfsneedbuffer = 0;
1026 wakeup_needbuffer = 1;
1027 }
1028 /* Wake up any processes waiting for _this_ buffer to become free. */
1029 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1030 CLR(bp->nb_lflags, NBL_WANTED);
1031 wakeup_buffer = 1;
1032 }
1033
1034 /* If it's not cacheable, or an error, mark it invalid. */
1035 if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
1036 SET(bp->nb_flags, NB_INVAL);
1037
1038 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1039 /* If it's invalid or empty, dissociate it from its vnode */
1040 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1041 LIST_REMOVE(bp, nb_vnbufs);
1042 bp->nb_vnbufs.le_next = NFSNOLIST;
1043 }
1044 bp->nb_vp = NULL;
1045 /* if this was a delayed write, wakeup anyone */
1046 /* waiting for delayed writes to complete */
1047 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1048 CLR(bp->nb_flags, NB_DELWRI);
1049 OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite);
1050 NFSBUFCNTCHK(1);
1051 wakeup_nbdwrite = 1;
1052 }
1053 /* invalidate usage timestamp to allow immediate freeing */
1054 NBUFSTAMPINVALIDATE(bp);
1055 /* put buffer at head of free list */
1056 if (bp->nb_free.tqe_next != NFSNOLIST)
1057 panic("nfsbuf on freelist");
1058 SET(bp->nb_flags, NB_INVAL);
1059 if (ISSET(bp->nb_flags, NB_META)) {
1060 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1061 nfsbuffreemetacnt++;
1062 } else {
1063 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1064 nfsbuffreecnt++;
1065 }
1066 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1067 /* put buffer at end of delwri list */
1068 if (bp->nb_free.tqe_next != NFSNOLIST)
1069 panic("nfsbuf on freelist");
1070 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1071 nfsbufdelwricnt++;
1072 freeup = 0;
1073 } else {
1074 /* update usage timestamp */
1075 microuptime(&now);
1076 bp->nb_timestamp = now.tv_sec;
1077 /* put buffer at end of free list */
1078 if (bp->nb_free.tqe_next != NFSNOLIST)
1079 panic("nfsbuf on freelist");
1080 if (ISSET(bp->nb_flags, NB_META)) {
1081 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1082 nfsbuffreemetacnt++;
1083 } else {
1084 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1085 nfsbuffreecnt++;
1086 }
1087 }
1088
1089 NFSBUFCNTCHK(1);
1090
1091 /* Unlock the buffer. */
1092 CLR(bp->nb_flags, (NB_ASYNC | NB_NOCACHE | NB_STABLE | NB_IOD));
1093 CLR(bp->nb_lflags, NBL_BUSY);
1094
1095 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1096
1097 lck_mtx_unlock(nfs_buf_mutex);
1098
1099 if (wakeup_needbuffer)
1100 wakeup(&nfsneedbuffer);
1101 if (wakeup_buffer)
1102 wakeup(bp);
1103 if (wakeup_nbdwrite)
1104 wakeup(&nfs_nbdwrite);
1105 if (freeup)
1106 NFS_BUF_FREEUP();
1107 }
1108
1109 /*
1110 * Wait for operations on the buffer to complete.
1111 * When they do, extract and return the I/O's error value.
1112 */
1113 int
1114 nfs_buf_iowait(struct nfsbuf *bp)
1115 {
1116 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1117
1118 lck_mtx_lock(nfs_buf_mutex);
1119
1120 while (!ISSET(bp->nb_flags, NB_DONE))
1121 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0);
1122
1123 lck_mtx_unlock(nfs_buf_mutex);
1124
1125 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1126
1127 /* check for interruption of I/O, then errors. */
1128 if (ISSET(bp->nb_flags, NB_EINTR)) {
1129 CLR(bp->nb_flags, NB_EINTR);
1130 return (EINTR);
1131 } else if (ISSET(bp->nb_flags, NB_ERROR))
1132 return (bp->nb_error ? bp->nb_error : EIO);
1133 return (0);
1134 }
1135
1136 /*
1137 * Mark I/O complete on a buffer.
1138 */
1139 void
1140 nfs_buf_iodone(struct nfsbuf *bp)
1141 {
1142
1143 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1144
1145 if (ISSET(bp->nb_flags, NB_DONE))
1146 panic("nfs_buf_iodone already");
1147 /*
1148 * I/O was done, so don't believe
1149 * the DIRTY state from VM anymore
1150 */
1151 CLR(bp->nb_flags, NB_WASDIRTY);
1152
1153 if (!ISSET(bp->nb_flags, NB_READ)) {
1154 CLR(bp->nb_flags, NB_WRITEINPROG);
1155 /*
1156 * vnode_writedone() takes care of waking up
1157 * any throttled write operations
1158 */
1159 vnode_writedone(bp->nb_vp);
1160 }
1161 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1162 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1163 nfs_buf_release(bp, 1);
1164 } else { /* or just wakeup the buffer */
1165 lck_mtx_lock(nfs_buf_mutex);
1166 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1167 CLR(bp->nb_lflags, NBL_WANTED);
1168 lck_mtx_unlock(nfs_buf_mutex);
1169 wakeup(bp);
1170 }
1171
1172 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1173 }
1174
1175 void
1176 nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p)
1177 {
1178 vnode_t vp = bp->nb_vp;
1179
1180 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1181 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1182
1183 /*
1184 * If the block hasn't been seen before:
1185 * (1) Mark it as having been seen,
1186 * (2) Charge for the write.
1187 * (3) Make sure it's on its vnode's correct block list,
1188 */
1189 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1190 SET(bp->nb_flags, NB_DELWRI);
1191 if (p && p->p_stats)
1192 p->p_stats->p_ru.ru_oublock++; /* XXX */
1193 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
1194 NFSBUFCNTCHK(0);
1195 /* move to dirty list */
1196 lck_mtx_lock(nfs_buf_mutex);
1197 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1198 LIST_REMOVE(bp, nb_vnbufs);
1199 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
1200 lck_mtx_unlock(nfs_buf_mutex);
1201 }
1202
1203 /*
1204 * If the vnode has "too many" write operations in progress
1205 * wait for them to finish the IO
1206 */
1207 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1208
1209 /*
1210 * If we have too many delayed write buffers,
1211 * more than we can "safely" handle, just fall back to
1212 * doing the async write
1213 */
1214 if (nfs_nbdwrite < 0)
1215 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1216
1217 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
1218 /* issue async write */
1219 SET(bp->nb_flags, NB_ASYNC);
1220 nfs_buf_write(bp);
1221 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1222 return;
1223 }
1224
1225 /* Otherwise, the "write" is done, so mark and release the buffer. */
1226 SET(bp->nb_flags, NB_DONE);
1227 nfs_buf_release(bp, 1);
1228 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1229 return;
1230 }
1231
1232 /*
1233 * add a reference to a buffer so it doesn't disappear while being used
1234 * (must be called with nfs_buf_mutex held)
1235 */
1236 void
1237 nfs_buf_refget(struct nfsbuf *bp)
1238 {
1239 bp->nb_refs++;
1240 }
1241 /*
1242 * release a reference on a buffer
1243 * (must be called with nfs_buf_mutex held)
1244 */
1245 void
1246 nfs_buf_refrele(struct nfsbuf *bp)
1247 {
1248 bp->nb_refs--;
1249 }
1250
1251 /*
1252 * mark a particular buffer as BUSY
1253 * (must be called with nfs_buf_mutex held)
1254 */
1255 errno_t
1256 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1257 {
1258 errno_t error;
1259 struct timespec ts;
1260
1261 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1262 /*
1263 * since the mutex_lock may block, the buffer
1264 * may become BUSY, so we need to recheck for
1265 * a NOWAIT request
1266 */
1267 if (flags & NBAC_NOWAIT)
1268 return (EBUSY);
1269 SET(bp->nb_lflags, NBL_WANTED);
1270
1271 ts.tv_sec = (slptimeo/100);
1272 /* the hz value is 100; which leads to 10ms */
1273 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1274
1275 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1276 "nfs_buf_acquire", &ts);
1277 if (error)
1278 return (error);
1279 return (EAGAIN);
1280 }
1281 if (flags & NBAC_REMOVE)
1282 nfs_buf_remfree(bp);
1283 SET(bp->nb_lflags, NBL_BUSY);
1284
1285 return (0);
1286 }
1287
1288 /*
1289 * simply drop the BUSY status of a buffer
1290 * (must be called with nfs_buf_mutex held)
1291 */
1292 void
1293 nfs_buf_drop(struct nfsbuf *bp)
1294 {
1295 int need_wakeup = 0;
1296
1297 if (!ISSET(bp->nb_lflags, NBL_BUSY))
1298 panic("nfs_buf_drop: buffer not busy!");
1299 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1300 /*
1301 * delay the actual wakeup until after we
1302 * clear NBL_BUSY and we've dropped nfs_buf_mutex
1303 */
1304 need_wakeup = 1;
1305 }
1306 /* Unlock the buffer. */
1307 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1308
1309 if (need_wakeup)
1310 wakeup(bp);
1311 }
1312
1313 /*
1314 * prepare for iterating over an nfsnode's buffer list
1315 * this lock protects the queue manipulation
1316 * (must be called with nfs_buf_mutex held)
1317 */
1318 int
1319 nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1320 {
1321 struct nfsbuflists *listheadp;
1322
1323 if (flags & NBI_DIRTY)
1324 listheadp = &np->n_dirtyblkhd;
1325 else
1326 listheadp = &np->n_cleanblkhd;
1327
1328 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1329 LIST_INIT(iterheadp);
1330 return(EWOULDBLOCK);
1331 }
1332
1333 while (np->n_bufiterflags & NBI_ITER) {
1334 np->n_bufiterflags |= NBI_ITERWANT;
1335 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0);
1336 }
1337 if (LIST_EMPTY(listheadp)) {
1338 LIST_INIT(iterheadp);
1339 return(EINVAL);
1340 }
1341 np->n_bufiterflags |= NBI_ITER;
1342
1343 iterheadp->lh_first = listheadp->lh_first;
1344 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1345 LIST_INIT(listheadp);
1346
1347 return(0);
1348 }
1349
1350 /*
1351 * cleanup after iterating over an nfsnode's buffer list
1352 * this lock protects the queue manipulation
1353 * (must be called with nfs_buf_mutex held)
1354 */
1355 void
1356 nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1357 {
1358 struct nfsbuflists * listheadp;
1359 struct nfsbuf *bp;
1360
1361 if (flags & NBI_DIRTY)
1362 listheadp = &np->n_dirtyblkhd;
1363 else
1364 listheadp = &np->n_cleanblkhd;
1365
1366 while (!LIST_EMPTY(iterheadp)) {
1367 bp = LIST_FIRST(iterheadp);
1368 LIST_REMOVE(bp, nb_vnbufs);
1369 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1370 }
1371
1372 np->n_bufiterflags &= ~NBI_ITER;
1373 if (np->n_bufiterflags & NBI_ITERWANT) {
1374 np->n_bufiterflags &= ~NBI_ITERWANT;
1375 wakeup(&np->n_bufiterflags);
1376 }
1377 }
1378
1379
1380 /*
1381 * Vnode op for read using bio
1382 * Any similarity to readip() is purely coincidental
1383 */
1384 int
1385 nfs_bioread(
1386 vnode_t vp,
1387 struct uio *uio,
1388 __unused int ioflag,
1389 kauth_cred_t cred,
1390 proc_t p)
1391 {
1392 struct nfsnode *np = VTONFS(vp);
1393 int biosize;
1394 off_t diff;
1395 struct nfsbuf *bp = NULL, *rabp;
1396 struct nfs_vattr nvattr;
1397 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1398 daddr64_t lbn, rabn, lastrabn = -1, tlbn;
1399 int bufsize;
1400 int nra, error = 0, n = 0, on = 0;
1401 caddr_t dp;
1402 struct dirent *direntp = NULL;
1403 enum vtype vtype;
1404 int nocachereadahead = 0;
1405
1406 FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1407
1408 #if DIAGNOSTIC
1409 if (uio->uio_rw != UIO_READ)
1410 panic("nfs_read mode");
1411 #endif
1412 if (uio_uio_resid(uio) == 0) {
1413 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1414 return (0);
1415 }
1416 if (uio->uio_offset < 0) {
1417 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1418 return (EINVAL);
1419 }
1420
1421 if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
1422 nfs_fsinfo(nmp, vp, cred, p);
1423 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
1424 vtype = vnode_vtype(vp);
1425 /*
1426 * For nfs, cache consistency can only be maintained approximately.
1427 * Although RFC1094 does not specify the criteria, the following is
1428 * believed to be compatible with the reference port.
1429 * For nfs:
1430 * If the file's modify time on the server has changed since the
1431 * last read rpc or you have written to the file,
1432 * you may have lost data cache consistency with the
1433 * server, so flush all of the file's data out of the cache.
1434 * Then force a getattr rpc to ensure that you have up to date
1435 * attributes.
1436 * NB: This implies that cache data can be read when up to
1437 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1438 * current attributes this could be forced by setting calling
1439 * NATTRINVALIDATE() before the nfs_getattr() call.
1440 */
1441 if (np->n_flag & NNEEDINVALIDATE) {
1442 np->n_flag &= ~NNEEDINVALIDATE;
1443 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1444 }
1445 if (np->n_flag & NMODIFIED) {
1446 if (vtype != VREG) {
1447 if (vtype != VDIR)
1448 panic("nfs: bioread, not dir");
1449 nfs_invaldir(vp);
1450 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1451 if (error) {
1452 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1453 return (error);
1454 }
1455 }
1456 NATTRINVALIDATE(np);
1457 error = nfs_getattr(vp, &nvattr, cred, p);
1458 if (error) {
1459 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1460 return (error);
1461 }
1462 if (vtype == VDIR) {
1463 /* if directory changed, purge any name cache entries */
1464 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1465 cache_purge(vp);
1466 np->n_ncmtime = nvattr.nva_mtime;
1467 }
1468 np->n_mtime = nvattr.nva_mtime;
1469 } else {
1470 error = nfs_getattr(vp, &nvattr, cred, p);
1471 if (error) {
1472 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1473 return (error);
1474 }
1475 if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) {
1476 if (vtype == VDIR) {
1477 nfs_invaldir(vp);
1478 /* purge name cache entries */
1479 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1480 cache_purge(vp);
1481 }
1482 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1483 if (error) {
1484 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1485 return (error);
1486 }
1487 if (vtype == VDIR)
1488 np->n_ncmtime = nvattr.nva_mtime;
1489 np->n_mtime = nvattr.nva_mtime;
1490 }
1491 }
1492
1493 if (vnode_isnocache(vp)) {
1494 if (!(np->n_flag & NNOCACHE)) {
1495 if (NVALIDBUFS(np)) {
1496 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1497 if (error) {
1498 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1499 return (error);
1500 }
1501 }
1502 np->n_flag |= NNOCACHE;
1503 }
1504 } else if (np->n_flag & NNOCACHE) {
1505 np->n_flag &= ~NNOCACHE;
1506 }
1507
1508 do {
1509 if (np->n_flag & NNOCACHE) {
1510 switch (vtype) {
1511 case VREG:
1512 /*
1513 * If we have only a block or so to read,
1514 * just do the rpc directly.
1515 * If we have a couple blocks or more to read,
1516 * then we'll take advantage of readahead within
1517 * this loop to try to fetch all the data in parallel
1518 */
1519 if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) {
1520 error = nfs_readrpc(vp, uio, cred, p);
1521 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1522 return (error);
1523 }
1524 nocachereadahead = 1;
1525 break;
1526 case VLNK:
1527 error = nfs_readlinkrpc(vp, uio, cred, p);
1528 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1529 return (error);
1530 case VDIR:
1531 break;
1532 default:
1533 printf(" NFSNOCACHE: type %x unexpected\n", vtype);
1534 };
1535 }
1536 switch (vtype) {
1537 case VREG:
1538 lbn = uio->uio_offset / biosize;
1539
1540 /*
1541 * Copy directly from any cached pages without grabbing the bufs.
1542 *
1543 * Note: for "nocache" reads, we don't copy directly from UBC
1544 * because any cached pages will be for readahead buffers that
1545 * need to be invalidated anyway before we finish this request.
1546 */
1547 if (!(np->n_flag & NNOCACHE) &&
1548 (uio->uio_segflg == UIO_USERSPACE32 ||
1549 uio->uio_segflg == UIO_USERSPACE64 ||
1550 uio->uio_segflg == UIO_USERSPACE)) {
1551 // LP64todo - fix this!
1552 int io_resid = uio_uio_resid(uio);
1553 diff = np->n_size - uio->uio_offset;
1554 if (diff < io_resid)
1555 io_resid = diff;
1556 if (io_resid > 0) {
1557 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1558 if (error) {
1559 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1560 return (error);
1561 }
1562 }
1563 /* count any biocache reads that we just copied directly */
1564 if (lbn != uio->uio_offset / biosize) {
1565 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
1566 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1567 }
1568 }
1569
1570 lbn = uio->uio_offset / biosize;
1571 on = uio->uio_offset % biosize;
1572
1573 /*
1574 * Start the read ahead(s), as required.
1575 */
1576 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1577 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1578 rabn = lbn + 1 + nra;
1579 if (rabn <= lastrabn) {
1580 /* we've already (tried to) read this block */
1581 /* no need to try it again... */
1582 continue;
1583 }
1584 lastrabn = rabn;
1585 if ((off_t)rabn * biosize >= (off_t)np->n_size)
1586 break;
1587 if ((np->n_flag & NNOCACHE) &&
1588 (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio))))
1589 /* for uncached readahead, don't go beyond end of request */
1590 break;
1591 /* check if block exists and is valid. */
1592 error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1593 if (error) {
1594 FSDBG_BOT(514, vp, 0xd1e000b, 1, error);
1595 return (error);
1596 }
1597 if (!rabp)
1598 continue;
1599 if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) {
1600 nfs_buf_release(rabp, 1);
1601 continue;
1602 }
1603 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1604 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1605 if (nfs_asyncio(rabp, cred)) {
1606 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1607 rabp->nb_error = EIO;
1608 nfs_buf_release(rabp, 1);
1609 }
1610 } else
1611 nfs_buf_release(rabp, 1);
1612 }
1613 }
1614
1615 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
1616 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
1617 return (0);
1618 }
1619
1620 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
1621
1622 /*
1623 * If the block is in the cache and has the required data
1624 * in a valid region, just copy it out.
1625 * Otherwise, get the block and write back/read in,
1626 * as required.
1627 */
1628 again:
1629 bufsize = biosize;
1630 // LP64todo - fix this!
1631 n = min((unsigned)(bufsize - on), uio_uio_resid(uio));
1632 diff = np->n_size - uio->uio_offset;
1633 if (diff < n)
1634 n = diff;
1635
1636 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp);
1637 if (error) {
1638 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1639 return (EINTR);
1640 }
1641
1642 /* if any pages are valid... */
1643 if (bp->nb_valid) {
1644 /* ...check for any invalid pages in the read range */
1645 int pg, firstpg, lastpg, dirtypg;
1646 dirtypg = firstpg = lastpg = -1;
1647 pg = on/PAGE_SIZE;
1648 while (pg <= (on + n - 1)/PAGE_SIZE) {
1649 if (!NBPGVALID(bp,pg)) {
1650 if (firstpg < 0)
1651 firstpg = pg;
1652 lastpg = pg;
1653 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1654 dirtypg = pg;
1655 pg++;
1656 }
1657
1658 /* if there are no invalid pages, we're all set */
1659 if (firstpg < 0) {
1660 if (bp->nb_validoff < 0) {
1661 /* valid range isn't set up, so */
1662 /* set it to what we know is valid */
1663 bp->nb_validoff = trunc_page(on);
1664 bp->nb_validend = round_page(on+n);
1665 nfs_buf_normalize_valid_range(np, bp);
1666 }
1667 goto buffer_ready;
1668 }
1669
1670 /* there are invalid pages in the read range */
1671 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1672 /* there are also dirty page(s) in the range, */
1673 /* so write the buffer out and try again */
1674 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1675 SET(bp->nb_flags, NB_ASYNC);
1676 if (bp->nb_wcred == NOCRED) {
1677 kauth_cred_ref(cred);
1678 bp->nb_wcred = cred;
1679 }
1680 error = nfs_buf_write(bp);
1681 if (error) {
1682 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1683 return (error);
1684 }
1685 goto again;
1686 }
1687 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1688 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1689 /* we need to read in more than half the buffer and the */
1690 /* buffer's not dirty, so just fetch the whole buffer */
1691 bp->nb_valid = 0;
1692 } else {
1693 /* read the page range in */
1694 uio_t auio;
1695 char uio_buf[ UIO_SIZEOF(1) ];
1696
1697 NFS_BUF_MAP(bp);
1698 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
1699 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
1700 if (!auio) {
1701 error = ENOMEM;
1702 } else {
1703 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
1704 ((lastpg - firstpg + 1) * PAGE_SIZE));
1705 error = nfs_readrpc(vp, auio, cred, p);
1706 }
1707 if (error) {
1708 if (np->n_flag & NNOCACHE)
1709 SET(bp->nb_flags, NB_NOCACHE);
1710 nfs_buf_release(bp, 1);
1711 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1712 return (error);
1713 }
1714 /* Make sure that the valid range is set to cover this read. */
1715 bp->nb_validoff = trunc_page_32(on);
1716 bp->nb_validend = round_page_32(on+n);
1717 nfs_buf_normalize_valid_range(np, bp);
1718 if (uio_resid(auio) > 0) {
1719 /* if short read, must have hit EOF, */
1720 /* so zero the rest of the range */
1721 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
1722 }
1723 /* mark the pages (successfully read) as valid */
1724 for (pg=firstpg; pg <= lastpg; pg++)
1725 NBPGVALID_SET(bp,pg);
1726 }
1727 }
1728 /* if no pages are valid, read the whole block */
1729 if (!bp->nb_valid) {
1730 SET(bp->nb_flags, NB_READ);
1731 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1732 error = nfs_doio(bp, cred, p);
1733 if (error) {
1734 if (np->n_flag & NNOCACHE)
1735 SET(bp->nb_flags, NB_NOCACHE);
1736 nfs_buf_release(bp, 1);
1737 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1738 return (error);
1739 }
1740 }
1741 buffer_ready:
1742 /* validate read range against valid range and clip */
1743 if (bp->nb_validend > 0) {
1744 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1745 if (diff < n)
1746 n = diff;
1747 }
1748 if (n > 0)
1749 NFS_BUF_MAP(bp);
1750 break;
1751 case VLNK:
1752 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks);
1753 error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp);
1754 if (error) {
1755 FSDBG_BOT(514, vp, 0xd1e0010, 0, error);
1756 return (error);
1757 }
1758 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1759 SET(bp->nb_flags, NB_READ);
1760 error = nfs_doio(bp, cred, p);
1761 if (error) {
1762 SET(bp->nb_flags, NB_ERROR);
1763 nfs_buf_release(bp, 1);
1764 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1765 return (error);
1766 }
1767 }
1768 // LP64todo - fix this!
1769 n = min(uio_uio_resid(uio), bp->nb_validend);
1770 on = 0;
1771 break;
1772 case VDIR:
1773 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
1774 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1775 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1776 return (0);
1777 }
1778 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1779 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1780 error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1781 if (error) {
1782 FSDBG_BOT(514, vp, 0xd1e0012, 0, error);
1783 return (error);
1784 }
1785 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1786 SET(bp->nb_flags, NB_READ);
1787 error = nfs_doio(bp, cred, p);
1788 if (error) {
1789 nfs_buf_release(bp, 1);
1790 }
1791 while (error == NFSERR_BAD_COOKIE) {
1792 nfs_invaldir(vp);
1793 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1794 /*
1795 * Yuck! The directory has been modified on the
1796 * server. The only way to get the block is by
1797 * reading from the beginning to get all the
1798 * offset cookies.
1799 */
1800 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
1801 if (np->n_direofoffset
1802 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1803 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1804 return (0);
1805 }
1806 error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1807 if (error) {
1808 FSDBG_BOT(514, vp, 0xd1e0013, 0, error);
1809 return (error);
1810 }
1811 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1812 SET(bp->nb_flags, NB_READ);
1813 error = nfs_doio(bp, cred, p);
1814 /*
1815 * no error + NB_INVAL == directory EOF,
1816 * use the block.
1817 */
1818 if (error == 0 && (bp->nb_flags & NB_INVAL))
1819 break;
1820 }
1821 /*
1822 * An error will throw away the block and the
1823 * for loop will break out. If no error and this
1824 * is not the block we want, we throw away the
1825 * block and go for the next one via the for loop.
1826 */
1827 if (error || tlbn < lbn)
1828 nfs_buf_release(bp, 1);
1829 }
1830 }
1831 /*
1832 * The above while is repeated if we hit another cookie
1833 * error. If we hit an error and it wasn't a cookie error,
1834 * we give up.
1835 */
1836 if (error) {
1837 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1838 return (error);
1839 }
1840 }
1841
1842 /*
1843 * If not eof and read aheads are enabled, start one.
1844 * (You need the current block first, so that you have the
1845 * directory offset cookie of the next block.)
1846 */
1847 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1848 (np->n_direofoffset == 0 ||
1849 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1850 !nfs_buf_is_incore(vp, lbn + 1)) {
1851 error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1852 if (error) {
1853 FSDBG_BOT(514, vp, 0xd1e0015, 0, error);
1854 return (error);
1855 }
1856 if (rabp) {
1857 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1858 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1859 if (nfs_asyncio(rabp, cred)) {
1860 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1861 rabp->nb_error = EIO;
1862 nfs_buf_release(rabp, 1);
1863 }
1864 } else {
1865 nfs_buf_release(rabp, 1);
1866 }
1867 }
1868 }
1869 /*
1870 * Make sure we use a signed variant of min() since
1871 * the second term may be negative.
1872 */
1873 // LP64todo - fix this!
1874 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
1875 /*
1876 * We keep track of the directory eof in
1877 * np->n_direofoffset and chop it off as an
1878 * extra step right here.
1879 */
1880 if (np->n_direofoffset &&
1881 n > np->n_direofoffset - uio->uio_offset)
1882 n = np->n_direofoffset - uio->uio_offset;
1883 /*
1884 * Make sure that we return an integral number of entries so
1885 * that any subsequent calls will start copying from the start
1886 * of the next entry.
1887 *
1888 * If the current value of n has the last entry cut short,
1889 * set n to copy everything up to the last entry instead.
1890 */
1891 if (n > 0) {
1892 dp = bp->nb_data + on;
1893 while (dp < (bp->nb_data + on + n)) {
1894 direntp = (struct dirent *)dp;
1895 dp += direntp->d_reclen;
1896 }
1897 if (dp > (bp->nb_data + on + n))
1898 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1899 }
1900 break;
1901 default:
1902 printf("nfs_bioread: type %x unexpected\n", vtype);
1903 FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL);
1904 return (EINVAL);
1905 };
1906
1907 if (n > 0) {
1908 error = uiomove(bp->nb_data + on, (int)n, uio);
1909 }
1910 switch (vtype) {
1911 case VREG:
1912 if (np->n_flag & NNOCACHE)
1913 SET(bp->nb_flags, NB_NOCACHE);
1914 break;
1915 case VLNK:
1916 n = 0;
1917 break;
1918 case VDIR:
1919 break;
1920 default:
1921 break;
1922 }
1923 nfs_buf_release(bp, 1);
1924 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
1925 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1926 return (error);
1927 }
1928
1929
1930 /*
1931 * Vnode op for write using bio
1932 */
1933 int
1934 nfs_write(ap)
1935 struct vnop_write_args /* {
1936 struct vnodeop_desc *a_desc;
1937 vnode_t a_vp;
1938 struct uio *a_uio;
1939 int a_ioflag;
1940 vfs_context_t a_context;
1941 } */ *ap;
1942 {
1943 struct uio *uio = ap->a_uio;
1944 vnode_t vp = ap->a_vp;
1945 struct nfsnode *np = VTONFS(vp);
1946 proc_t p;
1947 kauth_cred_t cred;
1948 int ioflag = ap->a_ioflag;
1949 struct nfsbuf *bp;
1950 struct nfs_vattr nvattr;
1951 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1952 daddr64_t lbn;
1953 int biosize, bufsize;
1954 int n, on, error = 0;
1955 off_t boff, start, end, cureof;
1956 struct iovec_32 iov;
1957 struct uio auio;
1958
1959 FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1960
1961 #if DIAGNOSTIC
1962 if (uio->uio_rw != UIO_WRITE)
1963 panic("nfs_write mode");
1964 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
1965 panic("nfs_write proc");
1966 #endif
1967
1968 p = vfs_context_proc(ap->a_context);
1969 cred = vfs_context_ucred(ap->a_context);
1970
1971 if (vnode_vtype(vp) != VREG)
1972 return (EIO);
1973
1974 np->n_flag |= NWRBUSY;
1975
1976 if (np->n_flag & NNEEDINVALIDATE) {
1977 np->n_flag &= ~NNEEDINVALIDATE;
1978 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1979 }
1980 if (np->n_flag & NWRITEERR) {
1981 np->n_flag &= ~(NWRITEERR | NWRBUSY);
1982 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error);
1983 return (np->n_error);
1984 }
1985 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1986 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1987 (void)nfs_fsinfo(nmp, vp, cred, p);
1988 if (ioflag & (IO_APPEND | IO_SYNC)) {
1989 if (np->n_flag & NMODIFIED) {
1990 NATTRINVALIDATE(np);
1991 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1992 if (error) {
1993 np->n_flag &= ~NWRBUSY;
1994 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
1995 return (error);
1996 }
1997 }
1998 if (ioflag & IO_APPEND) {
1999 NATTRINVALIDATE(np);
2000 error = nfs_getattr(vp, &nvattr, cred, p);
2001 if (error) {
2002 np->n_flag &= ~NWRBUSY;
2003 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
2004 return (error);
2005 }
2006 uio->uio_offset = np->n_size;
2007 }
2008 }
2009 if (uio->uio_offset < 0) {
2010 np->n_flag &= ~NWRBUSY;
2011 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
2012 return (EINVAL);
2013 }
2014 if (uio_uio_resid(uio) == 0) {
2015 np->n_flag &= ~NWRBUSY;
2016 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2017 return (0);
2018 }
2019
2020 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
2021
2022 if (vnode_isnocache(vp)) {
2023 if (!(np->n_flag & NNOCACHE)) {
2024 if (NVALIDBUFS(np)) {
2025 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2026 if (error) {
2027 np->n_flag &= ~NWRBUSY;
2028 FSDBG_BOT(515, vp, 0, 0, error);
2029 return (error);
2030 }
2031 }
2032 np->n_flag |= NNOCACHE;
2033 }
2034 } else if (np->n_flag & NNOCACHE) {
2035 np->n_flag &= ~NNOCACHE;
2036 }
2037
2038 do {
2039 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes);
2040 lbn = uio->uio_offset / biosize;
2041 on = uio->uio_offset % biosize;
2042 // LP64todo - fix this
2043 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
2044 again:
2045 bufsize = biosize;
2046 /*
2047 * Get a cache block for writing. The range to be written is
2048 * (off..off+n) within the block. We ensure that the block
2049 * either has no dirty region or that the given range is
2050 * contiguous with the existing dirty region.
2051 */
2052 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp);
2053 if (error) {
2054 np->n_flag &= ~NWRBUSY;
2055 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2056 return (error);
2057 }
2058 /* map the block because we know we're going to write to it */
2059 NFS_BUF_MAP(bp);
2060
2061 if (np->n_flag & NNOCACHE)
2062 SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE));
2063
2064 if (bp->nb_wcred == NOCRED) {
2065 kauth_cred_ref(cred);
2066 bp->nb_wcred = cred;
2067 }
2068
2069 /*
2070 * If there's already a dirty range AND dirty pages in this block we
2071 * need to send a commit AND write the dirty pages before continuing.
2072 *
2073 * If there's already a dirty range OR dirty pages in this block
2074 * and the new write range is not contiguous with the existing range,
2075 * then force the buffer to be written out now.
2076 * (We used to just extend the dirty range to cover the valid,
2077 * but unwritten, data in between also. But writing ranges
2078 * of data that weren't actually written by an application
2079 * risks overwriting some other client's data with stale data
2080 * that's just masquerading as new written data.)
2081 */
2082 if (bp->nb_dirtyend > 0) {
2083 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
2084 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
2085 /* write/commit buffer "synchronously" */
2086 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2087 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2088 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2089 error = nfs_buf_write(bp);
2090 if (error) {
2091 np->n_flag &= ~NWRBUSY;
2092 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2093 return (error);
2094 }
2095 goto again;
2096 }
2097 } else if (bp->nb_dirty) {
2098 int firstpg, lastpg;
2099 u_int32_t pagemask;
2100 /* calculate write range pagemask */
2101 firstpg = on/PAGE_SIZE;
2102 lastpg = (on+n-1)/PAGE_SIZE;
2103 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
2104 /* check if there are dirty pages outside the write range */
2105 if (bp->nb_dirty & ~pagemask) {
2106 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
2107 /* write/commit buffer "synchronously" */
2108 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2109 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2110 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2111 error = nfs_buf_write(bp);
2112 if (error) {
2113 np->n_flag &= ~NWRBUSY;
2114 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2115 return (error);
2116 }
2117 goto again;
2118 }
2119 /* if the first or last pages are already dirty */
2120 /* make sure that the dirty range encompasses those pages */
2121 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
2122 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
2123 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
2124 if (NBPGDIRTY(bp,lastpg)) {
2125 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
2126 /* clip to EOF */
2127 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2128 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2129 } else
2130 bp->nb_dirtyend = on+n;
2131 }
2132 }
2133
2134 /*
2135 * Are we extending the size of the file with this write?
2136 * If so, update file size now that we have the block.
2137 * If there was a partial buf at the old eof, validate
2138 * and zero the new bytes.
2139 */
2140 cureof = (off_t)np->n_size;
2141 if (uio->uio_offset + n > (off_t)np->n_size) {
2142 struct nfsbuf *eofbp = NULL;
2143 daddr64_t eofbn = np->n_size / biosize;
2144 int eofoff = np->n_size % biosize;
2145 int neweofoff = (uio->uio_offset + n) % biosize;
2146
2147 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
2148
2149 if (eofoff && (eofbn < lbn)) {
2150 error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp);
2151 if (error) {
2152 np->n_flag &= ~NWRBUSY;
2153 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2154 return (error);
2155 }
2156 }
2157
2158 /* if we're extending within the same last block */
2159 /* and the block is flagged as being cached... */
2160 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2161 /* ...check that all pages in buffer are valid */
2162 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2163 u_int32_t pagemask;
2164 /* pagemask only has to extend to last page being written to */
2165 pagemask = (1 << (endpg+1)) - 1;
2166 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2167 if ((bp->nb_valid & pagemask) != pagemask) {
2168 /* zerofill any hole */
2169 if (on > bp->nb_validend) {
2170 int i;
2171 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2172 NBPGVALID_SET(bp, i);
2173 NFS_BUF_MAP(bp);
2174 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2175 bzero((char *)bp->nb_data + bp->nb_validend,
2176 on - bp->nb_validend);
2177 }
2178 /* zerofill any trailing data in the last page */
2179 if (neweofoff) {
2180 NFS_BUF_MAP(bp);
2181 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2182 bzero((char *)bp->nb_data + neweofoff,
2183 PAGE_SIZE - (neweofoff & PAGE_MASK));
2184 }
2185 }
2186 }
2187 np->n_flag |= NMODIFIED;
2188 np->n_size = uio->uio_offset + n;
2189 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
2190 if (eofbp) {
2191 /*
2192 * We may need to zero any previously invalid data
2193 * after the old EOF in the previous EOF buffer.
2194 *
2195 * For the old last page, don't zero bytes if there
2196 * are invalid bytes in that page (i.e. the page isn't
2197 * currently valid).
2198 * For pages after the old last page, zero them and
2199 * mark them as valid.
2200 */
2201 char *d;
2202 int i;
2203 if (np->n_flag & NNOCACHE)
2204 SET(eofbp->nb_flags, (NB_NOCACHE|NB_STABLE));
2205 NFS_BUF_MAP(eofbp);
2206 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2207 d = eofbp->nb_data;
2208 i = eofoff/PAGE_SIZE;
2209 while (eofoff < biosize) {
2210 int poff = eofoff & PAGE_MASK;
2211 if (!poff || NBPGVALID(eofbp,i)) {
2212 bzero(d + eofoff, PAGE_SIZE - poff);
2213 NBPGVALID_SET(eofbp, i);
2214 }
2215 if (bp->nb_validend == eofoff)
2216 bp->nb_validend += PAGE_SIZE - poff;
2217 eofoff += PAGE_SIZE - poff;
2218 i++;
2219 }
2220 nfs_buf_release(eofbp, 1);
2221 }
2222 }
2223 /*
2224 * If dirtyend exceeds file size, chop it down. This should
2225 * not occur unless there is a race.
2226 */
2227 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2228 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2229 /*
2230 * UBC doesn't handle partial pages, so we need to make sure
2231 * that any pages left in the page cache are completely valid.
2232 *
2233 * Writes that are smaller than a block are delayed if they
2234 * don't extend to the end of the block.
2235 *
2236 * If the block isn't (completely) cached, we may need to read
2237 * in some parts of pages that aren't covered by the write.
2238 * If the write offset (on) isn't page aligned, we'll need to
2239 * read the start of the first page being written to. Likewise,
2240 * if the offset of the end of the write (on+n) isn't page aligned,
2241 * we'll need to read the end of the last page being written to.
2242 *
2243 * Notes:
2244 * We don't want to read anything we're just going to write over.
2245 * We don't want to issue multiple I/Os if we don't have to
2246 * (because they're synchronous rpcs).
2247 * We don't want to read anything we already have modified in the
2248 * page cache.
2249 */
2250 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
2251 int firstpg, lastpg, dirtypg;
2252 int firstpgoff, lastpgoff;
2253 start = end = -1;
2254 firstpg = on/PAGE_SIZE;
2255 firstpgoff = on & PAGE_MASK;
2256 lastpg = (on+n-1)/PAGE_SIZE;
2257 lastpgoff = (on+n) & PAGE_MASK;
2258 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2259 /* need to read start of first page */
2260 start = firstpg * PAGE_SIZE;
2261 end = start + firstpgoff;
2262 }
2263 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2264 /* need to read end of last page */
2265 if (start < 0)
2266 start = (lastpg * PAGE_SIZE) + lastpgoff;
2267 end = (lastpg + 1) * PAGE_SIZE;
2268 }
2269 if (end > start) {
2270 /* need to read the data in range: start...end-1 */
2271
2272 /* first, check for dirty pages in between */
2273 /* if there are, we'll have to do two reads because */
2274 /* we don't want to overwrite the dirty pages. */
2275 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2276 if (NBPGDIRTY(bp,dirtypg))
2277 break;
2278
2279 /* if start is at beginning of page, try */
2280 /* to get any preceeding pages as well. */
2281 if (!(start & PAGE_MASK)) {
2282 /* stop at next dirty/valid page or start of block */
2283 for (; start > 0; start-=PAGE_SIZE)
2284 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2285 break;
2286 }
2287
2288 NFS_BUF_MAP(bp);
2289 /* setup uio for read(s) */
2290 boff = NBOFF(bp);
2291 auio.uio_iovs.iov32p = &iov;
2292 auio.uio_iovcnt = 1;
2293 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2294 auio.uio_segflg = UIO_SYSSPACE;
2295 #else
2296 auio.uio_segflg = UIO_SYSSPACE32;
2297 #endif
2298 auio.uio_rw = UIO_READ;
2299
2300 if (dirtypg <= (end-1)/PAGE_SIZE) {
2301 /* there's a dirty page in the way, so just do two reads */
2302 /* we'll read the preceding data here */
2303 auio.uio_offset = boff + start;
2304 iov.iov_len = on - start;
2305 uio_uio_resid_set(&auio, iov.iov_len);
2306 iov.iov_base = (uintptr_t) bp->nb_data + start;
2307 error = nfs_readrpc(vp, &auio, cred, p);
2308 if (error) {
2309 bp->nb_error = error;
2310 SET(bp->nb_flags, NB_ERROR);
2311 printf("nfs_write: readrpc %d", error);
2312 }
2313 if (uio_uio_resid(&auio) > 0) {
2314 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01);
2315 // LP64todo - fix this
2316 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2317 }
2318 /* update validoff/validend if necessary */
2319 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2320 bp->nb_validoff = start;
2321 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2322 bp->nb_validend = on;
2323 if ((off_t)np->n_size > boff + bp->nb_validend)
2324 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2325 /* validate any pages before the write offset */
2326 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2327 NBPGVALID_SET(bp, start/PAGE_SIZE);
2328 /* adjust start to read any trailing data */
2329 start = on+n;
2330 }
2331
2332 /* if end is at end of page, try to */
2333 /* get any following pages as well. */
2334 if (!(end & PAGE_MASK)) {
2335 /* stop at next valid page or end of block */
2336 for (; end < bufsize; end+=PAGE_SIZE)
2337 if (NBPGVALID(bp,end/PAGE_SIZE))
2338 break;
2339 }
2340
2341 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2342 /*
2343 * Either this entire read is beyond the current EOF
2344 * or the range that we won't be modifying (on+n...end)
2345 * is all beyond the current EOF.
2346 * No need to make a trip across the network to
2347 * read nothing. So, just zero the buffer instead.
2348 */
2349 FSDBG(516, bp, start, end - start, 0xd00dee00);
2350 bzero(bp->nb_data + start, end - start);
2351 } else {
2352 /* now we'll read the (rest of the) data */
2353 auio.uio_offset = boff + start;
2354 iov.iov_len = end - start;
2355 uio_uio_resid_set(&auio, iov.iov_len);
2356 iov.iov_base = (uintptr_t) (bp->nb_data + start);
2357 error = nfs_readrpc(vp, &auio, cred, p);
2358 if (error) {
2359 bp->nb_error = error;
2360 SET(bp->nb_flags, NB_ERROR);
2361 printf("nfs_write: readrpc %d", error);
2362 }
2363 if (uio_uio_resid(&auio) > 0) {
2364 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02);
2365 // LP64todo - fix this
2366 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2367 }
2368 }
2369 /* update validoff/validend if necessary */
2370 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2371 bp->nb_validoff = start;
2372 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2373 bp->nb_validend = end;
2374 if ((off_t)np->n_size > boff + bp->nb_validend)
2375 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2376 /* validate any pages before the write offset's page */
2377 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2378 NBPGVALID_SET(bp, start/PAGE_SIZE);
2379 /* validate any pages after the range of pages being written to */
2380 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2381 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2382 /* Note: pages being written to will be validated when written */
2383 }
2384 }
2385
2386 if (ISSET(bp->nb_flags, NB_ERROR)) {
2387 error = bp->nb_error;
2388 nfs_buf_release(bp, 1);
2389 np->n_flag &= ~NWRBUSY;
2390 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2391 return (error);
2392 }
2393
2394 np->n_flag |= NMODIFIED;
2395
2396 NFS_BUF_MAP(bp);
2397 error = uiomove((char *)bp->nb_data + on, n, uio);
2398 if (error) {
2399 SET(bp->nb_flags, NB_ERROR);
2400 nfs_buf_release(bp, 1);
2401 np->n_flag &= ~NWRBUSY;
2402 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2403 return (error);
2404 }
2405
2406 /* validate any pages written to */
2407 start = on & ~PAGE_MASK;
2408 for (; start < on+n; start += PAGE_SIZE) {
2409 NBPGVALID_SET(bp, start/PAGE_SIZE);
2410 /*
2411 * This may seem a little weird, but we don't actually set the
2412 * dirty bits for writes. This is because we keep the dirty range
2413 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2414 * delayed writes, when we give the pages back to the VM we don't
2415 * want to keep them marked dirty, because when we later write the
2416 * buffer we won't be able to tell which pages were written dirty
2417 * and which pages were mmapped and dirtied.
2418 */
2419 }
2420 if (bp->nb_dirtyend > 0) {
2421 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2422 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2423 } else {
2424 bp->nb_dirtyoff = on;
2425 bp->nb_dirtyend = on + n;
2426 }
2427 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2428 bp->nb_validoff > bp->nb_dirtyend) {
2429 bp->nb_validoff = bp->nb_dirtyoff;
2430 bp->nb_validend = bp->nb_dirtyend;
2431 } else {
2432 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2433 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2434 }
2435 if (!ISSET(bp->nb_flags, NB_CACHE))
2436 nfs_buf_normalize_valid_range(np, bp);
2437
2438 /*
2439 * Since this block is being modified, it must be written
2440 * again and not just committed.
2441 */
2442 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2443 np->n_needcommitcnt--;
2444 CHECK_NEEDCOMMITCNT(np);
2445 }
2446 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2447
2448 if (ioflag & IO_SYNC) {
2449 bp->nb_proc = p;
2450 error = nfs_buf_write(bp);
2451 if (error) {
2452 np->n_flag &= ~NWRBUSY;
2453 FSDBG_BOT(515, vp, uio->uio_offset,
2454 uio_uio_resid(uio), error);
2455 return (error);
2456 }
2457 } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) {
2458 bp->nb_proc = NULL;
2459 SET(bp->nb_flags, NB_ASYNC);
2460 nfs_buf_write(bp);
2461 } else
2462 nfs_buf_write_delayed(bp, p);
2463
2464 if (np->n_needcommitcnt > (nfsbufcnt/16))
2465 nfs_flushcommits(vp, p, 1);
2466
2467 } while (uio_uio_resid(uio) > 0 && n > 0);
2468
2469 np->n_flag &= ~NWRBUSY;
2470 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2471 return (0);
2472 }
2473
2474 /*
2475 * Flush out and invalidate all buffers associated with a vnode.
2476 * Called with the underlying object locked.
2477 */
2478 static int
2479 nfs_vinvalbuf_internal(
2480 vnode_t vp,
2481 int flags,
2482 kauth_cred_t cred,
2483 proc_t p,
2484 int slpflag,
2485 int slptimeo)
2486 {
2487 struct nfsbuf *bp;
2488 struct nfsbuflists blist;
2489 int list, error = 0;
2490 struct nfsnode *np = VTONFS(vp);
2491
2492 if (flags & V_SAVE) {
2493 if ((error = nfs_flush(vp, MNT_WAIT, cred, p,
2494 (flags & V_IGNORE_WRITEERR))))
2495 return (error);
2496 if (!LIST_EMPTY(&np->n_dirtyblkhd))
2497 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2498 vp, LIST_FIRST(&np->n_dirtyblkhd));
2499 }
2500
2501 lck_mtx_lock(nfs_buf_mutex);
2502 for (;;) {
2503 list = NBI_CLEAN;
2504 if (nfs_buf_iterprepare(np, &blist, list)) {
2505 list = NBI_DIRTY;
2506 if (nfs_buf_iterprepare(np, &blist, list))
2507 break;
2508 }
2509 while ((bp = LIST_FIRST(&blist))) {
2510 LIST_REMOVE(bp, nb_vnbufs);
2511 if (list == NBI_CLEAN)
2512 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2513 else
2514 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2515 nfs_buf_refget(bp);
2516 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2517 FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags);
2518 if (error != EAGAIN) {
2519 FSDBG(554, vp, bp, -1, error);
2520 nfs_buf_refrele(bp);
2521 nfs_buf_itercomplete(np, &blist, list);
2522 lck_mtx_unlock(nfs_buf_mutex);
2523 return (error);
2524 }
2525 }
2526 nfs_buf_refrele(bp);
2527 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2528 lck_mtx_unlock(nfs_buf_mutex);
2529 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp &&
2530 (NBOFF(bp) < (off_t)np->n_size)) {
2531 /* XXX extra paranoia: make sure we're not */
2532 /* somehow leaving any dirty data around */
2533 int mustwrite = 0;
2534 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
2535 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
2536 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2537 error = nfs_buf_upl_setup(bp);
2538 if (error == EINVAL) {
2539 /* vm object must no longer exist */
2540 /* hopefully we don't need to do */
2541 /* anything for this buffer */
2542 } else if (error)
2543 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
2544 bp->nb_valid = bp->nb_dirty = 0;
2545 }
2546 nfs_buf_upl_check(bp);
2547 /* check for any dirty data before the EOF */
2548 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2549 /* clip dirty range to EOF */
2550 if (bp->nb_dirtyend > end)
2551 bp->nb_dirtyend = end;
2552 mustwrite++;
2553 }
2554 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2555 /* also make sure we'll have a credential to do the write */
2556 if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) {
2557 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
2558 mustwrite = 0;
2559 }
2560 if (mustwrite) {
2561 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2562 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2563 panic("nfs_vinvalbuf: dirty buffer without upl");
2564 /* gotta write out dirty data before invalidating */
2565 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2566 /* (NB_NOCACHE indicates buffer should be discarded) */
2567 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2568 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2569 if (bp->nb_wcred == NOCRED) {
2570 kauth_cred_ref(cred);
2571 bp->nb_wcred = cred;
2572 }
2573 error = nfs_buf_write(bp);
2574 // Note: bp has been released
2575 if (error) {
2576 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2577 np->n_error = error;
2578 np->n_flag |= NWRITEERR;
2579 /*
2580 * There was a write error and we need to
2581 * invalidate attrs to sync with server.
2582 * (if this write was extending the file,
2583 * we may no longer know the correct size)
2584 */
2585 NATTRINVALIDATE(np);
2586 error = 0;
2587 }
2588 lck_mtx_lock(nfs_buf_mutex);
2589 continue;
2590 }
2591 }
2592 SET(bp->nb_flags, NB_INVAL);
2593 // hold off on FREEUPs until we're done here
2594 nfs_buf_release(bp, 0);
2595 lck_mtx_lock(nfs_buf_mutex);
2596 }
2597 nfs_buf_itercomplete(np, &blist, list);
2598 }
2599 lck_mtx_unlock(nfs_buf_mutex);
2600 NFS_BUF_FREEUP();
2601 if (NVALIDBUFS(np))
2602 panic("nfs_vinvalbuf: flush failed");
2603 return (0);
2604 }
2605
2606
2607 /*
2608 * Flush and invalidate all dirty buffers. If another process is already
2609 * doing the flush, just wait for completion.
2610 */
2611 int
2612 nfs_vinvalbuf(
2613 vnode_t vp,
2614 int flags,
2615 kauth_cred_t cred,
2616 proc_t p,
2617 int intrflg)
2618 {
2619 struct nfsnode *np = VTONFS(vp);
2620 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
2621 int error = 0, slpflag, slptimeo;
2622 off_t size;
2623
2624 FSDBG_TOP(554, vp, flags, intrflg, 0);
2625
2626 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2627 intrflg = 0;
2628 if (intrflg) {
2629 slpflag = PCATCH;
2630 slptimeo = 2 * hz;
2631 } else {
2632 slpflag = 0;
2633 slptimeo = 0;
2634 }
2635 /*
2636 * First wait for any other process doing a flush to complete.
2637 */
2638 while (np->n_flag & NFLUSHINPROG) {
2639 np->n_flag |= NFLUSHWANT;
2640 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2641 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2642 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2643 if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
2644 FSDBG_BOT(554, vp, flags, intrflg, error);
2645 return (error);
2646 }
2647 }
2648
2649 /*
2650 * Now, flush as required.
2651 */
2652 np->n_flag |= NFLUSHINPROG;
2653 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2654 while (error) {
2655 FSDBG(554, vp, 0, 0, error);
2656 error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p);
2657 if (error) {
2658 np->n_flag &= ~NFLUSHINPROG;
2659 if (np->n_flag & NFLUSHWANT) {
2660 np->n_flag &= ~NFLUSHWANT;
2661 wakeup((caddr_t)&np->n_flag);
2662 }
2663 FSDBG_BOT(554, vp, flags, intrflg, error);
2664 return (error);
2665 }
2666 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2667 }
2668 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2669 if (np->n_flag & NFLUSHWANT) {
2670 np->n_flag &= ~NFLUSHWANT;
2671 wakeup((caddr_t)&np->n_flag);
2672 }
2673 /*
2674 * get the pages out of vm also
2675 */
2676 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
2677 int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE);
2678 if (!rv)
2679 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
2680 }
2681
2682 FSDBG_BOT(554, vp, flags, intrflg, 0);
2683 return (0);
2684 }
2685
2686 /*
2687 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2688 * This is mainly to avoid queueing async I/O requests when the nfsiods
2689 * are all hung on a dead server.
2690 */
2691 int
2692 nfs_asyncio(bp, cred)
2693 struct nfsbuf *bp;
2694 kauth_cred_t cred;
2695 {
2696 struct nfsmount *nmp;
2697 int i;
2698 int gotiod;
2699 int slpflag = 0;
2700 int slptimeo = 0;
2701 int error, error2;
2702 void *wakeme = NULL;
2703 struct timespec ts;
2704
2705 if (nfs_numasync == 0)
2706 return (EIO);
2707
2708 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2709
2710 nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL);
2711 again:
2712 if (nmp && nmp->nm_flag & NFSMNT_INT)
2713 slpflag = PCATCH;
2714 gotiod = FALSE;
2715
2716 lck_mtx_lock(nfs_iod_mutex);
2717
2718 /* no nfsbuf means tell nfsiod to process delwri list */
2719 if (!bp)
2720 nfs_ioddelwri = 1;
2721
2722 /*
2723 * Find a free iod to process this request.
2724 */
2725 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2726 if (nfs_iodwant[i]) {
2727 /*
2728 * Found one, so wake it up and tell it which
2729 * mount to process.
2730 */
2731 nfs_iodwant[i] = NULL;
2732 nfs_iodmount[i] = nmp;
2733 if (nmp)
2734 nmp->nm_bufqiods++;
2735 wakeme = &nfs_iodwant[i];
2736 gotiod = TRUE;
2737 break;
2738 }
2739
2740 /* if we're just poking the delwri list, we're done */
2741 if (!bp) {
2742 lck_mtx_unlock(nfs_iod_mutex);
2743 if (wakeme)
2744 wakeup(wakeme);
2745 FSDBG_BOT(552, bp, 0x10101010, wakeme, 0);
2746 return (0);
2747 }
2748
2749 /*
2750 * If none are free, we may already have an iod working on this mount
2751 * point. If so, it will process our request.
2752 */
2753 if (!gotiod) {
2754 if (nmp->nm_bufqiods > 0) {
2755 gotiod = TRUE;
2756 }
2757 }
2758
2759 /*
2760 * If we have an iod which can process the request, then queue
2761 * the buffer.
2762 */
2763 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2764 if (gotiod) {
2765 /*
2766 * Ensure that the queue never grows too large.
2767 */
2768 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2769 if (ISSET(bp->nb_flags, NB_IOD)) {
2770 /* An nfsiod is attempting this async operation so */
2771 /* we must not fall asleep on the bufq because we */
2772 /* could be waiting on ourself. Just return error */
2773 /* and we'll do this operation syncrhonously. */
2774 goto out;
2775 }
2776 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2777 nmp->nm_bufqwant = TRUE;
2778
2779 ts.tv_sec = (slptimeo/100);
2780 /* the hz value is 100; which leads to 10ms */
2781 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
2782
2783 error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO,
2784 "nfsaio", &ts);
2785 if (error) {
2786 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2787 if (error2) {
2788 lck_mtx_unlock(nfs_iod_mutex);
2789 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2790 return (error2);
2791 }
2792 if (slpflag == PCATCH) {
2793 slpflag = 0;
2794 slptimeo = 2 * hz;
2795 }
2796 }
2797 /*
2798 * We might have lost our iod while sleeping,
2799 * so check and loop if nescessary.
2800 */
2801 if (nmp->nm_bufqiods == 0) {
2802 lck_mtx_unlock(nfs_iod_mutex);
2803 goto again;
2804 }
2805 }
2806
2807 if (ISSET(bp->nb_flags, NB_READ)) {
2808 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2809 kauth_cred_ref(cred);
2810 bp->nb_rcred = cred;
2811 }
2812 } else {
2813 SET(bp->nb_flags, NB_WRITEINPROG);
2814 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2815 kauth_cred_ref(cred);
2816 bp->nb_wcred = cred;
2817 }
2818 }
2819
2820 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2821 nmp->nm_bufqlen++;
2822 lck_mtx_unlock(nfs_iod_mutex);
2823 if (wakeme)
2824 wakeup(wakeme);
2825 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2826 return (0);
2827 }
2828
2829 out:
2830 lck_mtx_unlock(nfs_iod_mutex);
2831 /*
2832 * All the iods are busy on other mounts, so return EIO to
2833 * force the caller to process the i/o synchronously.
2834 */
2835 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2836 return (EIO);
2837 }
2838
2839 /*
2840 * Do an I/O operation to/from a cache block. This may be called
2841 * synchronously or from an nfsiod.
2842 */
2843 int
2844 nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p)
2845 {
2846 struct uio *uiop;
2847 vnode_t vp;
2848 struct nfsnode *np;
2849 struct nfsmount *nmp;
2850 int error = 0, diff, len, iomode, must_commit = 0, invalidate = 0;
2851 struct uio uio;
2852 struct iovec_32 io;
2853 enum vtype vtype;
2854
2855 vp = bp->nb_vp;
2856 vtype = vnode_vtype(vp);
2857 np = VTONFS(vp);
2858 nmp = VFSTONFS(vnode_mount(vp));
2859 uiop = &uio;
2860 uiop->uio_iovs.iov32p = &io;
2861 uiop->uio_iovcnt = 1;
2862 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2863 uiop->uio_segflg = UIO_SYSSPACE;
2864 #else
2865 uiop->uio_segflg = UIO_SYSSPACE32;
2866 #endif
2867
2868 /*
2869 * we've decided to perform I/O for this block,
2870 * so we couldn't possibly NB_DONE. So, clear it.
2871 */
2872 if (ISSET(bp->nb_flags, NB_DONE)) {
2873 if (!ISSET(bp->nb_flags, NB_ASYNC))
2874 panic("nfs_doio: done and not async");
2875 CLR(bp->nb_flags, NB_DONE);
2876 }
2877 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2878 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2879 bp->nb_dirtyend);
2880
2881 if (ISSET(bp->nb_flags, NB_READ)) {
2882 if (vtype == VREG)
2883 NFS_BUF_MAP(bp);
2884 io.iov_len = bp->nb_bufsize;
2885 uio_uio_resid_set(uiop, io.iov_len);
2886 io.iov_base = (uintptr_t) bp->nb_data;
2887 uiop->uio_rw = UIO_READ;
2888 switch (vtype) {
2889 case VREG:
2890 uiop->uio_offset = NBOFF(bp);
2891 OSAddAtomic(1, (SInt32*)&nfsstats.read_bios);
2892 error = nfs_readrpc(vp, uiop, cr, p);
2893 FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error);
2894 if (!error) {
2895 /* update valid range */
2896 bp->nb_validoff = 0;
2897 if (uio_uio_resid(uiop) != 0) {
2898 /*
2899 * If len > 0, there is a hole in the file and
2900 * no writes after the hole have been pushed to
2901 * the server yet.
2902 * Just zero fill the rest of the valid area.
2903 */
2904 // LP64todo - fix this
2905 diff = bp->nb_bufsize - uio_uio_resid(uiop);
2906 len = np->n_size - (NBOFF(bp) + diff);
2907 if (len > 0) {
2908 // LP64todo - fix this
2909 len = min(len, uio_uio_resid(uiop));
2910 bzero((char *)bp->nb_data + diff, len);
2911 bp->nb_validend = diff + len;
2912 FSDBG(258, diff, len, 0, 1);
2913 } else
2914 bp->nb_validend = diff;
2915 } else
2916 bp->nb_validend = bp->nb_bufsize;
2917 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2918 if (bp->nb_validend & PAGE_MASK) {
2919 /* valid range ends in the middle of a page so we */
2920 /* need to zero-fill any invalid data at the end */
2921 /* of the last page */
2922 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2923 bp->nb_bufsize - bp->nb_validend);
2924 FSDBG(258, bp->nb_validend,
2925 bp->nb_bufsize - bp->nb_validend, 0, 2);
2926 }
2927 }
2928 break;
2929 case VLNK:
2930 uiop->uio_offset = (off_t)0;
2931 OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios);
2932 error = nfs_readlinkrpc(vp, uiop, cr, p);
2933 if (!error) {
2934 bp->nb_validoff = 0;
2935 bp->nb_validend = uiop->uio_offset;
2936 }
2937 break;
2938 case VDIR:
2939 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
2940 uiop->uio_offset = NBOFF(bp);
2941 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2942 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2943 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2944 error = nfs_readdirplusrpc(vp, uiop, cr, p);
2945 if (error == NFSERR_NOTSUPP)
2946 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2947 }
2948 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2949 error = nfs_readdirrpc(vp, uiop, cr, p);
2950 if (!error) {
2951 bp->nb_validoff = 0;
2952 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2953 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2954 }
2955 break;
2956 default:
2957 printf("nfs_doio: type %x unexpected\n", vtype);
2958 break;
2959 };
2960 if (error) {
2961 SET(bp->nb_flags, NB_ERROR);
2962 bp->nb_error = error;
2963 }
2964
2965 } else {
2966 /* we're doing a write */
2967 int doff, dend = 0;
2968
2969 /* We need to make sure the pages are locked before doing I/O. */
2970 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) {
2971 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2972 error = nfs_buf_upl_setup(bp);
2973 if (error) {
2974 printf("nfs_doio: upl create failed %d\n", error);
2975 SET(bp->nb_flags, NB_ERROR);
2976 bp->nb_error = EIO;
2977 return (EIO);
2978 }
2979 nfs_buf_upl_check(bp);
2980 }
2981 }
2982
2983 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2984 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2985 /*
2986 * There are pages marked dirty that need to be written out.
2987 *
2988 * We don't want to just combine the write range with the
2989 * range of pages that are dirty because that could cause us
2990 * to write data that wasn't actually written to.
2991 * We also don't want to write data more than once.
2992 *
2993 * If the dirty range just needs to be committed, we do that.
2994 * Otherwise, we write the dirty range and clear the dirty bits
2995 * for any COMPLETE pages covered by that range.
2996 * If there are dirty pages left after that, we write out the
2997 * parts that we haven't written yet.
2998 */
2999 }
3000
3001 /*
3002 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
3003 * an actual write will have to be done.
3004 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
3005 */
3006 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
3007 doff = NBOFF(bp) + bp->nb_dirtyoff;
3008 SET(bp->nb_flags, NB_WRITEINPROG);
3009 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
3010 bp->nb_wcred, bp->nb_proc);
3011 CLR(bp->nb_flags, NB_WRITEINPROG);
3012 if (!error) {
3013 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3014 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3015 np->n_needcommitcnt--;
3016 CHECK_NEEDCOMMITCNT(np);
3017 } else if (error == NFSERR_STALEWRITEVERF)
3018 nfs_clearcommit(vnode_mount(vp));
3019 }
3020
3021 if (!error && bp->nb_dirtyend > 0) {
3022 /* there's a dirty range that needs to be written out */
3023 u_int32_t pagemask;
3024 int firstpg, lastpg;
3025
3026 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
3027 bp->nb_dirtyend = np->n_size - NBOFF(bp);
3028
3029 NFS_BUF_MAP(bp);
3030
3031 doff = bp->nb_dirtyoff;
3032 dend = bp->nb_dirtyend;
3033
3034 /* if doff page is dirty, move doff to start of page */
3035 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
3036 doff -= doff & PAGE_MASK;
3037 /* try to expand write range to include preceding dirty pages */
3038 if (!(doff & PAGE_MASK))
3039 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
3040 doff -= PAGE_SIZE;
3041 /* if dend page is dirty, move dend to start of next page */
3042 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
3043 dend = round_page_32(dend);
3044 /* try to expand write range to include trailing dirty pages */
3045 if (!(dend & PAGE_MASK))
3046 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
3047 dend += PAGE_SIZE;
3048 /* make sure to keep dend clipped to EOF */
3049 if (NBOFF(bp) + dend > (off_t)np->n_size)
3050 dend = np->n_size - NBOFF(bp);
3051 /* calculate range of complete pages being written */
3052 firstpg = round_page_32(doff) / PAGE_SIZE;
3053 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
3054 /* calculate mask for that page range */
3055 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
3056
3057 /* compare page mask to nb_dirty; if there are other dirty pages */
3058 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
3059 /* not needcommit/nocache/call; otherwise write FILESYNC */
3060 if (bp->nb_dirty & ~pagemask)
3061 iomode = NFSV3WRITE_FILESYNC;
3062 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
3063 iomode = NFSV3WRITE_UNSTABLE;
3064 else
3065 iomode = NFSV3WRITE_FILESYNC;
3066
3067 /* write the dirty range */
3068 io.iov_len = dend - doff;
3069 uio_uio_resid_set(uiop, io.iov_len);
3070 uiop->uio_offset = NBOFF(bp) + doff;
3071 io.iov_base = (uintptr_t) bp->nb_data + doff;
3072 uiop->uio_rw = UIO_WRITE;
3073
3074 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3075
3076 SET(bp->nb_flags, NB_WRITEINPROG);
3077 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3078 if (must_commit)
3079 nfs_clearcommit(vnode_mount(vp));
3080 /* clear dirty bits for pages we've written */
3081 if (!error)
3082 bp->nb_dirty &= ~pagemask;
3083 /* set/clear needcommit flag */
3084 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
3085 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3086 np->n_needcommitcnt++;
3087 SET(bp->nb_flags, NB_NEEDCOMMIT);
3088 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
3089 bp->nb_dirtyoff = doff;
3090 bp->nb_dirtyend = dend;
3091 } else {
3092 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3093 np->n_needcommitcnt--;
3094 CHECK_NEEDCOMMITCNT(np);
3095 }
3096 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3097 }
3098 CLR(bp->nb_flags, NB_WRITEINPROG);
3099 /*
3100 * For an interrupted write, the buffer is still valid and the write
3101 * hasn't been pushed to the server yet, so we can't set NB_ERROR and
3102 * report the interruption by setting NB_EINTR. For the NB_ASYNC case,
3103 * NB_EINTR is not relevant.
3104 *
3105 * For the case of a V3 write rpc not being committed to stable
3106 * storage, the block is still dirty and requires either a commit rpc
3107 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
3108 * block is reused. This is indicated by setting the NB_DELWRI and
3109 * NB_NEEDCOMMIT flags.
3110 */
3111 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
3112 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
3113 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3114 SET(bp->nb_flags, NB_DELWRI);
3115 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
3116 NFSBUFCNTCHK(0);
3117 }
3118 FSDBG(261, bp->nb_validoff, bp->nb_validend,
3119 bp->nb_bufsize, 0);
3120 /*
3121 * Since for the NB_ASYNC case, nfs_bwrite() has
3122 * reassigned the buffer to the clean list, we have to
3123 * reassign it back to the dirty one. Ugh.
3124 */
3125 if (ISSET(bp->nb_flags, NB_ASYNC)) {
3126 /* move to dirty list */
3127 lck_mtx_lock(nfs_buf_mutex);
3128 if (bp->nb_vnbufs.le_next != NFSNOLIST)
3129 LIST_REMOVE(bp, nb_vnbufs);
3130 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3131 lck_mtx_unlock(nfs_buf_mutex);
3132 } else {
3133 SET(bp->nb_flags, NB_EINTR);
3134 }
3135 } else {
3136 /* either there's an error or we don't need to commit */
3137 if (error) {
3138 SET(bp->nb_flags, NB_ERROR);
3139 bp->nb_error = np->n_error = error;
3140 np->n_flag |= NWRITEERR;
3141 /*
3142 * There was a write error and we need to
3143 * invalidate attrs and flush buffers in
3144 * order to sync up with the server.
3145 * (if this write was extending the file,
3146 * we may no longer know the correct size)
3147 *
3148 * But we can't call vinvalbuf while holding
3149 * this buffer busy. Set a flag to do it after
3150 * releasing the buffer.
3151 *
3152 * Note we can only invalidate in this function
3153 * if this is an async write and so the iodone
3154 * below will release the buffer. Also, we
3155 * shouldn't call vinvalbuf from nfsiod because
3156 * that may deadlock waiting for the completion
3157 * of writes that are queued up behind this one.
3158 */
3159 if (ISSET(bp->nb_flags, NB_ASYNC) &&
3160 !ISSET(bp->nb_flags, NB_IOD)) {
3161 invalidate = 1;
3162 } else {
3163 /* invalidate later */
3164 np->n_flag |= NNEEDINVALIDATE;
3165 }
3166 NATTRINVALIDATE(np);
3167 }
3168 /* clear the dirty range */
3169 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3170 }
3171 }
3172
3173 if (!error && bp->nb_dirty) {
3174 /* there are pages marked dirty that need to be written out */
3175 int pg, count, npages, off;
3176
3177 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3178
3179 NFS_BUF_MAP(bp);
3180
3181 /*
3182 * we do these writes synchronously because we can't really
3183 * support the unstable/needcommit method. We could write
3184 * them unstable, clear the dirty bits, and then commit the
3185 * whole block later, but if we need to rewrite the data, we
3186 * won't have any idea which pages were written because that
3187 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
3188 * also can't leave the dirty bits set because then we wouldn't
3189 * be able to tell if the pages were re-dirtied between the end
3190 * of the write and the commit.
3191 */
3192 iomode = NFSV3WRITE_FILESYNC;
3193 uiop->uio_rw = UIO_WRITE;
3194
3195 SET(bp->nb_flags, NB_WRITEINPROG);
3196 npages = bp->nb_bufsize/PAGE_SIZE;
3197 for (pg=0; pg < npages; pg++) {
3198 if (!NBPGDIRTY(bp,pg))
3199 continue;
3200 count = 1;
3201 while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count))
3202 count++;
3203 /* write count pages starting with page pg */
3204 off = pg * PAGE_SIZE;
3205 len = count * PAGE_SIZE;
3206
3207 /* clip writes to EOF */
3208 if (NBOFF(bp) + off + len > (off_t)np->n_size)
3209 len -= (NBOFF(bp) + off + len) - np->n_size;
3210 if (len > 0) {
3211 io.iov_len = len;
3212 uio_uio_resid_set(uiop, io.iov_len);
3213 uiop->uio_offset = NBOFF(bp) + off;
3214 io.iov_base = (uintptr_t) bp->nb_data + off;
3215 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3216 if (must_commit)
3217 nfs_clearcommit(vnode_mount(vp));
3218 if (error)
3219 break;
3220 }
3221 /* clear dirty bits */
3222 while (count--) {
3223 bp->nb_dirty &= ~(1 << pg);
3224 /* leave pg on last page */
3225 if (count) pg++;
3226 }
3227 }
3228 if (!error) {
3229 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3230 np->n_needcommitcnt--;
3231 CHECK_NEEDCOMMITCNT(np);
3232 }
3233 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3234 }
3235 CLR(bp->nb_flags, NB_WRITEINPROG);
3236 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
3237 np->n_size);
3238 }
3239
3240 if (error) {
3241 SET(bp->nb_flags, NB_ERROR);
3242 bp->nb_error = error;
3243 }
3244 }
3245
3246 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
3247
3248 nfs_buf_iodone(bp);
3249
3250 if (invalidate) {
3251 /*
3252 * There was a write error and we need to
3253 * invalidate attrs and flush buffers in
3254 * order to sync up with the server.
3255 * (if this write was extending the file,
3256 * we may no longer know the correct size)
3257 *
3258 * But we couldn't call vinvalbuf while holding
3259 * the buffer busy. So we call vinvalbuf() after
3260 * releasing the buffer.
3261 *
3262 * Note: we don't bother calling nfs_vinvalbuf() if
3263 * there's already a flush in progress.
3264 */
3265 if (!(np->n_flag & NFLUSHINPROG))
3266 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1);
3267 }
3268
3269 return (error);
3270 }