1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
60 */
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/resourcevar.h>
64 #include <sys/signalvar.h>
65 #include <sys/proc_internal.h>
66 #include <sys/kauth.h>
67 #include <sys/malloc.h>
68 #include <sys/vnode.h>
69 #include <sys/dirent.h>
70 #include <sys/mount_internal.h>
71 #include <sys/kernel.h>
72 #include <sys/sysctl.h>
73 #include <sys/ubc_internal.h>
74 #include <sys/uio_internal.h>
75
76 #include <sys/vm.h>
77 #include <sys/vmparam.h>
78
79 #include <sys/time.h>
80 #include <kern/clock.h>
81 #include <libkern/OSAtomic.h>
82 #include <kern/kalloc.h>
83
84 #include <nfs/rpcv2.h>
85 #include <nfs/nfsproto.h>
86 #include <nfs/nfs.h>
87 #include <nfs/nfsmount.h>
88 #include <nfs/nfsnode.h>
89 #include <sys/buf_internal.h>
90
91 #include <sys/kdebug.h>
92
93 #define FSDBG(A, B, C, D, E) \
94 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
95 (int)(B), (int)(C), (int)(D), (int)(E), 0)
96 #define FSDBG_TOP(A, B, C, D, E) \
97 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
98 (int)(B), (int)(C), (int)(D), (int)(E), 0)
99 #define FSDBG_BOT(A, B, C, D, E) \
100 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
101 (int)(B), (int)(C), (int)(D), (int)(E), 0)
102
103 extern int nfs_numasync;
104 extern int nfs_ioddelwri;
105 extern struct nfsstats nfsstats;
106
107 #define NFSBUFHASH(np, lbn) \
108 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
109 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
110 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
111 u_long nfsbufhash;
112 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
113 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
114 int nfs_nbdwrite;
115 time_t nfsbuffreeuptimestamp;
116
117 lck_grp_t *nfs_buf_lck_grp;
118 lck_grp_attr_t *nfs_buf_lck_grp_attr;
119 lck_attr_t *nfs_buf_lck_attr;
120 lck_mtx_t *nfs_buf_mutex;
121
122 #define NFSBUFWRITE_THROTTLE 9
123 #define NFSBUF_LRU_STALE 120
124 #define NFSBUF_META_STALE 240
125
126 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
127 #define LRU_TO_FREEUP 6
128 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
129 #define META_TO_FREEUP 3
130 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
131 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
132 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
133 #define LRU_FREEUP_FRAC_ON_TIMER 8
134 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
135 #define META_FREEUP_FRAC_ON_TIMER 16
136 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
137 #define LRU_FREEUP_MIN_FRAC 4
138 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
139 #define META_FREEUP_MIN_FRAC 2
140
141 #define NFS_BUF_FREEUP() \
142 do { \
143 /* only call nfs_buf_freeup() if it has work to do: */ \
144 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
145 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
146 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
147 nfs_buf_freeup(0); \
148 } while (0)
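/*
 * Worked example of the trigger condition (illustrative numbers, not
 * defaults): with nfsbufcnt = 1024, nfsbufmin = 128 and nfsbuffreecnt = 300,
 * the LRU test 300 > 1024/LRU_FREEUP_MIN_FRAC (= 256) passes and
 * 1024 - TOTAL_TO_FREEUP (= 1015) > 128, so nfs_buf_freeup(0) runs.
 * With only 200 free LRU buffers and 100 free meta buffers, neither
 * fraction test passes and the macro does nothing.
 */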
149
150 /*
151 * Initialize nfsbuf lists
152 */
153 void
154 nfs_nbinit(void)
155 {
156 nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init();
157 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr);
158
159 nfs_buf_lck_attr = lck_attr_alloc_init();
160
161 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr);
162
163 nfsbufcnt = nfsbufmetacnt =
164 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
165 nfsbufmin = 128;
166 nfsbufmax = (sane_size >> PAGE_SHIFT) / 4;
167 nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16;
168 nfsneedbuffer = 0;
169 nfs_nbdwrite = 0;
170 nfsbuffreeuptimestamp = 0;
171
172 nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
173 TAILQ_INIT(&nfsbuffree);
174 TAILQ_INIT(&nfsbuffreemeta);
175 TAILQ_INIT(&nfsbufdelwri);
176
177 }
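/*
 * Sizing sketch with illustrative numbers (the real values scale with the
 * machine's sane_size): on a 512MB machine with 4K pages,
 * sane_size >> PAGE_SHIFT is 131072 pages, so nfsbufmax = 32768,
 * nfsbufmetamax = 8192, and the hash table is built from
 * nfsbufmax/4 = 8192 entries.
 */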
178
179 /*
180 * try to free up some excess, unused nfsbufs
181 */
182 void
183 nfs_buf_freeup(int timer)
184 {
185 struct nfsbuf *fbp;
186 struct timeval now;
187 int count;
188 struct nfsbuffreehead nfsbuffreeup;
189
190 TAILQ_INIT(&nfsbuffreeup);
191
192 lck_mtx_lock(nfs_buf_mutex);
193
194 microuptime(&now);
195 nfsbuffreeuptimestamp = now.tv_sec;
196
197 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
198
199 count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
200 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
201 fbp = TAILQ_FIRST(&nfsbuffree);
202 if (!fbp)
203 break;
204 if (fbp->nb_refs)
205 break;
206 if (NBUFSTAMPVALID(fbp) &&
207 (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
208 break;
209 nfs_buf_remfree(fbp);
210 /* disassociate buffer from any vnode */
211 if (fbp->nb_vp) {
212 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
213 LIST_REMOVE(fbp, nb_vnbufs);
214 fbp->nb_vnbufs.le_next = NFSNOLIST;
215 }
216 fbp->nb_vp = NULL;
217 }
218 LIST_REMOVE(fbp, nb_hash);
219 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
220 nfsbufcnt--;
221 }
222
223 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
224 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
225 fbp = TAILQ_FIRST(&nfsbuffreemeta);
226 if (!fbp)
227 break;
228 if (fbp->nb_refs)
229 break;
230 if (NBUFSTAMPVALID(fbp) &&
231 (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
232 break;
233 nfs_buf_remfree(fbp);
234 /* disassociate buffer from any vnode */
235 if (fbp->nb_vp) {
236 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
237 LIST_REMOVE(fbp, nb_vnbufs);
238 fbp->nb_vnbufs.le_next = NFSNOLIST;
239 }
240 fbp->nb_vp = NULL;
241 }
242 LIST_REMOVE(fbp, nb_hash);
243 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
244 nfsbufcnt--;
245 nfsbufmetacnt--;
246 }
247
248 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
249 NFSBUFCNTCHK(1);
250
251 lck_mtx_unlock(nfs_buf_mutex);
252
253 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
254 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
255 /* nuke any creds */
256 if (fbp->nb_rcred != NOCRED) {
257 kauth_cred_rele(fbp->nb_rcred);
258 fbp->nb_rcred = NOCRED;
259 }
260 if (fbp->nb_wcred != NOCRED) {
261 kauth_cred_rele(fbp->nb_wcred);
262 fbp->nb_wcred = NOCRED;
263 }
264 /* if buf was NB_META, dump buffer */
265 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
266 kfree(fbp->nb_data, fbp->nb_bufsize);
267 FREE(fbp, M_TEMP);
268 }
269
270 }
271
272 /*
273 * remove a buffer from the freelist
274 * (must be called with nfs_buf_mutex held)
275 */
276 void
277 nfs_buf_remfree(struct nfsbuf *bp)
278 {
279 if (bp->nb_free.tqe_next == NFSNOLIST)
280 panic("nfsbuf not on free list");
281 if (ISSET(bp->nb_flags, NB_DELWRI)) {
282 nfsbufdelwricnt--;
283 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
284 } else if (ISSET(bp->nb_flags, NB_META)) {
285 nfsbuffreemetacnt--;
286 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
287 } else {
288 nfsbuffreecnt--;
289 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
290 }
291 bp->nb_free.tqe_next = NFSNOLIST;
292 NFSBUFCNTCHK(1);
293 }
294
295 /*
296 * check for existence of nfsbuf in cache
297 */
298 boolean_t
299 nfs_buf_is_incore(vnode_t vp, daddr64_t blkno)
300 {
301 boolean_t rv;
302 lck_mtx_lock(nfs_buf_mutex);
303 if (nfs_buf_incore(vp, blkno))
304 rv = TRUE;
305 else
306 rv = FALSE;
307 lck_mtx_unlock(nfs_buf_mutex);
308 return (rv);
309 }
310
311 /*
312 * return incore buffer (must be called with nfs_buf_mutex held)
313 */
314 struct nfsbuf *
315 nfs_buf_incore(vnode_t vp, daddr64_t blkno)
316 {
317 /* Search hash chain */
318 struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first;
319 for (; bp != NULL; bp = bp->nb_hash.le_next)
320 if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
321 if (!ISSET(bp->nb_flags, NB_INVAL)) {
322 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
323 return (bp);
324 }
325 }
326 return (NULL);
327 }
328
329 /*
330 * Check if it's OK to drop a page.
331 *
332 * Called by vnode_pager() on pageout request of non-dirty page.
333 * We need to make sure that it's not part of a delayed write.
334 * If it is, we can't let the VM drop it because we may need it
335 * later when/if we need to write the data (again).
336 */
337 int
338 nfs_buf_page_inval(vnode_t vp, off_t offset)
339 {
340 struct nfsbuf *bp;
341 int error = 0;
342
343 lck_mtx_lock(nfs_buf_mutex);
344 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
345 if (!bp)
346 goto out;
347 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
348 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
349 error = EBUSY;
350 goto out;
351 }
352 /*
353 * If there's a dirty range in the buffer, check to
354 * see if this page intersects with the dirty range.
355 * If it does, we can't let the pager drop the page.
356 */
357 if (bp->nb_dirtyend > 0) {
358 int start = offset - NBOFF(bp);
359 if (bp->nb_dirtyend <= start ||
360 bp->nb_dirtyoff >= (start + PAGE_SIZE))
361 error = 0;
362 else
363 error = EBUSY;
364 }
365 out:
366 lck_mtx_unlock(nfs_buf_mutex);
367 return (error);
368 }
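/*
 * Worked example of the intersection test above (made-up offsets,
 * PAGE_SIZE assumed to be 4096): for a buffer at NBOFF(bp) = 0x8000 and a
 * pageout at file offset 0xa000, start = 0x2000.  With nb_dirtyoff = 0x1000
 * and nb_dirtyend = 0x2800, neither "dirtyend <= start" nor
 * "dirtyoff >= start + PAGE_SIZE" holds, so the page overlaps the dirty
 * region and EBUSY is returned.  A page at offset 0xb000 (start = 0x3000,
 * which is >= dirtyend) could safely be dropped.
 */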
369
370 /*
371 * set up the UPL for a buffer
372 * (must NOT be called with nfs_buf_mutex held)
373 */
374 int
375 nfs_buf_upl_setup(struct nfsbuf *bp)
376 {
377 kern_return_t kret;
378 upl_t upl;
379 int upl_flags;
380
381 if (ISSET(bp->nb_flags, NB_PAGELIST))
382 return (0);
383
384 upl_flags = UPL_PRECIOUS;
385 if (! ISSET(bp->nb_flags, NB_READ)) {
386 /*
387 * We're doing a "write", so we intend to modify
388 * the pages we're gathering.
389 */
390 upl_flags |= UPL_WILL_MODIFY;
391 }
392 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
393 &upl, NULL, upl_flags);
394 if (kret == KERN_INVALID_ARGUMENT) {
395 /* vm object probably doesn't exist any more */
396 bp->nb_pagelist = NULL;
397 return (EINVAL);
398 }
399 if (kret != KERN_SUCCESS) {
400 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
401 bp->nb_pagelist = NULL;
402 return (EIO);
403 }
404
405 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
406
407 bp->nb_pagelist = upl;
408 SET(bp->nb_flags, NB_PAGELIST);
409 return (0);
410 }
411
412 /*
413 * update buffer's valid/dirty info from UBC
414 * (must NOT be called with nfs_buf_mutex held)
415 */
416 void
417 nfs_buf_upl_check(struct nfsbuf *bp)
418 {
419 upl_page_info_t *pl;
420 off_t filesize, fileoffset;
421 int i, npages;
422
423 if (!ISSET(bp->nb_flags, NB_PAGELIST))
424 return;
425
426 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
427 filesize = ubc_getsize(bp->nb_vp);
428 fileoffset = NBOFF(bp);
429 if (fileoffset < filesize)
430 SET(bp->nb_flags, NB_CACHE);
431 else
432 CLR(bp->nb_flags, NB_CACHE);
433
434 pl = ubc_upl_pageinfo(bp->nb_pagelist);
435 bp->nb_valid = bp->nb_dirty = 0;
436
437 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
438 /* anything beyond the end of the file is not valid or dirty */
439 if (fileoffset >= filesize)
440 break;
441 if (!upl_valid_page(pl, i)) {
442 CLR(bp->nb_flags, NB_CACHE);
443 continue;
444 }
445 NBPGVALID_SET(bp,i);
446 if (upl_dirty_page(pl, i)) {
447 NBPGDIRTY_SET(bp, i);
448 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
449 SET(bp->nb_flags, NB_WASDIRTY);
450 }
451 }
452 fileoffset = NBOFF(bp);
453 if (ISSET(bp->nb_flags, NB_CACHE)) {
454 bp->nb_validoff = 0;
455 bp->nb_validend = bp->nb_bufsize;
456 if (fileoffset + bp->nb_validend > filesize)
457 bp->nb_validend = filesize - fileoffset;
458 } else {
459 bp->nb_validoff = bp->nb_validend = -1;
460 }
461 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
462 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
463 }
464
465 /*
466 * make sure that a buffer is mapped
467 * (must NOT be called with nfs_buf_mutex held)
468 */
469 static int
470 nfs_buf_map(struct nfsbuf *bp)
471 {
472 kern_return_t kret;
473
474 if (bp->nb_data)
475 return (0);
476 if (!ISSET(bp->nb_flags, NB_PAGELIST))
477 return (EINVAL);
478
479 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
480 if (kret != KERN_SUCCESS)
481 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
482 if (bp->nb_data == 0)
483 panic("ubc_upl_map mapped 0");
484 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
485 return (0);
486 }
487
488 /*
489 * check range of pages in nfsbuf's UPL for validity
490 */
491 static int
492 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
493 {
494 off_t fileoffset, filesize;
495 int pg, lastpg;
496 upl_page_info_t *pl;
497
498 if (!ISSET(bp->nb_flags, NB_PAGELIST))
499 return (0);
500 pl = ubc_upl_pageinfo(bp->nb_pagelist);
501
502 size += off & PAGE_MASK;
503 off &= ~PAGE_MASK;
504 fileoffset = NBOFF(bp);
505 filesize = VTONFS(bp->nb_vp)->n_size;
506 if ((fileoffset + off + size) > filesize)
507 size = filesize - (fileoffset + off);
508
509 pg = off/PAGE_SIZE;
510 lastpg = (off + size - 1)/PAGE_SIZE;
511 while (pg <= lastpg) {
512 if (!upl_valid_page(pl, pg))
513 return (0);
514 pg++;
515 }
516 return (1);
517 }
518
519 /*
520 * normalize an nfsbuf's valid range
521 *
522 * the read/write code guarantees that we'll always have a valid
523 * region that is an integral number of pages. If either end
524 * of the valid range isn't page-aligned, it gets corrected
525 * here as we extend the valid range through all of the
526 * contiguous valid pages.
527 */
528 static void
529 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
530 {
531 int pg, npg;
532 /* pull validoff back to start of contiguous valid page range */
533 pg = bp->nb_validoff/PAGE_SIZE;
534 while (pg >= 0 && NBPGVALID(bp,pg))
535 pg--;
536 bp->nb_validoff = (pg+1) * PAGE_SIZE;
537 /* push validend forward to end of contiguous valid page range */
538 npg = bp->nb_bufsize/PAGE_SIZE;
539 pg = bp->nb_validend/PAGE_SIZE;
540 while (pg < npg && NBPGVALID(bp,pg))
541 pg++;
542 bp->nb_validend = pg * PAGE_SIZE;
543 /* clip to EOF */
544 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
545 bp->nb_validend = np->n_size % bp->nb_bufsize;
546 }
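/*
 * Example of the normalization (illustrative values, 4K pages): for a
 * 16K buffer with pages 0 and 1 valid and nb_validoff/nb_validend of
 * 0x100/0x1f00, the first loop walks back through page 0 (validoff
 * becomes 0) and the second walks forward through page 1 (validend
 * becomes 0x2000), rounding the valid range out to the two fully
 * valid pages.
 */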
547
548 /*
549 * try to push out some delayed/uncommitted writes
550 * ("locked" indicates whether nfs_buf_mutex is already held)
551 */
552 static void
553 nfs_buf_delwri_push(int locked)
554 {
555 struct nfsbuf *bp;
556 int i, error;
557
558 if (TAILQ_EMPTY(&nfsbufdelwri))
559 return;
560
561 /* first try to tell the nfsiods to do it */
562 if (nfs_asyncio(NULL, NULL) == 0)
563 return;
564
565 /* otherwise, try to do some of the work ourselves */
566 i = 0;
567 if (!locked)
568 lck_mtx_lock(nfs_buf_mutex);
569 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
570 struct nfsnode *np = VTONFS(bp->nb_vp);
571 nfs_buf_remfree(bp);
572 nfs_buf_refget(bp);
573 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
574 nfs_buf_refrele(bp);
575 if (error)
576 break;
577 if (!bp->nb_vp) {
578 /* buffer is no longer valid */
579 nfs_buf_drop(bp);
580 continue;
581 }
582 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
583 /* put buffer at end of delwri list */
584 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
585 nfsbufdelwricnt++;
586 nfs_buf_drop(bp);
587 lck_mtx_unlock(nfs_buf_mutex);
588 nfs_flushcommits(np->n_vnode, NULL, 1);
589 } else {
590 SET(bp->nb_flags, NB_ASYNC);
591 lck_mtx_unlock(nfs_buf_mutex);
592 nfs_buf_write(bp);
593 }
594 i++;
595 lck_mtx_lock(nfs_buf_mutex);
596 }
597 if (!locked)
598 lck_mtx_unlock(nfs_buf_mutex);
599 }
600
601 /*
602 * Get an nfs buffer.
603 *
604 * Returns errno on error, 0 otherwise.
605 * Any buffer is returned in *bpp.
606 *
607 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
608 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
609 *
610 * Check for existence of buffer in cache.
611 * Or attempt to reuse a buffer from one of the free lists.
612 * Or allocate a new buffer if we haven't already hit max allocation.
613 * Or wait for a free buffer.
614 *
615 * If available buffer found, prepare it, and return it.
616 *
617 * If the calling process is interrupted by a signal for
618 * an interruptible mount point, return EINTR.
619 */
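/*
 * Minimal calling sketch, modeled on the read-ahead call in nfs_bioread()
 * below (the variable names are that caller's):
 *
 *	struct nfsbuf *rabp;
 *	error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
 *	if (error)
 *		return (error);
 *	if (!rabp)		(NBLK_NOWAIT: the buffer was busy, *bpp left NULL)
 *		continue;
 *	...use rabp, then nfs_buf_release(rabp, 1)...
 */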
620 int
621 nfs_buf_get(
622 vnode_t vp,
623 daddr64_t blkno,
624 int size,
625 proc_t p,
626 int flags,
627 struct nfsbuf **bpp)
628 {
629 struct nfsnode *np = VTONFS(vp);
630 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
631 struct nfsbuf *bp;
632 int biosize, bufsize;
633 kauth_cred_t cred;
634 int slpflag = PCATCH;
635 int operation = (flags & NBLK_OPMASK);
636 int error = 0;
637 struct timespec ts;
638
639 FSDBG_TOP(541, vp, blkno, size, flags);
640 *bpp = NULL;
641
642 bufsize = size;
643 if (bufsize > NFS_MAXBSIZE)
644 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
645
646 if (!nmp) {
647 FSDBG_BOT(541, vp, blkno, 0, ENXIO);
648 return (ENXIO);
649 }
650 biosize = nmp->nm_biosize;
651
652 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) {
653 operation = NBLK_META;
654 } else if (bufsize < biosize) {
655 /* reg files should always have biosize blocks */
656 bufsize = biosize;
657 }
658
659 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
660 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
661 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
662
663 /* poke the delwri list */
664 nfs_buf_delwri_push(0);
665
666 /* sleep to let other threads run... */
667 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
668 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
669 }
670
671 loop:
672 lck_mtx_lock(nfs_buf_mutex);
673
674 /* check for existence of nfsbuf in cache */
675 if ((bp = nfs_buf_incore(vp, blkno))) {
676 /* if busy, set wanted and wait */
677 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
678 if (flags & NBLK_NOWAIT) {
679 lck_mtx_unlock(nfs_buf_mutex);
680 FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc);
681 return (0);
682 }
683 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
684 SET(bp->nb_lflags, NBL_WANTED);
685
686 ts.tv_sec = 2;
687 ts.tv_nsec = 0;
688 msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
689 "nfsbufget", (slpflag == PCATCH) ? 0 : &ts);
690 slpflag = 0;
691 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
692 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
693 FSDBG_BOT(541, vp, blkno, 0, error);
694 return (error);
695 }
696 goto loop;
697 }
698 if (bp->nb_bufsize != bufsize)
699 panic("nfsbuf size mismatch");
700 SET(bp->nb_lflags, NBL_BUSY);
701 SET(bp->nb_flags, NB_CACHE);
702 nfs_buf_remfree(bp);
703 /* additional paranoia: */
704 if (ISSET(bp->nb_flags, NB_PAGELIST))
705 panic("pagelist buffer was not busy");
706 goto buffer_setup;
707 }
708
709 if (flags & NBLK_ONLYVALID) {
710 lck_mtx_unlock(nfs_buf_mutex);
711 FSDBG_BOT(541, vp, blkno, 0, 0x0000cace);
712 return (0);
713 }
714
715 /*
716 * where to get a free buffer:
717 * - if meta and maxmeta reached, must reuse meta
718 * - alloc new if we haven't reached min bufs
719 * - if free lists are NOT empty
720 * - if free list is stale, use it
721 * - else if freemeta list is stale, use it
722 * - else if max bufs allocated, use least-time-to-stale
723 * - alloc new if we haven't reached max allowed
724 * - start clearing out delwri list and try again
725 */
726
727 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
728 /* if we've hit max meta buffers, must reuse a meta buffer */
729 bp = TAILQ_FIRST(&nfsbuffreemeta);
730 } else if ((nfsbufcnt > nfsbufmin) &&
731 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
732 /* try to pull an nfsbuf off a free list */
733 struct nfsbuf *lrubp, *metabp;
734 struct timeval now;
735 microuptime(&now);
736
737 /* if the next LRU or META buffer is invalid or stale, use it */
738 lrubp = TAILQ_FIRST(&nfsbuffree);
739 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
740 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
741 bp = lrubp;
742 metabp = TAILQ_FIRST(&nfsbuffreemeta);
743 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
744 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
745 bp = metabp;
746
747 if (!bp && (nfsbufcnt >= nfsbufmax)) {
748 /* we've already allocated all bufs, so */
749 /* choose the buffer that'll go stale first */
750 if (!metabp)
751 bp = lrubp;
752 else if (!lrubp)
753 bp = metabp;
754 else {
755 int32_t lru_stale_time, meta_stale_time;
756 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
757 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
758 if (lru_stale_time <= meta_stale_time)
759 bp = lrubp;
760 else
761 bp = metabp;
762 }
763 }
764 }
765
766 if (bp) {
767 /* we have a buffer to reuse */
768 FSDBG(544, vp, blkno, bp, bp->nb_flags);
769 nfs_buf_remfree(bp);
770 if (ISSET(bp->nb_flags, NB_DELWRI))
771 panic("nfs_buf_get: delwri");
772 SET(bp->nb_lflags, NBL_BUSY);
773 /* disassociate buffer from previous vnode */
774 if (bp->nb_vp) {
775 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
776 LIST_REMOVE(bp, nb_vnbufs);
777 bp->nb_vnbufs.le_next = NFSNOLIST;
778 }
779 bp->nb_vp = NULL;
780 }
781 LIST_REMOVE(bp, nb_hash);
782 /* nuke any creds we're holding */
783 cred = bp->nb_rcred;
784 if (cred != NOCRED) {
785 bp->nb_rcred = NOCRED;
786 kauth_cred_rele(cred);
787 }
788 cred = bp->nb_wcred;
789 if (cred != NOCRED) {
790 bp->nb_wcred = NOCRED;
791 kauth_cred_rele(cred);
792 }
793 /* if buf will no longer be NB_META, dump old buffer */
794 if (operation == NBLK_META) {
795 if (!ISSET(bp->nb_flags, NB_META))
796 nfsbufmetacnt++;
797 } else if (ISSET(bp->nb_flags, NB_META)) {
798 if (bp->nb_data) {
799 kfree(bp->nb_data, bp->nb_bufsize);
800 bp->nb_data = NULL;
801 }
802 nfsbufmetacnt--;
803 }
804 /* re-init buf fields */
805 bp->nb_error = 0;
806 bp->nb_validoff = bp->nb_validend = -1;
807 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
808 bp->nb_valid = 0;
809 bp->nb_dirty = 0;
810 } else {
811 /* no buffer to reuse */
812 if ((nfsbufcnt < nfsbufmax) &&
813 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
814 /* just alloc a new one */
815 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
816 if (!bp) {
817 lck_mtx_unlock(nfs_buf_mutex);
818 FSDBG_BOT(541, vp, blkno, 0, error);
819 return (ENOMEM);
820 }
821 nfsbufcnt++;
822 if (operation == NBLK_META)
823 nfsbufmetacnt++;
824 NFSBUFCNTCHK(1);
825 /* init nfsbuf */
826 bzero(bp, sizeof(*bp));
827 bp->nb_free.tqe_next = NFSNOLIST;
828 bp->nb_validoff = bp->nb_validend = -1;
829 FSDBG(545, vp, blkno, bp, 0);
830 } else {
831 /* too many bufs... wait for buffers to free up */
832 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
833
834 /* poke the delwri list */
835 nfs_buf_delwri_push(1);
836
837 nfsneedbuffer = 1;
838 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP,
839 "nfsbufget", 0);
840 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
841 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
842 FSDBG_BOT(541, vp, blkno, 0, error);
843 return (error);
844 }
845 goto loop;
846 }
847 }
848
849 /* setup nfsbuf */
850 bp->nb_lflags = NBL_BUSY;
851 bp->nb_flags = 0;
852 bp->nb_lblkno = blkno;
853 /* insert buf in hash */
854 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
855 /* associate buffer with new vnode */
856 bp->nb_vp = vp;
857 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
858
859 buffer_setup:
860
861 /* unlock hash */
862 lck_mtx_unlock(nfs_buf_mutex);
863
864 switch (operation) {
865 case NBLK_META:
866 SET(bp->nb_flags, NB_META);
867 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
868 kfree(bp->nb_data, bp->nb_bufsize);
869 bp->nb_data = NULL;
870 bp->nb_validoff = bp->nb_validend = -1;
871 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
872 bp->nb_valid = 0;
873 bp->nb_dirty = 0;
874 CLR(bp->nb_flags, NB_CACHE);
875 }
876 if (!bp->nb_data)
877 bp->nb_data = kalloc(bufsize);
878 if (!bp->nb_data) {
879 /* Ack! couldn't allocate the data buffer! */
880 /* cleanup buffer and return error */
881 lck_mtx_lock(nfs_buf_mutex);
882 LIST_REMOVE(bp, nb_vnbufs);
883 bp->nb_vnbufs.le_next = NFSNOLIST;
884 bp->nb_vp = NULL;
885 /* invalidate usage timestamp to allow immediate freeing */
886 NBUFSTAMPINVALIDATE(bp);
887 if (bp->nb_free.tqe_next != NFSNOLIST)
888 panic("nfsbuf on freelist");
889 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
890 nfsbuffreecnt++;
891 lck_mtx_unlock(nfs_buf_mutex);
892 FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM);
893 return (ENOMEM);
894 }
895 bp->nb_bufsize = bufsize;
896 break;
897
898 case NBLK_READ:
899 case NBLK_WRITE:
900 /*
901 * Set or clear NB_READ now to let the UPL subsystem know
902 * if we intend to modify the pages or not.
903 */
904 if (operation == NBLK_READ) {
905 SET(bp->nb_flags, NB_READ);
906 } else {
907 CLR(bp->nb_flags, NB_READ);
908 }
909 if (bufsize < PAGE_SIZE)
910 bufsize = PAGE_SIZE;
911 bp->nb_bufsize = bufsize;
912 bp->nb_validoff = bp->nb_validend = -1;
913
914 if (UBCINFOEXISTS(vp)) {
915 /* setup upl */
916 if (nfs_buf_upl_setup(bp)) {
917 /* unable to create upl */
918 /* vm object must no longer exist */
919 /* cleanup buffer and return error */
920 lck_mtx_lock(nfs_buf_mutex);
921 LIST_REMOVE(bp, nb_vnbufs);
922 bp->nb_vnbufs.le_next = NFSNOLIST;
923 bp->nb_vp = NULL;
924 /* invalidate usage timestamp to allow immediate freeing */
925 NBUFSTAMPINVALIDATE(bp);
926 if (bp->nb_free.tqe_next != NFSNOLIST)
927 panic("nfsbuf on freelist");
928 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
929 nfsbuffreecnt++;
930 lck_mtx_unlock(nfs_buf_mutex);
931 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
932 return (EIO);
933 }
934 nfs_buf_upl_check(bp);
935 }
936 break;
937
938 default:
939 panic("nfs_buf_get: %d unknown operation", operation);
940 }
941
942 *bpp = bp;
943
944 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
945
946 return (0);
947 }
948
949 void
950 nfs_buf_release(struct nfsbuf *bp, int freeup)
951 {
952 vnode_t vp = bp->nb_vp;
953 struct timeval now;
954 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
955
956 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
957 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
958 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
959
960 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
961 int upl_flags;
962 upl_t upl;
963 int i, rv;
964
965 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
966 rv = nfs_buf_upl_setup(bp);
967 if (rv)
968 printf("nfs_buf_release: upl create failed %d\n", rv);
969 else
970 nfs_buf_upl_check(bp);
971 }
972 upl = bp->nb_pagelist;
973 if (!upl)
974 goto pagelist_cleanup_done;
975 if (bp->nb_data) {
976 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
977 panic("ubc_upl_unmap failed");
978 bp->nb_data = NULL;
979 }
980 /* abort pages if error, invalid, or non-needcommit nocache */
981 if ((bp->nb_flags & (NB_ERROR | NB_INVAL)) ||
982 ((bp->nb_flags & NB_NOCACHE) && !(bp->nb_flags & (NB_NEEDCOMMIT | NB_DELWRI)))) {
983 if (bp->nb_flags & (NB_READ | NB_INVAL | NB_NOCACHE))
984 upl_flags = UPL_ABORT_DUMP_PAGES;
985 else
986 upl_flags = 0;
987 ubc_upl_abort(upl, upl_flags);
988 goto pagelist_cleanup_done;
989 }
990 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
991 if (!NBPGVALID(bp,i))
992 ubc_upl_abort_range(upl,
993 i*PAGE_SIZE, PAGE_SIZE,
994 UPL_ABORT_DUMP_PAGES |
995 UPL_ABORT_FREE_ON_EMPTY);
996 else {
997 if (NBPGDIRTY(bp,i))
998 upl_flags = UPL_COMMIT_SET_DIRTY;
999 else
1000 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1001 ubc_upl_commit_range(upl,
1002 i*PAGE_SIZE, PAGE_SIZE,
1003 upl_flags |
1004 UPL_COMMIT_INACTIVATE |
1005 UPL_COMMIT_FREE_ON_EMPTY);
1006 }
1007 }
1008 pagelist_cleanup_done:
1009 /* was this the last buffer in the file? */
1010 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) {
1011 /* if so, invalidate all pages of last buffer past EOF */
1012 off_t start, end;
1013 start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
1014 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1015 if (end > start) {
1016 if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
1017 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1018 }
1019 }
1020 CLR(bp->nb_flags, NB_PAGELIST);
1021 bp->nb_pagelist = NULL;
1022 }
1023
1024 lck_mtx_lock(nfs_buf_mutex);
1025
1026 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1027
1028 /* Wake up any processes waiting for any buffer to become free. */
1029 if (nfsneedbuffer) {
1030 nfsneedbuffer = 0;
1031 wakeup_needbuffer = 1;
1032 }
1033 /* Wake up any processes waiting for _this_ buffer to become free. */
1034 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1035 CLR(bp->nb_lflags, NBL_WANTED);
1036 wakeup_buffer = 1;
1037 }
1038
1039 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1040 if (ISSET(bp->nb_flags, NB_ERROR) ||
1041 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
1042 SET(bp->nb_flags, NB_INVAL);
1043
1044 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1045 /* If it's invalid or empty, dissociate it from its vnode */
1046 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1047 LIST_REMOVE(bp, nb_vnbufs);
1048 bp->nb_vnbufs.le_next = NFSNOLIST;
1049 }
1050 bp->nb_vp = NULL;
1051 /* if this was a delayed write, wakeup anyone */
1052 /* waiting for delayed writes to complete */
1053 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1054 CLR(bp->nb_flags, NB_DELWRI);
1055 OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite);
1056 NFSBUFCNTCHK(1);
1057 wakeup_nbdwrite = 1;
1058 }
1059 /* invalidate usage timestamp to allow immediate freeing */
1060 NBUFSTAMPINVALIDATE(bp);
1061 /* put buffer at head of free list */
1062 if (bp->nb_free.tqe_next != NFSNOLIST)
1063 panic("nfsbuf on freelist");
1064 SET(bp->nb_flags, NB_INVAL);
1065 if (ISSET(bp->nb_flags, NB_META)) {
1066 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1067 nfsbuffreemetacnt++;
1068 } else {
1069 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1070 nfsbuffreecnt++;
1071 }
1072 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1073 /* put buffer at end of delwri list */
1074 if (bp->nb_free.tqe_next != NFSNOLIST)
1075 panic("nfsbuf on freelist");
1076 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1077 nfsbufdelwricnt++;
1078 freeup = 0;
1079 } else {
1080 /* update usage timestamp */
1081 microuptime(&now);
1082 bp->nb_timestamp = now.tv_sec;
1083 /* put buffer at end of free list */
1084 if (bp->nb_free.tqe_next != NFSNOLIST)
1085 panic("nfsbuf on freelist");
1086 if (ISSET(bp->nb_flags, NB_META)) {
1087 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1088 nfsbuffreemetacnt++;
1089 } else {
1090 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1091 nfsbuffreecnt++;
1092 }
1093 }
1094
1095 NFSBUFCNTCHK(1);
1096
1097 /* Unlock the buffer. */
1098 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE | NB_IOD));
1099 CLR(bp->nb_lflags, NBL_BUSY);
1100
1101 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1102
1103 lck_mtx_unlock(nfs_buf_mutex);
1104
1105 if (wakeup_needbuffer)
1106 wakeup(&nfsneedbuffer);
1107 if (wakeup_buffer)
1108 wakeup(bp);
1109 if (wakeup_nbdwrite)
1110 wakeup(&nfs_nbdwrite);
1111 if (freeup)
1112 NFS_BUF_FREEUP();
1113 }
1114
1115 /*
1116 * Wait for operations on the buffer to complete.
1117 * When they do, extract and return the I/O's error value.
1118 */
1119 int
1120 nfs_buf_iowait(struct nfsbuf *bp)
1121 {
1122 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1123
1124 lck_mtx_lock(nfs_buf_mutex);
1125
1126 while (!ISSET(bp->nb_flags, NB_DONE))
1127 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0);
1128
1129 lck_mtx_unlock(nfs_buf_mutex);
1130
1131 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1132
1133 /* check for interruption of I/O, then errors. */
1134 if (ISSET(bp->nb_flags, NB_EINTR)) {
1135 CLR(bp->nb_flags, NB_EINTR);
1136 return (EINTR);
1137 } else if (ISSET(bp->nb_flags, NB_ERROR))
1138 return (bp->nb_error ? bp->nb_error : EIO);
1139 return (0);
1140 }
1141
1142 /*
1143 * Mark I/O complete on a buffer.
1144 */
1145 void
1146 nfs_buf_iodone(struct nfsbuf *bp)
1147 {
1148
1149 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1150
1151 if (ISSET(bp->nb_flags, NB_DONE))
1152 panic("nfs_buf_iodone already");
1153 /*
1154 * I/O was done, so don't believe
1155 * the DIRTY state from VM anymore
1156 */
1157 CLR(bp->nb_flags, NB_WASDIRTY);
1158
1159 if (!ISSET(bp->nb_flags, NB_READ)) {
1160 CLR(bp->nb_flags, NB_WRITEINPROG);
1161 /*
1162 * vnode_writedone() takes care of waking up
1163 * any throttled write operations
1164 */
1165 vnode_writedone(bp->nb_vp);
1166 }
1167 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1168 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1169 nfs_buf_release(bp, 1);
1170 } else { /* or just wakeup the buffer */
1171 lck_mtx_lock(nfs_buf_mutex);
1172 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1173 CLR(bp->nb_lflags, NBL_WANTED);
1174 lck_mtx_unlock(nfs_buf_mutex);
1175 wakeup(bp);
1176 }
1177
1178 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1179 }
1180
1181 void
1182 nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p)
1183 {
1184 vnode_t vp = bp->nb_vp;
1185
1186 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1187 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1188
1189 /*
1190 * If the block hasn't been seen before:
1191 * (1) Mark it as having been seen,
1192 * (2) Charge for the write,
1193 * (3) Make sure it's on its vnode's correct block list.
1194 */
1195 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1196 SET(bp->nb_flags, NB_DELWRI);
1197 if (p && p->p_stats)
1198 p->p_stats->p_ru.ru_oublock++; /* XXX */
1199 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
1200 NFSBUFCNTCHK(0);
1201 /* move to dirty list */
1202 lck_mtx_lock(nfs_buf_mutex);
1203 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1204 LIST_REMOVE(bp, nb_vnbufs);
1205 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
1206 lck_mtx_unlock(nfs_buf_mutex);
1207 }
1208
1209 /*
1210 * If the vnode has "too many" write operations in progress,
1211 * wait for them to finish their I/O.
1212 */
1213 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1214
1215 /*
1216 * If we have too many delayed write buffers,
1217 * more than we can "safely" handle, just fall back to
1218 * doing the async write
1219 */
1220 if (nfs_nbdwrite < 0)
1221 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1222
1223 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
1224 /* issue async write */
1225 SET(bp->nb_flags, NB_ASYNC);
1226 nfs_buf_write(bp);
1227 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1228 return;
1229 }
1230
1231 /* Otherwise, the "write" is done, so mark and release the buffer. */
1232 SET(bp->nb_flags, NB_DONE);
1233 nfs_buf_release(bp, 1);
1234 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1235 return;
1236 }
1237
1238 /*
1239 * add a reference to a buffer so it doesn't disappear while being used
1240 * (must be called with nfs_buf_mutex held)
1241 */
1242 void
1243 nfs_buf_refget(struct nfsbuf *bp)
1244 {
1245 bp->nb_refs++;
1246 }
1247 /*
1248 * release a reference on a buffer
1249 * (must be called with nfs_buf_mutex held)
1250 */
1251 void
1252 nfs_buf_refrele(struct nfsbuf *bp)
1253 {
1254 bp->nb_refs--;
1255 }
1256
1257 /*
1258 * mark a particular buffer as BUSY
1259 * (must be called with nfs_buf_mutex held)
1260 */
1261 errno_t
1262 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1263 {
1264 errno_t error;
1265 struct timespec ts;
1266
1267 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1268 /*
1269 * since the mutex_lock may block, the buffer
1270 * may become BUSY, so we need to recheck for
1271 * a NOWAIT request
1272 */
1273 if (flags & NBAC_NOWAIT)
1274 return (EBUSY);
1275 SET(bp->nb_lflags, NBL_WANTED);
1276
1277 ts.tv_sec = (slptimeo/100);
1278 /* the hz value is 100, so each remaining tick is 10ms */
1279 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
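		/*
		 * e.g. slptimeo = 250 ticks (2.5 seconds at hz = 100) gives
		 * ts.tv_sec = 2 and ts.tv_nsec = 50 * 10ms = 500000000ns.
		 */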
1280
1281 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1282 "nfs_buf_acquire", &ts);
1283 if (error)
1284 return (error);
1285 return (EAGAIN);
1286 }
1287 if (flags & NBAC_REMOVE)
1288 nfs_buf_remfree(bp);
1289 SET(bp->nb_lflags, NBL_BUSY);
1290
1291 return (0);
1292 }
1293
1294 /*
1295 * simply drop the BUSY status of a buffer
1296 * (must be called with nfs_buf_mutex held)
1297 */
1298 void
1299 nfs_buf_drop(struct nfsbuf *bp)
1300 {
1301 int need_wakeup = 0;
1302
1303 if (!ISSET(bp->nb_lflags, NBL_BUSY))
1304 panic("nfs_buf_drop: buffer not busy!");
1305 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1306 /*
1307 * delay the actual wakeup until after we
1308 * clear NBL_BUSY and we've dropped nfs_buf_mutex
1309 */
1310 need_wakeup = 1;
1311 }
1312 /* Unlock the buffer. */
1313 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1314
1315 if (need_wakeup)
1316 wakeup(bp);
1317 }
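/*
 * The busy/ref protocol, sketched from the way nfs_buf_delwri_push()
 * above uses these helpers (all under nfs_buf_mutex):
 *
 *	nfs_buf_refget(bp);
 *	while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN)
 *		;
 *	nfs_buf_refrele(bp);
 *	if (!error) {
 *		...work on the now-BUSY buffer...
 *		nfs_buf_drop(bp);
 *	}
 */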
1318
1319 /*
1320 * prepare for iterating over an nfsnode's buffer list
1321 * this lock protects the queue manipulation
1322 * (must be called with nfs_buf_mutex held)
1323 */
1324 int
1325 nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1326 {
1327 struct nfsbuflists *listheadp;
1328
1329 if (flags & NBI_DIRTY)
1330 listheadp = &np->n_dirtyblkhd;
1331 else
1332 listheadp = &np->n_cleanblkhd;
1333
1334 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1335 LIST_INIT(iterheadp);
1336 return(EWOULDBLOCK);
1337 }
1338
1339 while (np->n_bufiterflags & NBI_ITER) {
1340 np->n_bufiterflags |= NBI_ITERWANT;
1341 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0);
1342 }
1343 if (LIST_EMPTY(listheadp)) {
1344 LIST_INIT(iterheadp);
1345 return(EINVAL);
1346 }
1347 np->n_bufiterflags |= NBI_ITER;
1348
1349 iterheadp->lh_first = listheadp->lh_first;
1350 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1351 LIST_INIT(listheadp);
1352
1353 return(0);
1354 }
1355
1356 /*
1357 * cleanup after iterating over an nfsnode's buffer list
1358 * this lock protects the queue manipulation
1359 * (must be called with nfs_buf_mutex held)
1360 */
1361 void
1362 nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1363 {
1364 struct nfsbuflists * listheadp;
1365 struct nfsbuf *bp;
1366
1367 if (flags & NBI_DIRTY)
1368 listheadp = &np->n_dirtyblkhd;
1369 else
1370 listheadp = &np->n_cleanblkhd;
1371
1372 while (!LIST_EMPTY(iterheadp)) {
1373 bp = LIST_FIRST(iterheadp);
1374 LIST_REMOVE(bp, nb_vnbufs);
1375 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1376 }
1377
1378 np->n_bufiterflags &= ~NBI_ITER;
1379 if (np->n_bufiterflags & NBI_ITERWANT) {
1380 np->n_bufiterflags &= ~NBI_ITERWANT;
1381 wakeup(&np->n_bufiterflags);
1382 }
1383 }
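/*
 * Sketch of the intended iteration pattern (nfs_buf_mutex held; each
 * buffer is moved back to the vnode's list as it is visited, and
 * nfs_buf_itercomplete() returns any stragglers):
 *
 *	struct nfsbuflists blist;
 *
 *	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
 *		while ((bp = LIST_FIRST(&blist))) {
 *			LIST_REMOVE(bp, nb_vnbufs);
 *			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
 *			...examine or flush bp...
 *		}
 *		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
 *	}
 */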
1384
1385
1386 /*
1387 * Vnode op for read using bio
1388 * Any similarity to readip() is purely coincidental
1389 */
1390 int
1391 nfs_bioread(
1392 vnode_t vp,
1393 struct uio *uio,
1394 __unused int ioflag,
1395 kauth_cred_t cred,
1396 proc_t p)
1397 {
1398 struct nfsnode *np = VTONFS(vp);
1399 int biosize;
1400 off_t diff;
1401 struct nfsbuf *bp = NULL, *rabp;
1402 struct nfs_vattr nvattr;
1403 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1404 daddr64_t lbn, rabn, lastrabn = -1, tlbn;
1405 int bufsize;
1406 int nra, error = 0, n = 0, on = 0;
1407 caddr_t dp;
1408 struct dirent *direntp = NULL;
1409 enum vtype vtype;
1410 int nocachereadahead = 0;
1411
1412 FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1413
1414 #if DIAGNOSTIC
1415 if (uio->uio_rw != UIO_READ)
1416 panic("nfs_read mode");
1417 #endif
1418 if (uio_uio_resid(uio) == 0) {
1419 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1420 return (0);
1421 }
1422 if (uio->uio_offset < 0) {
1423 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1424 return (EINVAL);
1425 }
1426
1427 biosize = nmp->nm_biosize;
1428 if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
1429 nfs_fsinfo(nmp, vp, cred, p);
1430
1431 vtype = vnode_vtype(vp);
1432 /*
1433 * For nfs, cache consistency can only be maintained approximately.
1434 * Although RFC1094 does not specify the criteria, the following is
1435 * believed to be compatible with the reference port.
1436 * For nfs:
1437 * If the file's modify time on the server has changed since the
1438 * last read rpc or you have written to the file,
1439 * you may have lost data cache consistency with the
1440 * server, so flush all of the file's data out of the cache.
1441 * Then force a getattr rpc to ensure that you have up to date
1442 * attributes.
1443 * NB: This implies that cache data can be read when up to
1444 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1445 * current attributes this could be forced by setting calling
1446 * NATTRINVALIDATE() before the nfs_getattr() call.
1447 */
1448 if (np->n_flag & NNEEDINVALIDATE) {
1449 np->n_flag &= ~NNEEDINVALIDATE;
1450 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1451 }
1452 if (np->n_flag & NMODIFIED) {
1453 if (vtype != VREG) {
1454 if (vtype != VDIR)
1455 panic("nfs: bioread, not dir");
1456 nfs_invaldir(vp);
1457 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1458 if (error) {
1459 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1460 return (error);
1461 }
1462 }
1463 NATTRINVALIDATE(np);
1464 error = nfs_getattr(vp, &nvattr, cred, p);
1465 if (error) {
1466 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1467 return (error);
1468 }
1469 if (vtype == VDIR) {
1470 /* if directory changed, purge any name cache entries */
1471 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1472 cache_purge(vp);
1473 np->n_ncmtime = nvattr.nva_mtime;
1474 }
1475 np->n_mtime = nvattr.nva_mtime;
1476 } else {
1477 error = nfs_getattr(vp, &nvattr, cred, p);
1478 if (error) {
1479 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1480 return (error);
1481 }
1482 if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) {
1483 if (vtype == VDIR) {
1484 nfs_invaldir(vp);
1485 /* purge name cache entries */
1486 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1487 cache_purge(vp);
1488 }
1489 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1490 if (error) {
1491 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1492 return (error);
1493 }
1494 if (vtype == VDIR)
1495 np->n_ncmtime = nvattr.nva_mtime;
1496 np->n_mtime = nvattr.nva_mtime;
1497 }
1498 }
1499
1500 if (vnode_isnocache(vp)) {
1501 if (!(np->n_flag & NNOCACHE)) {
1502 if (NVALIDBUFS(np)) {
1503 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1504 if (error) {
1505 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1506 return (error);
1507 }
1508 }
1509 np->n_flag |= NNOCACHE;
1510 }
1511 } else if (np->n_flag & NNOCACHE) {
1512 np->n_flag &= ~NNOCACHE;
1513 }
1514
1515 do {
1516 if (np->n_flag & NNOCACHE) {
1517 switch (vtype) {
1518 case VREG:
1519 /*
1520 * If we have only a block or so to read,
1521 * just do the rpc directly.
1522 * If we have a couple blocks or more to read,
1523 * then we'll take advantage of readahead within
1524 * this loop to try to fetch all the data in parallel
1525 */
1526 if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) {
1527 error = nfs_readrpc(vp, uio, cred, p);
1528 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1529 return (error);
1530 }
1531 nocachereadahead = 1;
1532 break;
1533 case VLNK:
1534 error = nfs_readlinkrpc(vp, uio, cred, p);
1535 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1536 return (error);
1537 case VDIR:
1538 break;
1539 default:
1540 printf(" NFSNOCACHE: type %x unexpected\n", vtype);
1541 };
1542 }
1543 switch (vtype) {
1544 case VREG:
1545 lbn = uio->uio_offset / biosize;
1546
1547 /*
1548 * Copy directly from any cached pages without grabbing the bufs.
1549 *
1550 * Note: for "nocache" reads, we don't copy directly from UBC
1551 * because any cached pages will be for readahead buffers that
1552 * need to be invalidated anyway before we finish this request.
1553 */
1554 if (!(np->n_flag & NNOCACHE) &&
1555 (uio->uio_segflg == UIO_USERSPACE32 ||
1556 uio->uio_segflg == UIO_USERSPACE64 ||
1557 uio->uio_segflg == UIO_USERSPACE)) {
1558 // LP64todo - fix this!
1559 int io_resid = uio_uio_resid(uio);
1560 diff = np->n_size - uio->uio_offset;
1561 if (diff < io_resid)
1562 io_resid = diff;
1563 if (io_resid > 0) {
1564 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1565 if (error) {
1566 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1567 return (error);
1568 }
1569 }
1570 /* count any biocache reads that we just copied directly */
1571 if (lbn != uio->uio_offset / biosize) {
1572 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
1573 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1574 }
1575 }
1576
1577 lbn = uio->uio_offset / biosize;
1578 on = uio->uio_offset % biosize;
1579
1580 /*
1581 * Start the read ahead(s), as required.
1582 */
1583 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1584 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1585 rabn = lbn + 1 + nra;
1586 if (rabn <= lastrabn) {
1587 /* we've already (tried to) read this block */
1588 /* no need to try it again... */
1589 continue;
1590 }
1591 lastrabn = rabn;
1592 if ((off_t)rabn * biosize >= (off_t)np->n_size)
1593 break;
1594 if ((np->n_flag & NNOCACHE) &&
1595 (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio))))
1596 /* for uncached readahead, don't go beyond end of request */
1597 break;
1598 /* check if block exists and is valid. */
1599 error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1600 if (error) {
1601 FSDBG_BOT(514, vp, 0xd1e000b, 1, error);
1602 return (error);
1603 }
1604 if (!rabp)
1605 continue;
1606 if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) {
1607 nfs_buf_release(rabp, 1);
1608 continue;
1609 }
1610 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1611 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1612 if (nfs_asyncio(rabp, cred)) {
1613 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1614 rabp->nb_error = EIO;
1615 nfs_buf_release(rabp, 1);
1616 }
1617 } else
1618 nfs_buf_release(rabp, 1);
1619 }
1620 }
1621
1622 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
1623 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
1624 return (0);
1625 }
1626
1627 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
1628
1629 /*
1630 * If the block is in the cache and has the required data
1631 * in a valid region, just copy it out.
1632 * Otherwise, get the block and write back/read in,
1633 * as required.
1634 */
1635 again:
1636 bufsize = biosize;
1637 // LP64todo - fix this!
1638 n = min((unsigned)(bufsize - on), uio_uio_resid(uio));
1639 diff = np->n_size - uio->uio_offset;
1640 if (diff < n)
1641 n = diff;
1642
1643 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp);
1644 if (error) {
1645 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1646 return (EINTR);
1647 }
1648
1649 /* if any pages are valid... */
1650 if (bp->nb_valid) {
1651 /* ...check for any invalid pages in the read range */
1652 int pg, firstpg, lastpg, dirtypg;
1653 dirtypg = firstpg = lastpg = -1;
1654 pg = on/PAGE_SIZE;
1655 while (pg <= (on + n - 1)/PAGE_SIZE) {
1656 if (!NBPGVALID(bp,pg)) {
1657 if (firstpg < 0)
1658 firstpg = pg;
1659 lastpg = pg;
1660 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1661 dirtypg = pg;
1662 pg++;
1663 }
1664
1665 /* if there are no invalid pages, we're all set */
1666 if (firstpg < 0) {
1667 if (bp->nb_validoff < 0) {
1668 /* valid range isn't set up, so */
1669 /* set it to what we know is valid */
1670 bp->nb_validoff = trunc_page(on);
1671 bp->nb_validend = round_page(on+n);
1672 nfs_buf_normalize_valid_range(np, bp);
1673 }
1674 goto buffer_ready;
1675 }
1676
1677 /* there are invalid pages in the read range */
1678 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1679 /* there are also dirty page(s) in the range, */
1680 /* so write the buffer out and try again */
1681 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1682 SET(bp->nb_flags, NB_ASYNC);
1683 if (bp->nb_wcred == NOCRED) {
1684 kauth_cred_ref(cred);
1685 bp->nb_wcred = cred;
1686 }
1687 error = nfs_buf_write(bp);
1688 if (error) {
1689 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1690 return (error);
1691 }
1692 goto again;
1693 }
1694 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1695 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1696 /* we need to read in more than half the buffer and the */
1697 /* buffer's not dirty, so just fetch the whole buffer */
1698 bp->nb_valid = 0;
1699 } else {
1700 /* read the page range in */
1701 uio_t auio;
1702 char uio_buf[ UIO_SIZEOF(1) ];
1703
1704 NFS_BUF_MAP(bp);
1705 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
1706 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
1707 if (!auio) {
1708 error = ENOMEM;
1709 } else {
1710 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
1711 ((lastpg - firstpg + 1) * PAGE_SIZE));
1712 error = nfs_readrpc(vp, auio, cred, p);
1713 }
1714 if (error) {
1715 if (np->n_flag & NNOCACHE)
1716 SET(bp->nb_flags, NB_NOCACHE);
1717 nfs_buf_release(bp, 1);
1718 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1719 return (error);
1720 }
1721 /* Make sure that the valid range is set to cover this read. */
1722 bp->nb_validoff = trunc_page_32(on);
1723 bp->nb_validend = round_page_32(on+n);
1724 nfs_buf_normalize_valid_range(np, bp);
1725 if (uio_resid(auio) > 0) {
1726 /* if short read, must have hit EOF, */
1727 /* so zero the rest of the range */
1728 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
1729 }
1730 /* mark the pages (successfully read) as valid */
1731 for (pg=firstpg; pg <= lastpg; pg++)
1732 NBPGVALID_SET(bp,pg);
1733 }
1734 }
1735 /* if no pages are valid, read the whole block */
1736 if (!bp->nb_valid) {
1737 SET(bp->nb_flags, NB_READ);
1738 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1739 error = nfs_doio(bp, cred, p);
1740 if (error) {
1741 if (np->n_flag & NNOCACHE)
1742 SET(bp->nb_flags, NB_NOCACHE);
1743 nfs_buf_release(bp, 1);
1744 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1745 return (error);
1746 }
1747 }
1748 buffer_ready:
1749 /* validate read range against valid range and clip */
1750 if (bp->nb_validend > 0) {
1751 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1752 if (diff < n)
1753 n = diff;
1754 }
1755 if (n > 0)
1756 NFS_BUF_MAP(bp);
1757 break;
1758 case VLNK:
1759 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks);
1760 error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp);
1761 if (error) {
1762 FSDBG_BOT(514, vp, 0xd1e0010, 0, error);
1763 return (error);
1764 }
1765 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1766 SET(bp->nb_flags, NB_READ);
1767 error = nfs_doio(bp, cred, p);
1768 if (error) {
1769 SET(bp->nb_flags, NB_ERROR);
1770 nfs_buf_release(bp, 1);
1771 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1772 return (error);
1773 }
1774 }
1775 // LP64todo - fix this!
1776 n = min(uio_uio_resid(uio), bp->nb_validend);
1777 on = 0;
1778 break;
1779 case VDIR:
1780 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
1781 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1782 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1783 return (0);
1784 }
1785 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1786 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1787 error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1788 if (error) {
1789 FSDBG_BOT(514, vp, 0xd1e0012, 0, error);
1790 return (error);
1791 }
1792 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1793 SET(bp->nb_flags, NB_READ);
1794 error = nfs_doio(bp, cred, p);
1795 if (error) {
1796 nfs_buf_release(bp, 1);
1797 }
1798 while (error == NFSERR_BAD_COOKIE) {
1799 nfs_invaldir(vp);
1800 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1801 /*
1802 * Yuck! The directory has been modified on the
1803 * server. The only way to get the block is by
1804 * reading from the beginning to get all the
1805 * offset cookies.
1806 */
1807 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
1808 if (np->n_direofoffset
1809 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1810 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1811 return (0);
1812 }
1813 error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1814 if (error) {
1815 FSDBG_BOT(514, vp, 0xd1e0013, 0, error);
1816 return (error);
1817 }
1818 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1819 SET(bp->nb_flags, NB_READ);
1820 error = nfs_doio(bp, cred, p);
1821 /*
1822 * no error + NB_INVAL == directory EOF,
1823 * use the block.
1824 */
1825 if (error == 0 && (bp->nb_flags & NB_INVAL))
1826 break;
1827 }
1828 /*
1829 * An error will throw away the block and the
1830 * for loop will break out. If no error and this
1831 * is not the block we want, we throw away the
1832 * block and go for the next one via the for loop.
1833 */
1834 if (error || tlbn < lbn)
1835 nfs_buf_release(bp, 1);
1836 }
1837 }
1838 /*
1839 * The above while is repeated if we hit another cookie
1840 * error. If we hit an error and it wasn't a cookie error,
1841 * we give up.
1842 */
1843 if (error) {
1844 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1845 return (error);
1846 }
1847 }
1848
1849 /*
1850 * If not eof and read aheads are enabled, start one.
1851 * (You need the current block first, so that you have the
1852 * directory offset cookie of the next block.)
1853 */
1854 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1855 (np->n_direofoffset == 0 ||
1856 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1857 !nfs_buf_is_incore(vp, lbn + 1)) {
1858 error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1859 if (error) {
1860 FSDBG_BOT(514, vp, 0xd1e0015, 0, error);
1861 return (error);
1862 }
1863 if (rabp) {
1864 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1865 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1866 if (nfs_asyncio(rabp, cred)) {
1867 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1868 rabp->nb_error = EIO;
1869 nfs_buf_release(rabp, 1);
1870 }
1871 } else {
1872 nfs_buf_release(rabp, 1);
1873 }
1874 }
1875 }
1876 /*
1877 * Make sure we use a signed variant of min() since
1878 * the second term may be negative.
1879 */
1880 // LP64todo - fix this!
1881 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
1882 /*
1883 * We keep track of the directory eof in
1884 * np->n_direofoffset and chop it off as an
1885 * extra step right here.
1886 */
1887 if (np->n_direofoffset &&
1888 n > np->n_direofoffset - uio->uio_offset)
1889 n = np->n_direofoffset - uio->uio_offset;
1890 /*
1891 * Make sure that we return an integral number of entries so
1892 * that any subsequent calls will start copying from the start
1893 * of the next entry.
1894 *
1895 * If the current value of n has the last entry cut short,
1896 * set n to copy everything up to the last entry instead.
1897 */
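/*
 * For illustration: dp walks whole dirent records (d_reclen each); if
 * it steps past bp->nb_data + on + n, the copy boundary fell in the
 * middle of the last record, so n is pulled back to that record's
 * start and only complete entries are returned to the caller.
 */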
1898 if (n > 0) {
1899 dp = bp->nb_data + on;
1900 while (dp < (bp->nb_data + on + n)) {
1901 direntp = (struct dirent *)dp;
1902 dp += direntp->d_reclen;
1903 }
1904 if (dp > (bp->nb_data + on + n))
1905 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1906 }
1907 break;
1908 default:
1909 printf("nfs_bioread: type %x unexpected\n", vtype);
1910 FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL);
1911 return (EINVAL);
1912 };
1913
1914 if (n > 0) {
1915 error = uiomove(bp->nb_data + on, (int)n, uio);
1916 }
1917 switch (vtype) {
1918 case VREG:
1919 if (np->n_flag & NNOCACHE)
1920 SET(bp->nb_flags, NB_NOCACHE);
1921 break;
1922 case VLNK:
1923 n = 0;
1924 break;
1925 case VDIR:
1926 break;
1927 default:
1928 break;
1929 }
1930 nfs_buf_release(bp, 1);
1931 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
1932 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1933 return (error);
1934 }
1935
1936
1937 /*
1938 * Vnode op for write using bio
1939 */
1940 int
1941 nfs_write(ap)
1942 struct vnop_write_args /* {
1943 struct vnodeop_desc *a_desc;
1944 vnode_t a_vp;
1945 struct uio *a_uio;
1946 int a_ioflag;
1947 vfs_context_t a_context;
1948 } */ *ap;
1949 {
1950 struct uio *uio = ap->a_uio;
1951 vnode_t vp = ap->a_vp;
1952 struct nfsnode *np = VTONFS(vp);
1953 proc_t p;
1954 kauth_cred_t cred;
1955 int ioflag = ap->a_ioflag;
1956 struct nfsbuf *bp;
1957 struct nfs_vattr nvattr;
1958 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1959 daddr64_t lbn;
1960 int biosize, bufsize;
1961 int n, on, error = 0;
1962 off_t boff, start, end, cureof;
1963 struct iovec_32 iov;
1964 struct uio auio;
1965
1966 FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1967
1968 #if DIAGNOSTIC
1969 if (uio->uio_rw != UIO_WRITE)
1970 panic("nfs_write mode");
1971 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
1972 panic("nfs_write proc");
1973 #endif
1974
1975 p = vfs_context_proc(ap->a_context);
1976 cred = vfs_context_ucred(ap->a_context);
1977
1978 if (vnode_vtype(vp) != VREG)
1979 return (EIO);
1980
1981 np->n_flag |= NWRBUSY;
1982
1983 if (np->n_flag & NNEEDINVALIDATE) {
1984 np->n_flag &= ~NNEEDINVALIDATE;
1985 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1986 }
1987 if (np->n_flag & NWRITEERR) {
1988 np->n_flag &= ~(NWRITEERR | NWRBUSY);
1989 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error);
1990 return (np->n_error);
1991 }
1992
1993 biosize = nmp->nm_biosize;
1994 if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
1995 nfs_fsinfo(nmp, vp, cred, p);
1996
1997 if (ioflag & (IO_APPEND | IO_SYNC)) {
1998 if (np->n_flag & NMODIFIED) {
1999 NATTRINVALIDATE(np);
2000 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2001 if (error) {
2002 np->n_flag &= ~NWRBUSY;
2003 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
2004 return (error);
2005 }
2006 }
2007 if (ioflag & IO_APPEND) {
2008 NATTRINVALIDATE(np);
2009 error = nfs_getattr(vp, &nvattr, cred, p);
2010 if (error) {
2011 np->n_flag &= ~NWRBUSY;
2012 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
2013 return (error);
2014 }
2015 uio->uio_offset = np->n_size;
2016 }
2017 }
2018 if (uio->uio_offset < 0) {
2019 np->n_flag &= ~NWRBUSY;
2020 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
2021 return (EINVAL);
2022 }
2023 if (uio_uio_resid(uio) == 0) {
2024 np->n_flag &= ~NWRBUSY;
2025 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2026 return (0);
2027 }
2028
2029 if (vnode_isnocache(vp)) {
2030 if (!(np->n_flag & NNOCACHE)) {
2031 if (NVALIDBUFS(np)) {
2032 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2033 if (error) {
2034 np->n_flag &= ~NWRBUSY;
2035 FSDBG_BOT(515, vp, 0, 0, error);
2036 return (error);
2037 }
2038 }
2039 np->n_flag |= NNOCACHE;
2040 }
2041 } else if (np->n_flag & NNOCACHE) {
2042 np->n_flag &= ~NNOCACHE;
2043 }
2044
2045 do {
2046 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes);
2047 lbn = uio->uio_offset / biosize;
2048 on = uio->uio_offset % biosize;
2049 // LP64todo - fix this
2050 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
2051 again:
2052 bufsize = biosize;
2053 /*
2054 * Get a cache block for writing. The range to be written is
2055 * (off..off+n) within the block. We ensure that the block
2056 * either has no dirty region or that the given range is
2057 * contiguous with the existing dirty region.
2058 */
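/*
 * For illustration (nm_biosize varies by mount; assume 32KB here):
 * a 4KB write at file offset 40960 maps to lbn 1, on 8192, n 4096,
 * i.e. bytes 8192..12287 of that cache block.
 */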
2059 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp);
2060 if (error) {
2061 np->n_flag &= ~NWRBUSY;
2062 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2063 return (error);
2064 }
2065 /* map the block because we know we're going to write to it */
2066 NFS_BUF_MAP(bp);
2067
2068 if (np->n_flag & NNOCACHE)
2069 SET(bp->nb_flags, NB_NOCACHE);
2070
2071 if (bp->nb_wcred == NOCRED) {
2072 kauth_cred_ref(cred);
2073 bp->nb_wcred = cred;
2074 }
2075
2076 /*
2077 * If there's already a dirty range AND dirty pages in this block we
2078 * need to send a commit AND write the dirty pages before continuing.
2079 *
2080 * If there's already a dirty range OR dirty pages in this block
2081 * and the new write range is not contiguous with the existing range,
2082 * then force the buffer to be written out now.
2083 * (We used to just extend the dirty range to cover the valid,
2084 * but unwritten, data in between also. But writing ranges
2085 * of data that weren't actually written by an application
2086 * risks overwriting some other client's data with stale data
2087 * that's just masquerading as new written data.)
2088 */
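/*
 * For example, if this block's existing dirty range is 0..512 and the
 * new write covers 8192..8704, the ranges are not contiguous
 * (on > nb_dirtyend), so the buffer is pushed with NB_STABLE below and
 * we retry via "again".  Likewise, a pre-existing dirty range combined
 * with any per-page dirty bits forces a push first.
 */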
2089 if (bp->nb_dirtyend > 0) {
2090 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
2091 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
2092 /* write/commit buffer "synchronously" */
2093 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2094 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2095 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2096 error = nfs_buf_write(bp);
2097 if (error) {
2098 np->n_flag &= ~NWRBUSY;
2099 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2100 return (error);
2101 }
2102 goto again;
2103 }
2104 } else if (bp->nb_dirty) {
2105 int firstpg, lastpg;
2106 u_int32_t pagemask;
2107 /* calculate write range pagemask */
2108 firstpg = on/PAGE_SIZE;
2109 lastpg = (on+n-1)/PAGE_SIZE;
2110 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
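/*
 * e.g. assuming 4KB pages, a write with on 4096 and n 8192 gives
 * firstpg 1, lastpg 2, pagemask 0x6 -- the bits for pages 1 and 2.
 */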
2111 /* check if there are dirty pages outside the write range */
2112 if (bp->nb_dirty & ~pagemask) {
2113 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
2114 /* write/commit buffer "synchronously" */
2115 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2116 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2117 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2118 error = nfs_buf_write(bp);
2119 if (error) {
2120 np->n_flag &= ~NWRBUSY;
2121 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2122 return (error);
2123 }
2124 goto again;
2125 }
2126 /* if the first or last pages are already dirty */
2127 /* make sure that the dirty range encompasses those pages */
2128 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
2129 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
2130 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
2131 if (NBPGDIRTY(bp,lastpg)) {
2132 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
2133 /* clip to EOF */
2134 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2135 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2136 } else
2137 bp->nb_dirtyend = on+n;
2138 }
2139 }
2140
2141 /*
2142 * Are we extending the size of the file with this write?
2143 * If so, update file size now that we have the block.
2144 * If there was a partial buf at the old eof, validate
2145 * and zero the new bytes.
2146 */
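/*
 * For illustration with a 32KB biosize: if the old EOF is at 40000
 * (eofbn 1, eofoff 7232) and this write lands in block 2, the old EOF
 * buffer is picked up below (if it's still cached) so the bytes from
 * offset 7232 to the end of block 1 can be zeroed, subject to the
 * page-validity rules described further down.
 */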
2147 cureof = (off_t)np->n_size;
2148 if (uio->uio_offset + n > (off_t)np->n_size) {
2149 struct nfsbuf *eofbp = NULL;
2150 daddr64_t eofbn = np->n_size / biosize;
2151 int eofoff = np->n_size % biosize;
2152 int neweofoff = (uio->uio_offset + n) % biosize;
2153
2154 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
2155
2156 if (eofoff && (eofbn < lbn)) {
2157 error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp);
2158 if (error) {
2159 np->n_flag &= ~NWRBUSY;
2160 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2161 return (error);
2162 }
2163 }
2164
2165 /* if we're extending within the same last block */
2166 /* and the block is flagged as being cached... */
2167 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2168 /* ...check that all pages in buffer are valid */
2169 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2170 u_int32_t pagemask;
2171 /* pagemask only has to extend to last page being written to */
2172 pagemask = (1 << (endpg+1)) - 1;
2173 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2174 if ((bp->nb_valid & pagemask) != pagemask) {
2175 /* zerofill any hole */
2176 if (on > bp->nb_validend) {
2177 int i;
2178 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2179 NBPGVALID_SET(bp, i);
2180 NFS_BUF_MAP(bp);
2181 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2182 bzero((char *)bp->nb_data + bp->nb_validend,
2183 on - bp->nb_validend);
2184 }
2185 /* zerofill any trailing data in the last page */
2186 if (neweofoff) {
2187 NFS_BUF_MAP(bp);
2188 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2189 bzero((char *)bp->nb_data + neweofoff,
2190 PAGE_SIZE - (neweofoff & PAGE_MASK));
2191 }
2192 }
2193 }
2194 np->n_flag |= NMODIFIED;
2195 np->n_size = uio->uio_offset + n;
2196 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
2197 if (eofbp) {
2198 /*
2199 * We may need to zero any previously invalid data
2200 * after the old EOF in the previous EOF buffer.
2201 *
2202 * For the old last page, don't zero bytes if there
2203 * are invalid bytes in that page (i.e. the page isn't
2204 * currently valid).
2205 * For pages after the old last page, zero them and
2206 * mark them as valid.
2207 */
2208 char *d;
2209 int i;
2210 if (np->n_flag & NNOCACHE)
2211 SET(eofbp->nb_flags, NB_NOCACHE);
2212 NFS_BUF_MAP(eofbp);
2213 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2214 d = eofbp->nb_data;
2215 i = eofoff/PAGE_SIZE;
2216 while (eofoff < biosize) {
2217 int poff = eofoff & PAGE_MASK;
2218 if (!poff || NBPGVALID(eofbp,i)) {
2219 bzero(d + eofoff, PAGE_SIZE - poff);
2220 NBPGVALID_SET(eofbp, i);
2221 }
2222 if (bp->nb_validend == eofoff)
2223 bp->nb_validend += PAGE_SIZE - poff;
2224 eofoff += PAGE_SIZE - poff;
2225 i++;
2226 }
2227 nfs_buf_release(eofbp, 1);
2228 }
2229 }
2230 /*
2231 * If dirtyend exceeds file size, chop it down. This should
2232 * not occur unless there is a race.
2233 */
2234 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2235 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2236 /*
2237 * UBC doesn't handle partial pages, so we need to make sure
2238 * that any pages left in the page cache are completely valid.
2239 *
2240 * Writes that are smaller than a block are delayed if they
2241 * don't extend to the end of the block.
2242 *
2243 * If the block isn't (completely) cached, we may need to read
2244 * in some parts of pages that aren't covered by the write.
2245 * If the write offset (on) isn't page aligned, we'll need to
2246 * read the start of the first page being written to. Likewise,
2247 * if the offset of the end of the write (on+n) isn't page aligned,
2248 * we'll need to read the end of the last page being written to.
2249 *
2250 * Notes:
2251 * We don't want to read anything we're just going to write over.
2252 * We don't want to issue multiple I/Os if we don't have to
2253 * (because they're synchronous rpcs).
2254 * We don't want to read anything we already have modified in the
2255 * page cache.
2256 */
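/*
 * For illustration, assuming 4KB pages: a 100-byte write at on 4090
 * into an uncached block spans the end of page 0 and the start of
 * page 1.  If neither page is valid this computes start 0, end 8192,
 * and that whole span is read in a single RPC (or just zeroed when it
 * lies beyond the current EOF) before the new data is copied in.
 */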
2257 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
2258 int firstpg, lastpg, dirtypg;
2259 int firstpgoff, lastpgoff;
2260 start = end = -1;
2261 firstpg = on/PAGE_SIZE;
2262 firstpgoff = on & PAGE_MASK;
2263 lastpg = (on+n-1)/PAGE_SIZE;
2264 lastpgoff = (on+n) & PAGE_MASK;
2265 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2266 /* need to read start of first page */
2267 start = firstpg * PAGE_SIZE;
2268 end = start + firstpgoff;
2269 }
2270 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2271 /* need to read end of last page */
2272 if (start < 0)
2273 start = (lastpg * PAGE_SIZE) + lastpgoff;
2274 end = (lastpg + 1) * PAGE_SIZE;
2275 }
2276 if (end > start) {
2277 /* need to read the data in range: start...end-1 */
2278
2279 /* first, check for dirty pages in between */
2280 /* if there are, we'll have to do two reads because */
2281 /* we don't want to overwrite the dirty pages. */
2282 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2283 if (NBPGDIRTY(bp,dirtypg))
2284 break;
2285
2286 /* if start is at beginning of page, try */
2287 /* to get any preceding pages as well. */
2288 if (!(start & PAGE_MASK)) {
2289 /* stop at next dirty/valid page or start of block */
2290 for (; start > 0; start-=PAGE_SIZE)
2291 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2292 break;
2293 }
2294
2295 NFS_BUF_MAP(bp);
2296 /* setup uio for read(s) */
2297 boff = NBOFF(bp);
2298 auio.uio_iovs.iov32p = &iov;
2299 auio.uio_iovcnt = 1;
2300 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2301 auio.uio_segflg = UIO_SYSSPACE;
2302 #else
2303 auio.uio_segflg = UIO_SYSSPACE32;
2304 #endif
2305 auio.uio_rw = UIO_READ;
2306
2307 if (dirtypg <= (end-1)/PAGE_SIZE) {
2308 /* there's a dirty page in the way, so just do two reads */
2309 /* we'll read the preceding data here */
2310 auio.uio_offset = boff + start;
2311 iov.iov_len = on - start;
2312 uio_uio_resid_set(&auio, iov.iov_len);
2313 iov.iov_base = (uintptr_t) bp->nb_data + start;
2314 error = nfs_readrpc(vp, &auio, cred, p);
2315 if (error) {
2316 bp->nb_error = error;
2317 SET(bp->nb_flags, NB_ERROR);
2318 printf("nfs_write: readrpc %d\n", error);
2319 }
2320 if (uio_uio_resid(&auio) > 0) {
2321 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01);
2322 // LP64todo - fix this
2323 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2324 }
2325 /* update validoff/validend if necessary */
2326 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2327 bp->nb_validoff = start;
2328 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2329 bp->nb_validend = on;
2330 if ((off_t)np->n_size > boff + bp->nb_validend)
2331 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2332 /* validate any pages before the write offset */
2333 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2334 NBPGVALID_SET(bp, start/PAGE_SIZE);
2335 /* adjust start to read any trailing data */
2336 start = on+n;
2337 }
2338
2339 /* if end is at end of page, try to */
2340 /* get any following pages as well. */
2341 if (!(end & PAGE_MASK)) {
2342 /* stop at next valid page or end of block */
2343 for (; end < bufsize; end+=PAGE_SIZE)
2344 if (NBPGVALID(bp,end/PAGE_SIZE))
2345 break;
2346 }
2347
2348 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2349 /*
2350 * Either this entire read is beyond the current EOF
2351 * or the range that we won't be modifying (on+n...end)
2352 * is all beyond the current EOF.
2353 * No need to make a trip across the network to
2354 * read nothing. So, just zero the buffer instead.
2355 */
2356 FSDBG(516, bp, start, end - start, 0xd00dee00);
2357 bzero(bp->nb_data + start, end - start);
2358 } else {
2359 /* now we'll read the (rest of the) data */
2360 auio.uio_offset = boff + start;
2361 iov.iov_len = end - start;
2362 uio_uio_resid_set(&auio, iov.iov_len);
2363 iov.iov_base = (uintptr_t) (bp->nb_data + start);
2364 error = nfs_readrpc(vp, &auio, cred, p);
2365 if (error) {
2366 bp->nb_error = error;
2367 SET(bp->nb_flags, NB_ERROR);
2368 printf("nfs_write: readrpc %d\n", error);
2369 }
2370 if (uio_uio_resid(&auio) > 0) {
2371 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02);
2372 // LP64todo - fix this
2373 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2374 }
2375 }
2376 /* update validoff/validend if necessary */
2377 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2378 bp->nb_validoff = start;
2379 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2380 bp->nb_validend = end;
2381 if ((off_t)np->n_size > boff + bp->nb_validend)
2382 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2383 /* validate any pages before the write offset's page */
2384 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2385 NBPGVALID_SET(bp, start/PAGE_SIZE);
2386 /* validate any pages after the range of pages being written to */
2387 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2388 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2389 /* Note: pages being written to will be validated when written */
2390 }
2391 }
2392
2393 if (ISSET(bp->nb_flags, NB_ERROR)) {
2394 error = bp->nb_error;
2395 nfs_buf_release(bp, 1);
2396 np->n_flag &= ~NWRBUSY;
2397 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2398 return (error);
2399 }
2400
2401 np->n_flag |= NMODIFIED;
2402
2403 NFS_BUF_MAP(bp);
2404 error = uiomove((char *)bp->nb_data + on, n, uio);
2405 if (error) {
2406 SET(bp->nb_flags, NB_ERROR);
2407 nfs_buf_release(bp, 1);
2408 np->n_flag &= ~NWRBUSY;
2409 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2410 return (error);
2411 }
2412
2413 /* validate any pages written to */
2414 start = on & ~PAGE_MASK;
2415 for (; start < on+n; start += PAGE_SIZE) {
2416 NBPGVALID_SET(bp, start/PAGE_SIZE);
2417 /*
2418 * This may seem a little weird, but we don't actually set the
2419 * dirty bits for writes. This is because we keep the dirty range
2420 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2421 * delayed writes, when we give the pages back to the VM we don't
2422 * want to keep them marked dirty, because when we later write the
2423 * buffer we won't be able to tell which pages were written dirty
2424 * and which pages were mmapped and dirtied.
2425 */
2426 }
2427 if (bp->nb_dirtyend > 0) {
2428 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2429 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2430 } else {
2431 bp->nb_dirtyoff = on;
2432 bp->nb_dirtyend = on + n;
2433 }
2434 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2435 bp->nb_validoff > bp->nb_dirtyend) {
2436 bp->nb_validoff = bp->nb_dirtyoff;
2437 bp->nb_validend = bp->nb_dirtyend;
2438 } else {
2439 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2440 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2441 }
2442 if (!ISSET(bp->nb_flags, NB_CACHE))
2443 nfs_buf_normalize_valid_range(np, bp);
2444
2445 /*
2446 * Since this block is being modified, it must be written
2447 * again and not just committed.
2448 */
2449 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2450 np->n_needcommitcnt--;
2451 CHECK_NEEDCOMMITCNT(np);
2452 }
2453 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2454
2455 if (ioflag & IO_SYNC) {
2456 bp->nb_proc = p;
2457 error = nfs_buf_write(bp);
2458 if (error) {
2459 np->n_flag &= ~NWRBUSY;
2460 FSDBG_BOT(515, vp, uio->uio_offset,
2461 uio_uio_resid(uio), error);
2462 return (error);
2463 }
2464 } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) {
2465 bp->nb_proc = NULL;
2466 SET(bp->nb_flags, NB_ASYNC);
2467 nfs_buf_write(bp);
2468 } else
2469 nfs_buf_write_delayed(bp, p);
2470
2471 if (np->n_needcommitcnt > (nfsbufcnt/16))
2472 nfs_flushcommits(vp, p, 1);
2473
2474 } while (uio_uio_resid(uio) > 0 && n > 0);
2475
2476 if (np->n_flag & NNOCACHE) {
2477 /* make sure all the buffers are flushed out */
2478 error = nfs_flush(vp, MNT_WAIT, cred, p, 0);
2479 }
2480
2481 np->n_flag &= ~NWRBUSY;
2482 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2483 return (error);
2484 }
2485
2486 /*
2487 * Flush out and invalidate all buffers associated with a vnode.
2488 * Called with the underlying object locked.
2489 */
2490 static int
2491 nfs_vinvalbuf_internal(
2492 vnode_t vp,
2493 int flags,
2494 kauth_cred_t cred,
2495 proc_t p,
2496 int slpflag,
2497 int slptimeo)
2498 {
2499 struct nfsbuf *bp;
2500 struct nfsbuflists blist;
2501 int list, error = 0;
2502 struct nfsnode *np = VTONFS(vp);
2503
2504 if (flags & V_SAVE) {
2505 if ((error = nfs_flush(vp, MNT_WAIT, cred, p,
2506 (flags & V_IGNORE_WRITEERR))))
2507 return (error);
2508 if (!LIST_EMPTY(&np->n_dirtyblkhd))
2509 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2510 vp, LIST_FIRST(&np->n_dirtyblkhd));
2511 }
2512
2513 lck_mtx_lock(nfs_buf_mutex);
2514 for (;;) {
2515 list = NBI_CLEAN;
2516 if (nfs_buf_iterprepare(np, &blist, list)) {
2517 list = NBI_DIRTY;
2518 if (nfs_buf_iterprepare(np, &blist, list))
2519 break;
2520 }
2521 while ((bp = LIST_FIRST(&blist))) {
2522 LIST_REMOVE(bp, nb_vnbufs);
2523 if (list == NBI_CLEAN)
2524 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2525 else
2526 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2527 nfs_buf_refget(bp);
2528 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2529 FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags);
2530 if (error != EAGAIN) {
2531 FSDBG(554, vp, bp, -1, error);
2532 nfs_buf_refrele(bp);
2533 nfs_buf_itercomplete(np, &blist, list);
2534 lck_mtx_unlock(nfs_buf_mutex);
2535 return (error);
2536 }
2537 }
2538 nfs_buf_refrele(bp);
2539 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2540 lck_mtx_unlock(nfs_buf_mutex);
2541 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp &&
2542 (NBOFF(bp) < (off_t)np->n_size)) {
2543 /* XXX extra paranoia: make sure we're not */
2544 /* somehow leaving any dirty data around */
2545 int mustwrite = 0;
2546 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
2547 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
2548 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2549 error = nfs_buf_upl_setup(bp);
2550 if (error == EINVAL) {
2551 /* vm object must no longer exist */
2552 /* hopefully we don't need to do */
2553 /* anything for this buffer */
2554 } else if (error)
2555 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
2556 bp->nb_valid = bp->nb_dirty = 0;
2557 }
2558 nfs_buf_upl_check(bp);
2559 /* check for any dirty data before the EOF */
2560 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2561 /* clip dirty range to EOF */
2562 if (bp->nb_dirtyend > end)
2563 bp->nb_dirtyend = end;
2564 mustwrite++;
2565 }
2566 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2567 /* also make sure we'll have a credential to do the write */
2568 if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) {
2569 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
2570 mustwrite = 0;
2571 }
2572 if (mustwrite) {
2573 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2574 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2575 panic("nfs_vinvalbuf: dirty buffer without upl");
2576 /* gotta write out dirty data before invalidating */
2577 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2578 /* (NB_NOCACHE indicates buffer should be discarded) */
2579 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2580 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2581 if (bp->nb_wcred == NOCRED) {
2582 kauth_cred_ref(cred);
2583 bp->nb_wcred = cred;
2584 }
2585 error = nfs_buf_write(bp);
2586 // Note: bp has been released
2587 if (error) {
2588 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2589 np->n_error = error;
2590 np->n_flag |= NWRITEERR;
2591 /*
2592 * There was a write error and we need to
2593 * invalidate attrs to sync with server.
2594 * (if this write was extending the file,
2595 * we may no longer know the correct size)
2596 */
2597 NATTRINVALIDATE(np);
2598 error = 0;
2599 }
2600 lck_mtx_lock(nfs_buf_mutex);
2601 continue;
2602 }
2603 }
2604 SET(bp->nb_flags, NB_INVAL);
2605 // hold off on FREEUPs until we're done here
2606 nfs_buf_release(bp, 0);
2607 lck_mtx_lock(nfs_buf_mutex);
2608 }
2609 nfs_buf_itercomplete(np, &blist, list);
2610 }
2611 lck_mtx_unlock(nfs_buf_mutex);
2612 NFS_BUF_FREEUP();
2613 if (NVALIDBUFS(np))
2614 panic("nfs_vinvalbuf: flush failed");
2615 return (0);
2616 }
2617
2618
2619 /*
2620 * Flush and invalidate all dirty buffers. If another process is already
2621 * doing the flush, just wait for completion.
2622 */
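/*
 * Roughly: NFLUSHINPROG marks a flush in progress; later callers set
 * NFLUSHWANT and tsleep on &np->n_flag until the active flusher clears
 * the flags and wakes them up.
 */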
2623 int
2624 nfs_vinvalbuf(
2625 vnode_t vp,
2626 int flags,
2627 kauth_cred_t cred,
2628 proc_t p,
2629 int intrflg)
2630 {
2631 struct nfsnode *np = VTONFS(vp);
2632 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
2633 int error = 0, slpflag, slptimeo;
2634 off_t size;
2635
2636 FSDBG_TOP(554, vp, flags, intrflg, 0);
2637
2638 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2639 intrflg = 0;
2640 if (intrflg) {
2641 slpflag = PCATCH;
2642 slptimeo = 2 * hz;
2643 } else {
2644 slpflag = 0;
2645 slptimeo = 0;
2646 }
2647 /*
2648 * First wait for any other process doing a flush to complete.
2649 */
2650 while (np->n_flag & NFLUSHINPROG) {
2651 np->n_flag |= NFLUSHWANT;
2652 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2653 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2654 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2655 if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
2656 FSDBG_BOT(554, vp, flags, intrflg, error);
2657 return (error);
2658 }
2659 }
2660
2661 /*
2662 * Now, flush as required.
2663 */
2664 np->n_flag |= NFLUSHINPROG;
2665 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2666 while (error) {
2667 FSDBG(554, vp, 0, 0, error);
2668 error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p);
2669 if (error) {
2670 np->n_flag &= ~NFLUSHINPROG;
2671 if (np->n_flag & NFLUSHWANT) {
2672 np->n_flag &= ~NFLUSHWANT;
2673 wakeup((caddr_t)&np->n_flag);
2674 }
2675 FSDBG_BOT(554, vp, flags, intrflg, error);
2676 return (error);
2677 }
2678 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2679 }
2680 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2681 if (np->n_flag & NFLUSHWANT) {
2682 np->n_flag &= ~NFLUSHWANT;
2683 wakeup((caddr_t)&np->n_flag);
2684 }
2685 /*
2686 * get the pages out of vm also
2687 */
2688 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
2689 int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE);
2690 if (!rv)
2691 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
2692 }
2693
2694 FSDBG_BOT(554, vp, flags, intrflg, 0);
2695 return (0);
2696 }
2697
2698 /*
2699 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2700 * This is mainly to avoid queueing async I/O requests when the nfsiods
2701 * are all hung on a dead server.
2702 */
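/*
 * Roughly: if a free nfsiod is found (or one is already servicing this
 * mount), the buffer is queued on nm_bufq and 0 is returned; otherwise
 * EIO is returned so the caller does the I/O synchronously.  A NULL bp
 * just pokes an nfsiod to go process the delayed-write list.
 */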
2703 int
2704 nfs_asyncio(bp, cred)
2705 struct nfsbuf *bp;
2706 kauth_cred_t cred;
2707 {
2708 struct nfsmount *nmp;
2709 int i;
2710 int gotiod;
2711 int slpflag = 0;
2712 int slptimeo = 0;
2713 int error, error2;
2714 void *wakeme = NULL;
2715 struct timespec ts;
2716
2717 if (nfs_numasync == 0)
2718 return (EIO);
2719
2720 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2721
2722 nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL);
2723 again:
2724 if (nmp && nmp->nm_flag & NFSMNT_INT)
2725 slpflag = PCATCH;
2726 gotiod = FALSE;
2727
2728 lck_mtx_lock(nfs_iod_mutex);
2729
2730 /* no nfsbuf means tell nfsiod to process delwri list */
2731 if (!bp)
2732 nfs_ioddelwri = 1;
2733
2734 /*
2735 * Find a free iod to process this request.
2736 */
2737 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2738 if (nfs_iodwant[i]) {
2739 /*
2740 * Found one, so wake it up and tell it which
2741 * mount to process.
2742 */
2743 nfs_iodwant[i] = NULL;
2744 nfs_iodmount[i] = nmp;
2745 if (nmp)
2746 nmp->nm_bufqiods++;
2747 wakeme = &nfs_iodwant[i];
2748 gotiod = TRUE;
2749 break;
2750 }
2751
2752 /* if we're just poking the delwri list, we're done */
2753 if (!bp) {
2754 lck_mtx_unlock(nfs_iod_mutex);
2755 if (wakeme)
2756 wakeup(wakeme);
2757 FSDBG_BOT(552, bp, 0x10101010, wakeme, 0);
2758 return (0);
2759 }
2760
2761 /*
2762 * If none are free, we may already have an iod working on this mount
2763 * point. If so, it will process our request.
2764 */
2765 if (!gotiod) {
2766 if (nmp->nm_bufqiods > 0) {
2767 gotiod = TRUE;
2768 }
2769 }
2770
2771 /*
2772 * If we have an iod which can process the request, then queue
2773 * the buffer.
2774 */
2775 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2776 if (gotiod) {
2777 /*
2778 * Ensure that the queue never grows too large.
2779 */
2780 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2781 if (ISSET(bp->nb_flags, NB_IOD)) {
2782 /* An nfsiod is attempting this async operation so */
2783 /* we must not fall asleep on the bufq because we */
2784 /* could be waiting on ourselves. Just return an error */
2785 /* and we'll do this operation synchronously. */
2786 goto out;
2787 }
2788 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2789 nmp->nm_bufqwant = TRUE;
2790
2791 ts.tv_sec = (slptimeo/100);
2792 /* the hz value is 100, so each tick is 10ms */
2793 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
2794
2795 error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO,
2796 "nfsaio", &ts);
2797 if (error) {
2798 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2799 if (error2) {
2800 lck_mtx_unlock(nfs_iod_mutex);
2801 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2802 return (error2);
2803 }
2804 if (slpflag == PCATCH) {
2805 slpflag = 0;
2806 slptimeo = 2 * hz;
2807 }
2808 }
2809 /*
2810 * We might have lost our iod while sleeping,
2811 * so check and loop if necessary.
2812 */
2813 if (nmp->nm_bufqiods == 0) {
2814 lck_mtx_unlock(nfs_iod_mutex);
2815 goto again;
2816 }
2817 }
2818
2819 if (ISSET(bp->nb_flags, NB_READ)) {
2820 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2821 kauth_cred_ref(cred);
2822 bp->nb_rcred = cred;
2823 }
2824 } else {
2825 SET(bp->nb_flags, NB_WRITEINPROG);
2826 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2827 kauth_cred_ref(cred);
2828 bp->nb_wcred = cred;
2829 }
2830 }
2831
2832 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2833 nmp->nm_bufqlen++;
2834 lck_mtx_unlock(nfs_iod_mutex);
2835 if (wakeme)
2836 wakeup(wakeme);
2837 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2838 return (0);
2839 }
2840
2841 out:
2842 lck_mtx_unlock(nfs_iod_mutex);
2843 /*
2844 * All the iods are busy on other mounts, so return EIO to
2845 * force the caller to process the i/o synchronously.
2846 */
2847 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2848 return (EIO);
2849 }
2850
2851 /*
2852 * Do an I/O operation to/from a cache block. This may be called
2853 * synchronously or from an nfsiod.
2854 */
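/*
 * Roughly: for NB_READ buffers the matching read RPC is issued (read,
 * readlink, or readdir/readdirplus) and the valid range is updated;
 * for writes, a buffer that only needs a commit gets a COMMIT RPC,
 * otherwise the dirty range and any remaining dirty pages are pushed
 * with write RPCs before nfs_buf_iodone() finishes the buffer.
 */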
2855 int
2856 nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p)
2857 {
2858 struct uio *uiop;
2859 vnode_t vp;
2860 struct nfsnode *np;
2861 struct nfsmount *nmp;
2862 int error = 0, diff, len, iomode, must_commit = 0, invalidate = 0;
2863 struct uio uio;
2864 struct iovec_32 io;
2865 enum vtype vtype;
2866
2867 vp = bp->nb_vp;
2868 vtype = vnode_vtype(vp);
2869 np = VTONFS(vp);
2870 nmp = VFSTONFS(vnode_mount(vp));
2871 uiop = &uio;
2872 uiop->uio_iovs.iov32p = &io;
2873 uiop->uio_iovcnt = 1;
2874 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2875 uiop->uio_segflg = UIO_SYSSPACE;
2876 #else
2877 uiop->uio_segflg = UIO_SYSSPACE32;
2878 #endif
2879
2880 /*
2881 * we've decided to perform I/O for this block,
2882 * so it can't already be NB_DONE. So, clear it.
2883 */
2884 if (ISSET(bp->nb_flags, NB_DONE)) {
2885 if (!ISSET(bp->nb_flags, NB_ASYNC))
2886 panic("nfs_doio: done and not async");
2887 CLR(bp->nb_flags, NB_DONE);
2888 }
2889 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2890 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2891 bp->nb_dirtyend);
2892
2893 if (ISSET(bp->nb_flags, NB_READ)) {
2894 if (vtype == VREG)
2895 NFS_BUF_MAP(bp);
2896 io.iov_len = bp->nb_bufsize;
2897 uio_uio_resid_set(uiop, io.iov_len);
2898 io.iov_base = (uintptr_t) bp->nb_data;
2899 uiop->uio_rw = UIO_READ;
2900 switch (vtype) {
2901 case VREG:
2902 uiop->uio_offset = NBOFF(bp);
2903 OSAddAtomic(1, (SInt32*)&nfsstats.read_bios);
2904 error = nfs_readrpc(vp, uiop, cr, p);
2905 FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error);
2906 if (!error) {
2907 /* update valid range */
2908 bp->nb_validoff = 0;
2909 if (uio_uio_resid(uiop) != 0) {
2910 /*
2911 * If len > 0, there is a hole in the file and
2912 * no writes after the hole have been pushed to
2913 * the server yet.
2914 * Just zero fill the rest of the valid area.
2915 */
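/*
 * For illustration: with a 32KB buffer at file offset 0, if the server
 * returns only 16KB and n_size is 20480, then diff is 16384, len is
 * 4096, bytes 16384..20479 get zeroed, and nb_validend ends up at
 * 20480.
 */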
2916 // LP64todo - fix this
2917 diff = bp->nb_bufsize - uio_uio_resid(uiop);
2918 len = np->n_size - (NBOFF(bp) + diff);
2919 if (len > 0) {
2920 // LP64todo - fix this
2921 len = min(len, uio_uio_resid(uiop));
2922 bzero((char *)bp->nb_data + diff, len);
2923 bp->nb_validend = diff + len;
2924 FSDBG(258, diff, len, 0, 1);
2925 } else
2926 bp->nb_validend = diff;
2927 } else
2928 bp->nb_validend = bp->nb_bufsize;
2929 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2930 if (bp->nb_validend & PAGE_MASK) {
2931 /* valid range ends in the middle of a page so we */
2932 /* need to zero-fill any invalid data at the end */
2933 /* of the last page */
2934 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2935 bp->nb_bufsize - bp->nb_validend);
2936 FSDBG(258, bp->nb_validend,
2937 bp->nb_bufsize - bp->nb_validend, 0, 2);
2938 }
2939 }
2940 break;
2941 case VLNK:
2942 uiop->uio_offset = (off_t)0;
2943 OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios);
2944 error = nfs_readlinkrpc(vp, uiop, cr, p);
2945 if (!error) {
2946 bp->nb_validoff = 0;
2947 bp->nb_validend = uiop->uio_offset;
2948 }
2949 break;
2950 case VDIR:
2951 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
2952 uiop->uio_offset = NBOFF(bp);
2953 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2954 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2955 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2956 error = nfs_readdirplusrpc(vp, uiop, cr, p);
2957 if (error == NFSERR_NOTSUPP)
2958 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2959 }
2960 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2961 error = nfs_readdirrpc(vp, uiop, cr, p);
2962 if (!error) {
2963 bp->nb_validoff = 0;
2964 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2965 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2966 }
2967 break;
2968 default:
2969 printf("nfs_doio: type %x unexpected\n", vtype);
2970 break;
2971 };
2972 if (error) {
2973 SET(bp->nb_flags, NB_ERROR);
2974 bp->nb_error = error;
2975 }
2976
2977 } else {
2978 /* we're doing a write */
2979 int doff, dend = 0;
2980
2981 /* We need to make sure the pages are locked before doing I/O. */
2982 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) {
2983 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2984 error = nfs_buf_upl_setup(bp);
2985 if (error) {
2986 printf("nfs_doio: upl create failed %d\n", error);
2987 SET(bp->nb_flags, NB_ERROR);
2988 bp->nb_error = EIO;
2989 return (EIO);
2990 }
2991 nfs_buf_upl_check(bp);
2992 }
2993 }
2994
2995 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2996 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2997 /*
2998 * There are pages marked dirty that need to be written out.
2999 *
3000 * We don't want to just combine the write range with the
3001 * range of pages that are dirty because that could cause us
3002 * to write data that wasn't actually written to.
3003 * We also don't want to write data more than once.
3004 *
3005 * If the dirty range just needs to be committed, we do that.
3006 * Otherwise, we write the dirty range and clear the dirty bits
3007 * for any COMPLETE pages covered by that range.
3008 * If there are dirty pages left after that, we write out the
3009 * parts that we haven't written yet.
3010 */
3011 }
3012
3013 /*
3014 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
3015 * an actual write will have to be done.
3016 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
3017 */
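/*
 * e.g. a buffer whose data went out earlier with an UNSTABLE write
 * only needs a COMMIT of nb_dirtyoff..nb_dirtyend here.  If the server
 * answers NFSERR_STALEWRITEVERF (its write verifier changed, typically
 * after a reboot), nfs_clearcommit() below drops the needcommit state
 * so the data gets written again.
 */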
3018 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
3019 doff = NBOFF(bp) + bp->nb_dirtyoff;
3020 SET(bp->nb_flags, NB_WRITEINPROG);
3021 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
3022 bp->nb_wcred, bp->nb_proc);
3023 CLR(bp->nb_flags, NB_WRITEINPROG);
3024 if (!error) {
3025 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3026 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3027 np->n_needcommitcnt--;
3028 CHECK_NEEDCOMMITCNT(np);
3029 } else if (error == NFSERR_STALEWRITEVERF)
3030 nfs_clearcommit(vnode_mount(vp));
3031 }
3032
3033 if (!error && bp->nb_dirtyend > 0) {
3034 /* there's a dirty range that needs to be written out */
3035 u_int32_t pagemask;
3036 int firstpg, lastpg;
3037
3038 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
3039 bp->nb_dirtyend = np->n_size - NBOFF(bp);
3040
3041 NFS_BUF_MAP(bp);
3042
3043 doff = bp->nb_dirtyoff;
3044 dend = bp->nb_dirtyend;
3045
3046 /* if doff page is dirty, move doff to start of page */
3047 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
3048 doff -= doff & PAGE_MASK;
3049 /* try to expand write range to include preceding dirty pages */
3050 if (!(doff & PAGE_MASK))
3051 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
3052 doff -= PAGE_SIZE;
3053 /* if dend page is dirty, move dend to start of next page */
3054 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
3055 dend = round_page_32(dend);
3056 /* try to expand write range to include trailing dirty pages */
3057 if (!(dend & PAGE_MASK))
3058 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
3059 dend += PAGE_SIZE;
3060 /* make sure to keep dend clipped to EOF */
3061 if (NBOFF(bp) + dend > (off_t)np->n_size)
3062 dend = np->n_size - NBOFF(bp);
3063 /* calculate range of complete pages being written */
3064 firstpg = round_page_32(doff) / PAGE_SIZE;
3065 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
3066 /* calculate mask for that page range */
3067 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
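/*
 * e.g. assuming 4KB pages, doff 1000 and dend 9000 give firstpg 1 and
 * lastpg 1, so pagemask is 0x2: only page 1 is completely covered by
 * this write, and any other pages' dirty bits survive (forcing
 * FILESYNC below).
 */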
3068
3069 /* compare page mask to nb_dirty; if there are other dirty pages */
3070 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
3071 /* not needcommit/stable; otherwise write FILESYNC */
3072 if (bp->nb_dirty & ~pagemask)
3073 iomode = NFSV3WRITE_FILESYNC;
3074 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
3075 iomode = NFSV3WRITE_UNSTABLE;
3076 else
3077 iomode = NFSV3WRITE_FILESYNC;
3078
3079 /* write the dirty range */
3080 io.iov_len = dend - doff;
3081 uio_uio_resid_set(uiop, io.iov_len);
3082 uiop->uio_offset = NBOFF(bp) + doff;
3083 io.iov_base = (uintptr_t) bp->nb_data + doff;
3084 uiop->uio_rw = UIO_WRITE;
3085
3086 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3087
3088 SET(bp->nb_flags, NB_WRITEINPROG);
3089 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3090 if (must_commit)
3091 nfs_clearcommit(vnode_mount(vp));
3092 /* clear dirty bits for pages we've written */
3093 if (!error)
3094 bp->nb_dirty &= ~pagemask;
3095 /* set/clear needcommit flag */
3096 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
3097 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3098 np->n_needcommitcnt++;
3099 SET(bp->nb_flags, NB_NEEDCOMMIT);
3100 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
3101 bp->nb_dirtyoff = doff;
3102 bp->nb_dirtyend = dend;
3103 } else {
3104 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3105 np->n_needcommitcnt--;
3106 CHECK_NEEDCOMMITCNT(np);
3107 }
3108 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3109 }
3110 CLR(bp->nb_flags, NB_WRITEINPROG);
3111 /*
3112 * For an interrupted write, the buffer is still valid and the write
3113 * hasn't been pushed to the server yet, so we can't set NB_ERROR;
3114 * instead we report the interruption by setting NB_EINTR. For the
3115 * NB_ASYNC case, NB_EINTR is not relevant.
3116 *
3117 * For the case of a V3 write rpc not being committed to stable
3118 * storage, the block is still dirty and requires either a commit rpc
3119 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
3120 * block is reused. This is indicated by setting the NB_DELWRI and
3121 * NB_NEEDCOMMIT flags.
3122 */
3123 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
3124 CLR(bp->nb_flags, NB_INVAL);
3125 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3126 SET(bp->nb_flags, NB_DELWRI);
3127 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
3128 NFSBUFCNTCHK(0);
3129 }
3130 FSDBG(261, bp->nb_validoff, bp->nb_validend,
3131 bp->nb_bufsize, 0);
3132 /*
3133 * Since for the NB_ASYNC case, nfs_bwrite() has
3134 * reassigned the buffer to the clean list, we have to
3135 * reassign it back to the dirty one. Ugh.
3136 */
3137 if (ISSET(bp->nb_flags, NB_ASYNC)) {
3138 /* move to dirty list */
3139 lck_mtx_lock(nfs_buf_mutex);
3140 if (bp->nb_vnbufs.le_next != NFSNOLIST)
3141 LIST_REMOVE(bp, nb_vnbufs);
3142 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3143 lck_mtx_unlock(nfs_buf_mutex);
3144 } else {
3145 SET(bp->nb_flags, NB_EINTR);
3146 }
3147 } else {
3148 /* either there's an error or we don't need to commit */
3149 if (error) {
3150 SET(bp->nb_flags, NB_ERROR);
3151 bp->nb_error = np->n_error = error;
3152 np->n_flag |= NWRITEERR;
3153 /*
3154 * There was a write error and we need to
3155 * invalidate attrs and flush buffers in
3156 * order to sync up with the server.
3157 * (if this write was extending the file,
3158 * we may no longer know the correct size)
3159 *
3160 * But we can't call vinvalbuf while holding
3161 * this buffer busy. Set a flag to do it after
3162 * releasing the buffer.
3163 *
3164 * Note we can only invalidate in this function
3165 * if this is an async write and so the iodone
3166 * below will release the buffer. Also, we
3167 * shouldn't call vinvalbuf from nfsiod because
3168 * that may deadlock waiting for the completion
3169 * of writes that are queued up behind this one.
3170 */
3171 if (ISSET(bp->nb_flags, NB_ASYNC) &&
3172 !ISSET(bp->nb_flags, NB_IOD)) {
3173 invalidate = 1;
3174 } else {
3175 /* invalidate later */
3176 np->n_flag |= NNEEDINVALIDATE;
3177 }
3178 NATTRINVALIDATE(np);
3179 }
3180 /* clear the dirty range */
3181 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3182 }
3183 }
3184
3185 if (!error && bp->nb_dirty) {
3186 /* there are pages marked dirty that need to be written out */
3187 int pg, count, npages, off;
3188
3189 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3190
3191 NFS_BUF_MAP(bp);
3192
3193 /*
3194 * we do these writes synchronously because we can't really
3195 * support the unstable/needcommit method. We could write
3196 * them unstable, clear the dirty bits, and then commit the
3197 * whole block later, but if we need to rewrite the data, we
3198 * won't have any idea which pages were written because that
3199 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
3200 * also can't leave the dirty bits set because then we wouldn't
3201 * be able to tell if the pages were re-dirtied between the end
3202 * of the write and the commit.
3203 */
3204 iomode = NFSV3WRITE_FILESYNC;
3205 uiop->uio_rw = UIO_WRITE;
3206
3207 SET(bp->nb_flags, NB_WRITEINPROG);
3208 npages = bp->nb_bufsize/PAGE_SIZE;
3209 for (pg=0; pg < npages; pg++) {
3210 if (!NBPGDIRTY(bp,pg))
3211 continue;
3212 count = 1;
3213 while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count))
3214 count++;
3215 /* write count pages starting with page pg */
3216 off = pg * PAGE_SIZE;
3217 len = count * PAGE_SIZE;
3218
3219 /* clip writes to EOF */
3220 if (NBOFF(bp) + off + len > (off_t)np->n_size)
3221 len -= (NBOFF(bp) + off + len) - np->n_size;
3222 if (len > 0) {
3223 io.iov_len = len;
3224 uio_uio_resid_set(uiop, io.iov_len);
3225 uiop->uio_offset = NBOFF(bp) + off;
3226 io.iov_base = (uintptr_t) bp->nb_data + off;
3227 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3228 if (must_commit)
3229 nfs_clearcommit(vnode_mount(vp));
3230 if (error)
3231 break;
3232 }
3233 /* clear dirty bits */
3234 while (count--) {
3235 bp->nb_dirty &= ~(1 << pg);
3236 /* leave pg on last page */
3237 if (count) pg++;
3238 }
3239 }
3240 if (!error) {
3241 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3242 np->n_needcommitcnt--;
3243 CHECK_NEEDCOMMITCNT(np);
3244 }
3245 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3246 }
3247 CLR(bp->nb_flags, NB_WRITEINPROG);
3248 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
3249 np->n_size);
3250 }
3251
3252 if (error) {
3253 SET(bp->nb_flags, NB_ERROR);
3254 bp->nb_error = error;
3255 }
3256 }
3257
3258 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
3259
3260 nfs_buf_iodone(bp);
3261
3262 if (invalidate) {
3263 /*
3264 * There was a write error and we need to
3265 * invalidate attrs and flush buffers in
3266 * order to sync up with the server.
3267 * (if this write was extending the file,
3268 * we may no longer know the correct size)
3269 *
3270 * But we couldn't call vinvalbuf while holding
3271 * the buffer busy. So we call vinvalbuf() after
3272 * releasing the buffer.
3273 *
3274 * Note: we don't bother calling nfs_vinvalbuf() if
3275 * there's already a flush in progress.
3276 */
3277 if (!(np->n_flag & NFLUSHINPROG))
3278 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1);
3279 }
3280
3281 return (error);
3282 }