[apple/xnu.git] xnu-792.13.8 - bsd/nfs/nfs_bio.c
1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
31 /*
32 * Copyright (c) 1989, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * Rick Macklem at The University of Guelph.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
67 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
68 */
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/resourcevar.h>
72 #include <sys/signalvar.h>
73 #include <sys/proc_internal.h>
74 #include <sys/kauth.h>
75 #include <sys/malloc.h>
76 #include <sys/vnode.h>
77 #include <sys/dirent.h>
78 #include <sys/mount_internal.h>
79 #include <sys/kernel.h>
80 #include <sys/sysctl.h>
81 #include <sys/ubc_internal.h>
82 #include <sys/uio_internal.h>
83
84 #include <sys/vm.h>
85 #include <sys/vmparam.h>
86
87 #include <sys/time.h>
88 #include <kern/clock.h>
89 #include <libkern/OSAtomic.h>
90 #include <kern/kalloc.h>
91
92 #include <nfs/rpcv2.h>
93 #include <nfs/nfsproto.h>
94 #include <nfs/nfs.h>
95 #include <nfs/nfsmount.h>
96 #include <nfs/nfsnode.h>
97 #include <sys/buf_internal.h>
98
99 #include <sys/kdebug.h>
100
101 #define FSDBG(A, B, C, D, E) \
102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
103 (int)(B), (int)(C), (int)(D), (int)(E), 0)
104 #define FSDBG_TOP(A, B, C, D, E) \
105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
106 (int)(B), (int)(C), (int)(D), (int)(E), 0)
107 #define FSDBG_BOT(A, B, C, D, E) \
108 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
109 (int)(B), (int)(C), (int)(D), (int)(E), 0)
110
111 extern int nfs_numasync;
112 extern int nfs_ioddelwri;
113 extern struct nfsstats nfsstats;
114
115 #define NFSBUFHASH(np, lbn) \
116 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
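/*
 * Editor's note: a minimal sketch (not compiled) of how NFSBUFHASH() picks
 * a hash chain.  The nfsnode pointer is scaled down by its own size so that
 * consecutive nodes spread across buckets, the logical block number is
 * added, and the sum is masked with nfsbufhash, the power-of-two-minus-one
 * mask that hashinit() fills in below in nfs_nbinit().
 */
#if 0	/* illustrative only */
	/* e.g. a 256-bucket table gives nfsbufhash == 0xff */
	struct nfsbufhashhead *chain = NFSBUFHASH(np, lbn);	/* np: the file's nfsnode, lbn: logical block */
	/* nfs_buf_incore() walks 'chain' comparing nb_lblkno and nb_vp */
#endif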
117 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
118 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
119 u_long nfsbufhash;
120 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
121 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
122 int nfs_nbdwrite;
123 time_t nfsbuffreeuptimestamp;
124
125 lck_grp_t *nfs_buf_lck_grp;
126 lck_grp_attr_t *nfs_buf_lck_grp_attr;
127 lck_attr_t *nfs_buf_lck_attr;
128 lck_mtx_t *nfs_buf_mutex;
129
130 #define NFSBUFWRITE_THROTTLE 9
131 #define NFSBUF_LRU_STALE 120
132 #define NFSBUF_META_STALE 240
133
134 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
135 #define LRU_TO_FREEUP 6
136 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
137 #define META_TO_FREEUP 3
138 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
139 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
140 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
141 #define LRU_FREEUP_FRAC_ON_TIMER 8
142 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
143 #define META_FREEUP_FRAC_ON_TIMER 16
144 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
145 #define LRU_FREEUP_MIN_FRAC 4
146 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
147 #define META_FREEUP_MIN_FRAC 2
148
149 #define NFS_BUF_FREEUP() \
150 do { \
151 /* only call nfs_buf_freeup() if it has work to do: */ \
152 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
153 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
154 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
155 nfs_buf_freeup(0); \
156 } while (0)
157
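/*
 * Editor's note: a worked example (hypothetical counts, not compiled) of
 * when NFS_BUF_FREEUP() actually calls nfs_buf_freeup(0): one of the free
 * lists must hold more than its "min frac" share of all nfsbufs, and
 * freeing TOTAL_TO_FREEUP buffers must not drop the pool below nfsbufmin.
 */
#if 0	/* illustrative only */
	/* say nfsbufcnt == 1000 and nfsbufmin == 128                               */
	/* nfsbuffreecnt     == 300 > 1000/LRU_FREEUP_MIN_FRAC  (250) -> trigger    */
	/* nfsbuffreemetacnt == 400 > 1000/META_FREEUP_MIN_FRAC (500) ?  no         */
	/* either test alone is enough, so the LRU side triggers the call, and      */
	/* (1000 - TOTAL_TO_FREEUP) == 991 > nfsbufmin, so nfs_buf_freeup(0) is run  */
#endif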
158 /*
159 * Initialize nfsbuf lists
160 */
161 void
162 nfs_nbinit(void)
163 {
164 nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init();
165 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr);
166
167 nfs_buf_lck_attr = lck_attr_alloc_init();
168
169 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr);
170
171 nfsbufcnt = nfsbufmetacnt =
172 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
173 nfsbufmin = 128;
174 nfsbufmax = (sane_size >> PAGE_SHIFT) / 4;
175 nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16;
176 nfsneedbuffer = 0;
177 nfs_nbdwrite = 0;
178 nfsbuffreeuptimestamp = 0;
179
180 nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
181 TAILQ_INIT(&nfsbuffree);
182 TAILQ_INIT(&nfsbuffreemeta);
183 TAILQ_INIT(&nfsbufdelwri);
184
185 }
186
187 /*
188 * try to free up some excess, unused nfsbufs
189 */
190 void
191 nfs_buf_freeup(int timer)
192 {
193 struct nfsbuf *fbp;
194 struct timeval now;
195 int count;
196 struct nfsbuffreehead nfsbuffreeup;
197
198 TAILQ_INIT(&nfsbuffreeup);
199
200 lck_mtx_lock(nfs_buf_mutex);
201
202 microuptime(&now);
203 nfsbuffreeuptimestamp = now.tv_sec;
204
205 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
206
207 count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
208 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
209 fbp = TAILQ_FIRST(&nfsbuffree);
210 if (!fbp)
211 break;
212 if (fbp->nb_refs)
213 break;
214 if (NBUFSTAMPVALID(fbp) &&
215 (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
216 break;
217 nfs_buf_remfree(fbp);
218 /* disassociate buffer from any vnode */
219 if (fbp->nb_vp) {
220 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
221 LIST_REMOVE(fbp, nb_vnbufs);
222 fbp->nb_vnbufs.le_next = NFSNOLIST;
223 }
224 fbp->nb_vp = NULL;
225 }
226 LIST_REMOVE(fbp, nb_hash);
227 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
228 nfsbufcnt--;
229 }
230
231 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
232 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
233 fbp = TAILQ_FIRST(&nfsbuffreemeta);
234 if (!fbp)
235 break;
236 if (fbp->nb_refs)
237 break;
238 if (NBUFSTAMPVALID(fbp) &&
239 (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
240 break;
241 nfs_buf_remfree(fbp);
242 /* disassociate buffer from any vnode */
243 if (fbp->nb_vp) {
244 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
245 LIST_REMOVE(fbp, nb_vnbufs);
246 fbp->nb_vnbufs.le_next = NFSNOLIST;
247 }
248 fbp->nb_vp = NULL;
249 }
250 LIST_REMOVE(fbp, nb_hash);
251 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
252 nfsbufcnt--;
253 nfsbufmetacnt--;
254 }
255
256 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
257 NFSBUFCNTCHK(1);
258
259 lck_mtx_unlock(nfs_buf_mutex);
260
261 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
262 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
263 /* nuke any creds */
264 if (fbp->nb_rcred != NOCRED) {
265 kauth_cred_rele(fbp->nb_rcred);
266 fbp->nb_rcred = NOCRED;
267 }
268 if (fbp->nb_wcred != NOCRED) {
269 kauth_cred_rele(fbp->nb_wcred);
270 fbp->nb_wcred = NOCRED;
271 }
272 /* if buf was NB_META, dump buffer */
273 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
274 kfree(fbp->nb_data, fbp->nb_bufsize);
275 FREE(fbp, M_TEMP);
276 }
277
278 }
279
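/*
 * Editor's note: a sketch (hypothetical times, not compiled) of the
 * staleness test applied by nfs_buf_freeup() above.  A buffer survives the
 * sweep while its usage timestamp is newer than twice the per-list stale
 * time: 2*NFSBUF_LRU_STALE (240s) for data, 2*NFSBUF_META_STALE (480s) for
 * metadata.  When called from nfs_timer() (timer != 0) only a fraction of
 * each free list is considered per pass.
 */
#if 0	/* illustrative only */
	/* now.tv_sec == 10000, LRU buffer with nb_timestamp == 9800:             */
	/*   9800 + 2*NFSBUF_LRU_STALE == 10040 > 10000 -> still fresh, stop scan  */
	/* with nb_timestamp == 9700: 9700 + 240 == 9940 <= 10000 -> freed         */
	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER	/* 1/8 of the list per timer pass */
		      : LRU_TO_FREEUP;				/* at most 6 buffers otherwise    */
#endif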
280 /*
281 * remove a buffer from the freelist
282 * (must be called with nfs_buf_mutex held)
283 */
284 void
285 nfs_buf_remfree(struct nfsbuf *bp)
286 {
287 if (bp->nb_free.tqe_next == NFSNOLIST)
288 panic("nfsbuf not on free list");
289 if (ISSET(bp->nb_flags, NB_DELWRI)) {
290 nfsbufdelwricnt--;
291 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
292 } else if (ISSET(bp->nb_flags, NB_META)) {
293 nfsbuffreemetacnt--;
294 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
295 } else {
296 nfsbuffreecnt--;
297 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
298 }
299 bp->nb_free.tqe_next = NFSNOLIST;
300 NFSBUFCNTCHK(1);
301 }
302
303 /*
304 * check for existence of nfsbuf in cache
305 */
306 boolean_t
307 nfs_buf_is_incore(vnode_t vp, daddr64_t blkno)
308 {
309 boolean_t rv;
310 lck_mtx_lock(nfs_buf_mutex);
311 if (nfs_buf_incore(vp, blkno))
312 rv = TRUE;
313 else
314 rv = FALSE;
315 lck_mtx_unlock(nfs_buf_mutex);
316 return (rv);
317 }
318
319 /*
320 * return incore buffer (must be called with nfs_buf_mutex held)
321 */
322 struct nfsbuf *
323 nfs_buf_incore(vnode_t vp, daddr64_t blkno)
324 {
325 /* Search hash chain */
326 struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first;
327 for (; bp != NULL; bp = bp->nb_hash.le_next)
328 if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
329 if (!ISSET(bp->nb_flags, NB_INVAL)) {
330 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
331 return (bp);
332 }
333 }
334 return (NULL);
335 }
336
337 /*
338 * Check if it's OK to drop a page.
339 *
340 * Called by vnode_pager() on pageout request of non-dirty page.
341 * We need to make sure that it's not part of a delayed write.
342 * If it is, we can't let the VM drop it because we may need it
343 * later when/if we need to write the data (again).
344 */
345 int
346 nfs_buf_page_inval(vnode_t vp, off_t offset)
347 {
348 struct nfsbuf *bp;
349 int error = 0;
350
351 lck_mtx_lock(nfs_buf_mutex);
352 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
353 if (!bp)
354 goto out;
355 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
356 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
357 error = EBUSY;
358 goto out;
359 }
360 /*
361 * If there's a dirty range in the buffer, check to
362 * see if this page intersects with the dirty range.
363 * If it does, we can't let the pager drop the page.
364 */
365 if (bp->nb_dirtyend > 0) {
366 int start = offset - NBOFF(bp);
367 if (bp->nb_dirtyend <= start ||
368 bp->nb_dirtyoff >= (start + PAGE_SIZE))
369 error = 0;
370 else
371 error = EBUSY;
372 }
373 out:
374 lck_mtx_unlock(nfs_buf_mutex);
375 return (error);
376 }
377
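/*
 * Editor's note: a worked example (made-up offsets, 4KB pages assumed, not
 * compiled) of the dirty-range check in nfs_buf_page_inval() above.
 * 'start' is the page's offset within the buffer; the page may only be
 * dropped if it lies entirely outside [nb_dirtyoff, nb_dirtyend).
 */
#if 0	/* illustrative only */
	/* buffer at NBOFF(bp) == 0x8000, nb_dirtyoff == 0x1200, nb_dirtyend == 0x1800 */
	/* pageout of offset 0x9000 -> start == 0x1000:                                 */
	/*   dirtyend (0x1800) > start  and  dirtyoff (0x1200) < start + PAGE_SIZE      */
	/*   -> EBUSY, the page must be kept                                            */
	/* pageout of offset 0xa000 -> start == 0x2000:                                 */
	/*   dirtyend (0x1800) <= start -> 0, the page may be dropped                   */
#endif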
378 /*
379 * set up the UPL for a buffer
380 * (must NOT be called with nfs_buf_mutex held)
381 */
382 int
383 nfs_buf_upl_setup(struct nfsbuf *bp)
384 {
385 kern_return_t kret;
386 upl_t upl;
387 int upl_flags;
388
389 if (ISSET(bp->nb_flags, NB_PAGELIST))
390 return (0);
391
392 upl_flags = UPL_PRECIOUS;
393 if (! ISSET(bp->nb_flags, NB_READ)) {
394 /*
395 * We're doing a "write", so we intend to modify
396 * the pages we're gathering.
397 */
398 upl_flags |= UPL_WILL_MODIFY;
399 }
400 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
401 &upl, NULL, upl_flags);
402 if (kret == KERN_INVALID_ARGUMENT) {
403 /* vm object probably doesn't exist any more */
404 bp->nb_pagelist = NULL;
405 return (EINVAL);
406 }
407 if (kret != KERN_SUCCESS) {
408 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
409 bp->nb_pagelist = NULL;
410 return (EIO);
411 }
412
413 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
414
415 bp->nb_pagelist = upl;
416 SET(bp->nb_flags, NB_PAGELIST);
417 return (0);
418 }
419
420 /*
421 * update buffer's valid/dirty info from UBC
422 * (must NOT be called with nfs_buf_mutex held)
423 */
424 void
425 nfs_buf_upl_check(struct nfsbuf *bp)
426 {
427 upl_page_info_t *pl;
428 off_t filesize, fileoffset;
429 int i, npages;
430
431 if (!ISSET(bp->nb_flags, NB_PAGELIST))
432 return;
433
434 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
435 filesize = ubc_getsize(bp->nb_vp);
436 fileoffset = NBOFF(bp);
437 if (fileoffset < filesize)
438 SET(bp->nb_flags, NB_CACHE);
439 else
440 CLR(bp->nb_flags, NB_CACHE);
441
442 pl = ubc_upl_pageinfo(bp->nb_pagelist);
443 bp->nb_valid = bp->nb_dirty = 0;
444
445 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
446 /* anything beyond the end of the file is not valid or dirty */
447 if (fileoffset >= filesize)
448 break;
449 if (!upl_valid_page(pl, i)) {
450 CLR(bp->nb_flags, NB_CACHE);
451 continue;
452 }
453 NBPGVALID_SET(bp,i);
454 if (upl_dirty_page(pl, i)) {
455 NBPGDIRTY_SET(bp, i);
456 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
457 SET(bp->nb_flags, NB_WASDIRTY);
458 }
459 }
460 fileoffset = NBOFF(bp);
461 if (ISSET(bp->nb_flags, NB_CACHE)) {
462 bp->nb_validoff = 0;
463 bp->nb_validend = bp->nb_bufsize;
464 if (fileoffset + bp->nb_validend > filesize)
465 bp->nb_validend = filesize - fileoffset;
466 } else {
467 bp->nb_validoff = bp->nb_validend = -1;
468 }
469 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
470 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
471 }
472
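/*
 * Editor's note: a sketch (hypothetical layout, 4KB pages assumed, not
 * compiled) of what nfs_buf_upl_check() leaves behind for a 32KB buffer
 * whose file ends 20KB into it.  Pages wholly past EOF are neither valid
 * nor dirty, and any hole among the examined pages clears NB_CACHE.
 */
#if 0	/* illustrative only */
	/* the UPL reports pages 0-4 valid and page 3 dirty (e.g. written via mmap)  */
	/* -> nb_valid == 0x1f, nb_dirty == 0x08, NB_WASDIRTY set                     */
	/* pages 5-7 start at or past EOF -> loop breaks, NB_CACHE stays set,         */
	/* so nb_validoff == 0 and nb_validend is clipped to 20KB (filesize - NBOFF)  */
#endif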
473 /*
474 * make sure that a buffer is mapped
475 * (must NOT be called with nfs_buf_mutex held)
476 */
477 static int
478 nfs_buf_map(struct nfsbuf *bp)
479 {
480 kern_return_t kret;
481
482 if (bp->nb_data)
483 return (0);
484 if (!ISSET(bp->nb_flags, NB_PAGELIST))
485 return (EINVAL);
486
487 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
488 if (kret != KERN_SUCCESS)
489 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
490 if (bp->nb_data == 0)
491 panic("ubc_upl_map mapped 0");
492 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
493 return (0);
494 }
495
496 /*
497 * check range of pages in nfsbuf's UPL for validity
498 */
499 static int
500 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
501 {
502 off_t fileoffset, filesize;
503 int pg, lastpg;
504 upl_page_info_t *pl;
505
506 if (!ISSET(bp->nb_flags, NB_PAGELIST))
507 return (0);
508 pl = ubc_upl_pageinfo(bp->nb_pagelist);
509
510 size += off & PAGE_MASK;
511 off &= ~PAGE_MASK;
512 fileoffset = NBOFF(bp);
513 filesize = VTONFS(bp->nb_vp)->n_size;
514 if ((fileoffset + off + size) > filesize)
515 size = filesize - (fileoffset + off);
516
517 pg = off/PAGE_SIZE;
518 lastpg = (off + size - 1)/PAGE_SIZE;
519 while (pg <= lastpg) {
520 if (!upl_valid_page(pl, pg))
521 return (0);
522 pg++;
523 }
524 return (1);
525 }
526
527 /*
528 * normalize an nfsbuf's valid range
529 *
530 * the read/write code guarantees that we'll always have a valid
531 * region that is an integral number of pages. If either end
532 * of the valid range isn't page-aligned, it gets corrected
533 * here as we extend the valid range through all of the
534 * contiguous valid pages.
535 */
536 static void
537 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
538 {
539 int pg, npg;
540 /* pull validoff back to start of contiguous valid page range */
541 pg = bp->nb_validoff/PAGE_SIZE;
542 while (pg >= 0 && NBPGVALID(bp,pg))
543 pg--;
544 bp->nb_validoff = (pg+1) * PAGE_SIZE;
545 /* push validend forward to end of contiguous valid page range */
546 npg = bp->nb_bufsize/PAGE_SIZE;
547 pg = bp->nb_validend/PAGE_SIZE;
548 while (pg < npg && NBPGVALID(bp,pg))
549 pg++;
550 bp->nb_validend = pg * PAGE_SIZE;
551 /* clip to EOF */
552 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
553 bp->nb_validend = np->n_size % bp->nb_bufsize;
554 }
555
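/*
 * Editor's note: a worked example (made-up offsets, 4KB pages assumed, not
 * compiled) for nfs_buf_normalize_valid_range() above.  The valid range is
 * widened to the surrounding run of contiguous valid pages and then
 * clipped to EOF.
 */
#if 0	/* illustrative only */
	/* 32KB buffer, pages 0-3 valid, nb_validoff == 0x1200, nb_validend == 0x2e00  */
	/* pull back:    page(0x1200) == 1; pages 1 and 0 are valid -> nb_validoff = 0  */
	/* push forward: page(0x2e00) == 2; pages 2,3 valid, page 4 not                 */
	/*               -> nb_validend = 4 * PAGE_SIZE == 0x4000                       */
	/* finally, if NBOFF(bp) + 0x4000 passes np->n_size, clip to n_size % bufsize   */
#endif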
556 /*
557 * try to push out some delayed/uncommitted writes
558 * ("locked" indicates whether nfs_buf_mutex is already held)
559 */
560 static void
561 nfs_buf_delwri_push(int locked)
562 {
563 struct nfsbuf *bp;
564 int i, error;
565
566 if (TAILQ_EMPTY(&nfsbufdelwri))
567 return;
568
569 /* first try to tell the nfsiods to do it */
570 if (nfs_asyncio(NULL, NULL) == 0)
571 return;
572
573 /* otherwise, try to do some of the work ourselves */
574 i = 0;
575 if (!locked)
576 lck_mtx_lock(nfs_buf_mutex);
577 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
578 struct nfsnode *np = VTONFS(bp->nb_vp);
579 nfs_buf_remfree(bp);
580 nfs_buf_refget(bp);
581 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
582 nfs_buf_refrele(bp);
583 if (error)
584 break;
585 if (!bp->nb_vp) {
586 /* buffer is no longer valid */
587 nfs_buf_drop(bp);
588 continue;
589 }
590 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
591 /* put buffer at end of delwri list */
592 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
593 nfsbufdelwricnt++;
594 nfs_buf_drop(bp);
595 lck_mtx_unlock(nfs_buf_mutex);
596 nfs_flushcommits(np->n_vnode, NULL, 1);
597 } else {
598 SET(bp->nb_flags, NB_ASYNC);
599 lck_mtx_unlock(nfs_buf_mutex);
600 nfs_buf_write(bp);
601 }
602 i++;
603 lck_mtx_lock(nfs_buf_mutex);
604 }
605 if (!locked)
606 lck_mtx_unlock(nfs_buf_mutex);
607 }
608
609 /*
610 * Get an nfs buffer.
611 *
612 * Returns errno on error, 0 otherwise.
613 * Any buffer is returned in *bpp.
614 *
615 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
616 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
617 *
618 * Check for existence of buffer in cache.
619 * Or attempt to reuse a buffer from one of the free lists.
620 * Or allocate a new buffer if we haven't already hit max allocation.
621 * Or wait for a free buffer.
622 *
623 * If available buffer found, prepare it, and return it.
624 *
625 * If the calling process is interrupted by a signal for
626 * an interruptible mount point, return EINTR.
627 */
628 int
629 nfs_buf_get(
630 vnode_t vp,
631 daddr64_t blkno,
632 int size,
633 proc_t p,
634 int flags,
635 struct nfsbuf **bpp)
636 {
637 struct nfsnode *np = VTONFS(vp);
638 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
639 struct nfsbuf *bp;
640 int biosize, bufsize;
641 kauth_cred_t cred;
642 int slpflag = PCATCH;
643 int operation = (flags & NBLK_OPMASK);
644 int error = 0;
645 struct timespec ts;
646
647 FSDBG_TOP(541, vp, blkno, size, flags);
648 *bpp = NULL;
649
650 bufsize = size;
651 if (bufsize > NFS_MAXBSIZE)
652 panic("nfs_buf_get: buffer larger than NFS_MAXBSIZE requested");
653
654 if (!nmp) {
655 FSDBG_BOT(541, vp, blkno, 0, ENXIO);
656 return (ENXIO);
657 }
658 biosize = nmp->nm_biosize;
659
660 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) {
661 operation = NBLK_META;
662 } else if (bufsize < biosize) {
663 /* reg files should always have biosize blocks */
664 bufsize = biosize;
665 }
666
667 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
668 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
669 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
670
671 /* poke the delwri list */
672 nfs_buf_delwri_push(0);
673
674 /* sleep to let other threads run... */
675 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
676 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
677 }
678
679 loop:
680 lck_mtx_lock(nfs_buf_mutex);
681
682 /* check for existence of nfsbuf in cache */
683 if ((bp = nfs_buf_incore(vp, blkno))) {
684 /* if busy, set wanted and wait */
685 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
686 if (flags & NBLK_NOWAIT) {
687 lck_mtx_unlock(nfs_buf_mutex);
688 FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc);
689 return (0);
690 }
691 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
692 SET(bp->nb_lflags, NBL_WANTED);
693
694 ts.tv_sec = 2;
695 ts.tv_nsec = 0;
696 msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
697 "nfsbufget", (slpflag == PCATCH) ? 0 : &ts);
698 slpflag = 0;
699 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
700 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
701 FSDBG_BOT(541, vp, blkno, 0, error);
702 return (error);
703 }
704 goto loop;
705 }
706 if (bp->nb_bufsize != bufsize)
707 panic("nfsbuf size mismatch");
708 SET(bp->nb_lflags, NBL_BUSY);
709 SET(bp->nb_flags, NB_CACHE);
710 nfs_buf_remfree(bp);
711 /* additional paranoia: */
712 if (ISSET(bp->nb_flags, NB_PAGELIST))
713 panic("pagelist buffer was not busy");
714 goto buffer_setup;
715 }
716
717 if (flags & NBLK_ONLYVALID) {
718 lck_mtx_unlock(nfs_buf_mutex);
719 FSDBG_BOT(541, vp, blkno, 0, 0x0000cace);
720 return (0);
721 }
722
723 /*
724 * where to get a free buffer:
725 * - if meta and maxmeta reached, must reuse meta
726 * - alloc new if we haven't reached min bufs
727 * - if free lists are NOT empty
728 * - if free list is stale, use it
729 * - else if freemeta list is stale, use it
730 * - else if max bufs allocated, use least-time-to-stale
731 * - alloc new if we haven't reached max allowed
732 * - start clearing out delwri list and try again
733 */
734
735 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
736 /* if we've hit max meta buffers, must reuse a meta buffer */
737 bp = TAILQ_FIRST(&nfsbuffreemeta);
738 } else if ((nfsbufcnt > nfsbufmin) &&
739 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
740 /* try to pull an nfsbuf off a free list */
741 struct nfsbuf *lrubp, *metabp;
742 struct timeval now;
743 microuptime(&now);
744
745 /* if the next LRU or META buffer is invalid or stale, use it */
746 lrubp = TAILQ_FIRST(&nfsbuffree);
747 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
748 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
749 bp = lrubp;
750 metabp = TAILQ_FIRST(&nfsbuffreemeta);
751 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
752 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
753 bp = metabp;
754
755 if (!bp && (nfsbufcnt >= nfsbufmax)) {
756 /* we've already allocated all bufs, so */
757 /* choose the buffer that'll go stale first */
758 if (!metabp)
759 bp = lrubp;
760 else if (!lrubp)
761 bp = metabp;
762 else {
763 int32_t lru_stale_time, meta_stale_time;
764 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
765 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
766 if (lru_stale_time <= meta_stale_time)
767 bp = lrubp;
768 else
769 bp = metabp;
770 }
771 }
772 }
773
774 if (bp) {
775 /* we have a buffer to reuse */
776 FSDBG(544, vp, blkno, bp, bp->nb_flags);
777 nfs_buf_remfree(bp);
778 if (ISSET(bp->nb_flags, NB_DELWRI))
779 panic("nfs_buf_get: delwri");
780 SET(bp->nb_lflags, NBL_BUSY);
781 /* disassociate buffer from previous vnode */
782 if (bp->nb_vp) {
783 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
784 LIST_REMOVE(bp, nb_vnbufs);
785 bp->nb_vnbufs.le_next = NFSNOLIST;
786 }
787 bp->nb_vp = NULL;
788 }
789 LIST_REMOVE(bp, nb_hash);
790 /* nuke any creds we're holding */
791 cred = bp->nb_rcred;
792 if (cred != NOCRED) {
793 bp->nb_rcred = NOCRED;
794 kauth_cred_rele(cred);
795 }
796 cred = bp->nb_wcred;
797 if (cred != NOCRED) {
798 bp->nb_wcred = NOCRED;
799 kauth_cred_rele(cred);
800 }
801 /* if buf will no longer be NB_META, dump old buffer */
802 if (operation == NBLK_META) {
803 if (!ISSET(bp->nb_flags, NB_META))
804 nfsbufmetacnt++;
805 } else if (ISSET(bp->nb_flags, NB_META)) {
806 if (bp->nb_data) {
807 kfree(bp->nb_data, bp->nb_bufsize);
808 bp->nb_data = NULL;
809 }
810 nfsbufmetacnt--;
811 }
812 /* re-init buf fields */
813 bp->nb_error = 0;
814 bp->nb_validoff = bp->nb_validend = -1;
815 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
816 bp->nb_valid = 0;
817 bp->nb_dirty = 0;
818 } else {
819 /* no buffer to reuse */
820 if ((nfsbufcnt < nfsbufmax) &&
821 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
822 /* just alloc a new one */
823 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
824 if (!bp) {
825 lck_mtx_unlock(nfs_buf_mutex);
826 FSDBG_BOT(541, vp, blkno, 0, error);
827 return (ENOMEM);
828 }
829 nfsbufcnt++;
830 if (operation == NBLK_META)
831 nfsbufmetacnt++;
832 NFSBUFCNTCHK(1);
833 /* init nfsbuf */
834 bzero(bp, sizeof(*bp));
835 bp->nb_free.tqe_next = NFSNOLIST;
836 bp->nb_validoff = bp->nb_validend = -1;
837 FSDBG(545, vp, blkno, bp, 0);
838 } else {
839 /* too many bufs... wait for buffers to free up */
840 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
841
842 /* poke the delwri list */
843 nfs_buf_delwri_push(1);
844
845 nfsneedbuffer = 1;
846 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP,
847 "nfsbufget", 0);
848 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
849 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
850 FSDBG_BOT(541, vp, blkno, 0, error);
851 return (error);
852 }
853 goto loop;
854 }
855 }
856
857 /* setup nfsbuf */
858 bp->nb_lflags = NBL_BUSY;
859 bp->nb_flags = 0;
860 bp->nb_lblkno = blkno;
861 /* insert buf in hash */
862 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
863 /* associate buffer with new vnode */
864 bp->nb_vp = vp;
865 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
866
867 buffer_setup:
868
869 /* unlock hash */
870 lck_mtx_unlock(nfs_buf_mutex);
871
872 switch (operation) {
873 case NBLK_META:
874 SET(bp->nb_flags, NB_META);
875 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
876 kfree(bp->nb_data, bp->nb_bufsize);
877 bp->nb_data = NULL;
878 bp->nb_validoff = bp->nb_validend = -1;
879 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
880 bp->nb_valid = 0;
881 bp->nb_dirty = 0;
882 CLR(bp->nb_flags, NB_CACHE);
883 }
884 if (!bp->nb_data)
885 bp->nb_data = kalloc(bufsize);
886 if (!bp->nb_data) {
887 /* Ack! couldn't allocate the data buffer! */
888 /* cleanup buffer and return error */
889 lck_mtx_lock(nfs_buf_mutex);
890 LIST_REMOVE(bp, nb_vnbufs);
891 bp->nb_vnbufs.le_next = NFSNOLIST;
892 bp->nb_vp = NULL;
893 /* invalidate usage timestamp to allow immediate freeing */
894 NBUFSTAMPINVALIDATE(bp);
895 if (bp->nb_free.tqe_next != NFSNOLIST)
896 panic("nfsbuf on freelist");
897 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
898 nfsbuffreecnt++;
899 lck_mtx_unlock(nfs_buf_mutex);
900 FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM);
901 return (ENOMEM);
902 }
903 bp->nb_bufsize = bufsize;
904 break;
905
906 case NBLK_READ:
907 case NBLK_WRITE:
908 /*
909 * Set or clear NB_READ now to let the UPL subsystem know
910 * if we intend to modify the pages or not.
911 */
912 if (operation == NBLK_READ) {
913 SET(bp->nb_flags, NB_READ);
914 } else {
915 CLR(bp->nb_flags, NB_READ);
916 }
917 if (bufsize < PAGE_SIZE)
918 bufsize = PAGE_SIZE;
919 bp->nb_bufsize = bufsize;
920 bp->nb_validoff = bp->nb_validend = -1;
921
922 if (UBCINFOEXISTS(vp)) {
923 /* setup upl */
924 if (nfs_buf_upl_setup(bp)) {
925 /* unable to create upl */
926 /* vm object must no longer exist */
927 /* cleanup buffer and return error */
928 lck_mtx_lock(nfs_buf_mutex);
929 LIST_REMOVE(bp, nb_vnbufs);
930 bp->nb_vnbufs.le_next = NFSNOLIST;
931 bp->nb_vp = NULL;
932 /* invalidate usage timestamp to allow immediate freeing */
933 NBUFSTAMPINVALIDATE(bp);
934 if (bp->nb_free.tqe_next != NFSNOLIST)
935 panic("nfsbuf on freelist");
936 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
937 nfsbuffreecnt++;
938 lck_mtx_unlock(nfs_buf_mutex);
939 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
940 return (EIO);
941 }
942 nfs_buf_upl_check(bp);
943 }
944 break;
945
946 default:
947 panic("nfs_buf_get: %d unknown operation", operation);
948 }
949
950 *bpp = bp;
951
952 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
953
954 return (0);
955 }
956
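/*
 * Editor's note: a minimal usage sketch (not compiled) for nfs_buf_get()
 * above, mirroring how nfs_bioread() below calls it.  With NBLK_NOWAIT the
 * call returns 0 with *bpp == NULL instead of sleeping on a cached-but-busy
 * buffer; with NBLK_ONLYVALID it returns 0 with *bpp == NULL when the block
 * is not cached at all.  Callers therefore check *bpp as well as the error.
 */
#if 0	/* illustrative only */
	struct nfsbuf *bp = NULL;
	int error;

	/* blocking lookup/create of logical block 'lbn' for a read */
	error = nfs_buf_get(vp, lbn, biosize, p, NBLK_READ, &bp);
	if (error)
		return (error);		/* EINTR, ENXIO, ENOMEM, ... */

	/* readahead-style probe: cached but busy comes back as bp == NULL */
	error = nfs_buf_get(vp, lbn + 1, biosize, p, NBLK_READ|NBLK_NOWAIT, &bp);
	if (!error && (bp == NULL)) {
		/* skip this block and move on */
	}
#endif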
957 void
958 nfs_buf_release(struct nfsbuf *bp, int freeup)
959 {
960 vnode_t vp = bp->nb_vp;
961 struct timeval now;
962 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
963
964 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
965 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
966 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
967
968 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
969 int upl_flags;
970 upl_t upl;
971 int i, rv;
972
973 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
974 rv = nfs_buf_upl_setup(bp);
975 if (rv)
976 printf("nfs_buf_release: upl create failed %d\n", rv);
977 else
978 nfs_buf_upl_check(bp);
979 }
980 upl = bp->nb_pagelist;
981 if (!upl)
982 goto pagelist_cleanup_done;
983 if (bp->nb_data) {
984 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
985 panic("ubc_upl_unmap failed");
986 bp->nb_data = NULL;
987 }
988 /* abort pages if error, invalid, or non-needcommit nocache */
989 if ((bp->nb_flags & (NB_ERROR | NB_INVAL)) ||
990 ((bp->nb_flags & NB_NOCACHE) && !(bp->nb_flags & (NB_NEEDCOMMIT | NB_DELWRI)))) {
991 if (bp->nb_flags & (NB_READ | NB_INVAL | NB_NOCACHE))
992 upl_flags = UPL_ABORT_DUMP_PAGES;
993 else
994 upl_flags = 0;
995 ubc_upl_abort(upl, upl_flags);
996 goto pagelist_cleanup_done;
997 }
998 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
999 if (!NBPGVALID(bp,i))
1000 ubc_upl_abort_range(upl,
1001 i*PAGE_SIZE, PAGE_SIZE,
1002 UPL_ABORT_DUMP_PAGES |
1003 UPL_ABORT_FREE_ON_EMPTY);
1004 else {
1005 if (NBPGDIRTY(bp,i))
1006 upl_flags = UPL_COMMIT_SET_DIRTY;
1007 else
1008 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1009 ubc_upl_commit_range(upl,
1010 i*PAGE_SIZE, PAGE_SIZE,
1011 upl_flags |
1012 UPL_COMMIT_INACTIVATE |
1013 UPL_COMMIT_FREE_ON_EMPTY);
1014 }
1015 }
1016 pagelist_cleanup_done:
1017 /* was this the last buffer in the file? */
1018 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) {
1019 /* if so, invalidate all pages of last buffer past EOF */
1020 off_t start, end;
1021 start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
1022 end = trunc_page_64(NBOFF(bp) + bp->nb_bufsize);
1023 if (end > start) {
1024 if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
1025 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1026 }
1027 }
1028 CLR(bp->nb_flags, NB_PAGELIST);
1029 bp->nb_pagelist = NULL;
1030 }
1031
1032 lck_mtx_lock(nfs_buf_mutex);
1033
1034 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1035
1036 /* Wake up any processes waiting for any buffer to become free. */
1037 if (nfsneedbuffer) {
1038 nfsneedbuffer = 0;
1039 wakeup_needbuffer = 1;
1040 }
1041 /* Wake up any processes waiting for _this_ buffer to become free. */
1042 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1043 CLR(bp->nb_lflags, NBL_WANTED);
1044 wakeup_buffer = 1;
1045 }
1046
1047 /* If it's non-needcommit nocache, or an error, mark it invalid. */
1048 if (ISSET(bp->nb_flags, NB_ERROR) ||
1049 (ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, (NB_NEEDCOMMIT | NB_DELWRI))))
1050 SET(bp->nb_flags, NB_INVAL);
1051
1052 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1053 /* If it's invalid or empty, dissociate it from its vnode */
1054 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1055 LIST_REMOVE(bp, nb_vnbufs);
1056 bp->nb_vnbufs.le_next = NFSNOLIST;
1057 }
1058 bp->nb_vp = NULL;
1059 /* if this was a delayed write, wakeup anyone */
1060 /* waiting for delayed writes to complete */
1061 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1062 CLR(bp->nb_flags, NB_DELWRI);
1063 OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite);
1064 NFSBUFCNTCHK(1);
1065 wakeup_nbdwrite = 1;
1066 }
1067 /* invalidate usage timestamp to allow immediate freeing */
1068 NBUFSTAMPINVALIDATE(bp);
1069 /* put buffer at head of free list */
1070 if (bp->nb_free.tqe_next != NFSNOLIST)
1071 panic("nfsbuf on freelist");
1072 SET(bp->nb_flags, NB_INVAL);
1073 if (ISSET(bp->nb_flags, NB_META)) {
1074 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1075 nfsbuffreemetacnt++;
1076 } else {
1077 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1078 nfsbuffreecnt++;
1079 }
1080 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1081 /* put buffer at end of delwri list */
1082 if (bp->nb_free.tqe_next != NFSNOLIST)
1083 panic("nfsbuf on freelist");
1084 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1085 nfsbufdelwricnt++;
1086 freeup = 0;
1087 } else {
1088 /* update usage timestamp */
1089 microuptime(&now);
1090 bp->nb_timestamp = now.tv_sec;
1091 /* put buffer at end of free list */
1092 if (bp->nb_free.tqe_next != NFSNOLIST)
1093 panic("nfsbuf on freelist");
1094 if (ISSET(bp->nb_flags, NB_META)) {
1095 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1096 nfsbuffreemetacnt++;
1097 } else {
1098 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1099 nfsbuffreecnt++;
1100 }
1101 }
1102
1103 NFSBUFCNTCHK(1);
1104
1105 /* Unlock the buffer. */
1106 CLR(bp->nb_flags, (NB_ASYNC | NB_STABLE | NB_IOD));
1107 CLR(bp->nb_lflags, NBL_BUSY);
1108
1109 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1110
1111 lck_mtx_unlock(nfs_buf_mutex);
1112
1113 if (wakeup_needbuffer)
1114 wakeup(&nfsneedbuffer);
1115 if (wakeup_buffer)
1116 wakeup(bp);
1117 if (wakeup_nbdwrite)
1118 wakeup(&nfs_nbdwrite);
1119 if (freeup)
1120 NFS_BUF_FREEUP();
1121 }
1122
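/*
 * Editor's note: a worked example (made-up sizes, 4KB pages assumed, not
 * compiled) of the past-EOF page invalidation near the end of the UPL
 * handling in nfs_buf_release() above, for the last buffer of a file.
 */
#if 0	/* illustrative only */
	/* n_size == 0x9200, last 32KB buffer at NBOFF(bp) == 0x8000               */
	/* start = trunc_page_64(0x9200) + PAGE_SIZE_64 = 0x9000 + 0x1000 = 0xa000  */
	/* end   = trunc_page_64(0x8000 + 0x8000)       = 0x10000                   */
	/* end > start, so pages 0xa000-0xffff (wholly past EOF) are invalidated;   */
	/* the partially-valid page containing EOF itself (0x9000) is left alone    */
#endif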
1123 /*
1124 * Wait for operations on the buffer to complete.
1125 * When they do, extract and return the I/O's error value.
1126 */
1127 int
1128 nfs_buf_iowait(struct nfsbuf *bp)
1129 {
1130 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1131
1132 lck_mtx_lock(nfs_buf_mutex);
1133
1134 while (!ISSET(bp->nb_flags, NB_DONE))
1135 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0);
1136
1137 lck_mtx_unlock(nfs_buf_mutex);
1138
1139 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1140
1141 /* check for interruption of I/O, then errors. */
1142 if (ISSET(bp->nb_flags, NB_EINTR)) {
1143 CLR(bp->nb_flags, NB_EINTR);
1144 return (EINTR);
1145 } else if (ISSET(bp->nb_flags, NB_ERROR))
1146 return (bp->nb_error ? bp->nb_error : EIO);
1147 return (0);
1148 }
1149
1150 /*
1151 * Mark I/O complete on a buffer.
1152 */
1153 void
1154 nfs_buf_iodone(struct nfsbuf *bp)
1155 {
1156
1157 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1158
1159 if (ISSET(bp->nb_flags, NB_DONE))
1160 panic("nfs_buf_iodone already");
1161 /*
1162 * I/O was done, so don't believe
1163 * the DIRTY state from VM anymore
1164 */
1165 CLR(bp->nb_flags, NB_WASDIRTY);
1166
1167 if (!ISSET(bp->nb_flags, NB_READ)) {
1168 CLR(bp->nb_flags, NB_WRITEINPROG);
1169 /*
1170 * vnode_writedone() takes care of waking up
1171 * any throttled write operations
1172 */
1173 vnode_writedone(bp->nb_vp);
1174 }
1175 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1176 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1177 nfs_buf_release(bp, 1);
1178 } else { /* or just wakeup the buffer */
1179 lck_mtx_lock(nfs_buf_mutex);
1180 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1181 CLR(bp->nb_lflags, NBL_WANTED);
1182 lck_mtx_unlock(nfs_buf_mutex);
1183 wakeup(bp);
1184 }
1185
1186 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1187 }
1188
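/*
 * Editor's note: a simplified sketch (not compiled) of the completion
 * contract between nfs_buf_iodone() and nfs_buf_iowait().  When NB_ASYNC
 * is set, iodone releases the buffer itself; otherwise it only marks
 * NB_DONE and wakes whoever sleeps in nfs_buf_iowait().  The nfs_asyncio()
 * failure handling mirrors the readahead path in nfs_bioread() below.
 */
#if 0	/* illustrative only */
	/* synchronous: the I/O path ends in nfs_buf_iodone(bp), and the issuer */
	/* collects the result (EINTR or nb_error) afterwards                   */
	error = nfs_buf_iowait(bp);

	/* asynchronous: hand the buffer to an nfsiod and let iodone release it */
	SET(bp->nb_flags, (NB_READ | NB_ASYNC));
	if (nfs_asyncio(bp, cred)) {
		SET(bp->nb_flags, (NB_INVAL | NB_ERROR));
		bp->nb_error = EIO;
		nfs_buf_release(bp, 1);
	}
#endif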
1189 void
1190 nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p)
1191 {
1192 vnode_t vp = bp->nb_vp;
1193
1194 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1195 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1196
1197 /*
1198 * If the block hasn't been seen before:
1199 * (1) Mark it as having been seen,
1200 * (2) Charge for the write.
1201 * (3) Make sure it's on its vnode's correct block list,
1202 */
1203 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1204 SET(bp->nb_flags, NB_DELWRI);
1205 if (p && p->p_stats)
1206 p->p_stats->p_ru.ru_oublock++; /* XXX */
1207 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
1208 NFSBUFCNTCHK(0);
1209 /* move to dirty list */
1210 lck_mtx_lock(nfs_buf_mutex);
1211 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1212 LIST_REMOVE(bp, nb_vnbufs);
1213 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
1214 lck_mtx_unlock(nfs_buf_mutex);
1215 }
1216
1217 /*
1218 * If the vnode has "too many" write operations in progress
1219 * wait for them to finish the IO
1220 */
1221 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1222
1223 /*
1224 * If we have too many delayed write buffers,
1225 * more than we can "safely" handle, just fall back to
1226 * doing the async write
1227 */
1228 if (nfs_nbdwrite < 0)
1229 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1230
1231 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
1232 /* issue async write */
1233 SET(bp->nb_flags, NB_ASYNC);
1234 nfs_buf_write(bp);
1235 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1236 return;
1237 }
1238
1239 /* Otherwise, the "write" is done, so mark and release the buffer. */
1240 SET(bp->nb_flags, NB_DONE);
1241 nfs_buf_release(bp, 1);
1242 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1243 return;
1244 }
1245
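/*
 * Editor's note: a worked example (hypothetical counts, not compiled) of
 * the fallback threshold in nfs_buf_write_delayed() above: once more than
 * three quarters of all nfsbufs are delayed writes, the buffer is written
 * asynchronously right away instead of being left dirty.
 */
#if 0	/* illustrative only */
	/* nfsbufcnt == 1000 -> threshold is (1000/4)*3 == 750                    */
	/* nfs_nbdwrite == 751: SET(NB_ASYNC) and nfs_buf_write(bp) immediately   */
	/* nfs_nbdwrite == 600: mark NB_DONE, nfs_buf_release(bp, 1), write later */
#endif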
1246 /*
1247 * add a reference to a buffer so it doesn't disappear while being used
1248 * (must be called with nfs_buf_mutex held)
1249 */
1250 void
1251 nfs_buf_refget(struct nfsbuf *bp)
1252 {
1253 bp->nb_refs++;
1254 }
1255 /*
1256 * release a reference on a buffer
1257 * (must be called with nfs_buf_mutex held)
1258 */
1259 void
1260 nfs_buf_refrele(struct nfsbuf *bp)
1261 {
1262 bp->nb_refs--;
1263 }
1264
1265 /*
1266 * mark a particular buffer as BUSY
1267 * (must be called with nfs_buf_mutex held)
1268 */
1269 errno_t
1270 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1271 {
1272 errno_t error;
1273 struct timespec ts;
1274
1275 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1276 /*
1277 * since the mutex_lock may block, the buffer
1278 * may become BUSY, so we need to recheck for
1279 * a NOWAIT request
1280 */
1281 if (flags & NBAC_NOWAIT)
1282 return (EBUSY);
1283 SET(bp->nb_lflags, NBL_WANTED);
1284
1285 ts.tv_sec = (slptimeo/100);
1286 /* the hz value is 100, so each tick is 10ms */
1287 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1288
1289 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1290 "nfs_buf_acquire", &ts);
1291 if (error)
1292 return (error);
1293 return (EAGAIN);
1294 }
1295 if (flags & NBAC_REMOVE)
1296 nfs_buf_remfree(bp);
1297 SET(bp->nb_lflags, NBL_BUSY);
1298
1299 return (0);
1300 }
1301
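/*
 * Editor's note: a worked example (not compiled) of the sleep-timeout
 * conversion in nfs_buf_acquire() above.  slptimeo is given in hz ticks
 * (hz == 100, i.e. 10ms per tick) and is split into whole seconds plus
 * leftover ticks for the timespec handed to msleep().
 */
#if 0	/* illustrative only */
	/* slptimeo == 250 ticks (2.5 seconds)                               */
	/* ts.tv_sec  = 250 / 100 == 2                                       */
	/* ts.tv_nsec = (250 % 100) * 10 * NSEC_PER_USEC * 1000              */
	/*            = 50 * 10 * 1000 * 1000 == 500000000 ns (0.5 seconds)  */
#endif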
1302 /*
1303 * simply drop the BUSY status of a buffer
1304 * (must be called with nfs_buf_mutex held)
1305 */
1306 void
1307 nfs_buf_drop(struct nfsbuf *bp)
1308 {
1309 int need_wakeup = 0;
1310
1311 if (!ISSET(bp->nb_lflags, NBL_BUSY))
1312 panic("nfs_buf_drop: buffer not busy!");
1313 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1314 /*
1315 * delay the actual wakeup until after we
1316 * clear NBL_BUSY and we've dropped nfs_buf_mutex
1317 */
1318 need_wakeup = 1;
1319 }
1320 /* Unlock the buffer. */
1321 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1322
1323 if (need_wakeup)
1324 wakeup(bp);
1325 }
1326
1327 /*
1328 * prepare for iterating over an nfsnode's buffer list
1329 * this lock protects the queue manipulation
1330 * (must be called with nfs_buf_mutex held)
1331 */
1332 int
1333 nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1334 {
1335 struct nfsbuflists *listheadp;
1336
1337 if (flags & NBI_DIRTY)
1338 listheadp = &np->n_dirtyblkhd;
1339 else
1340 listheadp = &np->n_cleanblkhd;
1341
1342 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1343 LIST_INIT(iterheadp);
1344 return(EWOULDBLOCK);
1345 }
1346
1347 while (np->n_bufiterflags & NBI_ITER) {
1348 np->n_bufiterflags |= NBI_ITERWANT;
1349 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0);
1350 }
1351 if (LIST_EMPTY(listheadp)) {
1352 LIST_INIT(iterheadp);
1353 return(EINVAL);
1354 }
1355 np->n_bufiterflags |= NBI_ITER;
1356
1357 iterheadp->lh_first = listheadp->lh_first;
1358 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1359 LIST_INIT(listheadp);
1360
1361 return(0);
1362 }
1363
1364 /*
1365 * cleanup after iterating over an nfsnode's buffer list
1366 * this lock protects the queue manipulation
1367 * (must be called with nfs_buf_mutex held)
1368 */
1369 void
1370 nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1371 {
1372 struct nfsbuflists * listheadp;
1373 struct nfsbuf *bp;
1374
1375 if (flags & NBI_DIRTY)
1376 listheadp = &np->n_dirtyblkhd;
1377 else
1378 listheadp = &np->n_cleanblkhd;
1379
1380 while (!LIST_EMPTY(iterheadp)) {
1381 bp = LIST_FIRST(iterheadp);
1382 LIST_REMOVE(bp, nb_vnbufs);
1383 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1384 }
1385
1386 np->n_bufiterflags &= ~NBI_ITER;
1387 if (np->n_bufiterflags & NBI_ITERWANT) {
1388 np->n_bufiterflags &= ~NBI_ITERWANT;
1389 wakeup(&np->n_bufiterflags);
1390 }
1391 }
1392
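/*
 * Editor's note: a minimal sketch (not compiled) of the iteration pattern
 * nfs_buf_iterprepare()/nfs_buf_itercomplete() exist to support, as used
 * by the flush/invalidate code elsewhere in the NFS client.  The caller
 * holds nfs_buf_mutex, steals the clean or dirty list onto a private head,
 * walks it while moving each buffer back onto the nfsnode, and
 * itercomplete() splices back whatever is left.
 */
#if 0	/* illustrative only */
	struct nfsbuflists blist;
	struct nfsbuf *bp;

	lck_mtx_lock(nfs_buf_mutex);
	if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) {
		while ((bp = LIST_FIRST(&blist))) {
			LIST_REMOVE(bp, nb_vnbufs);
			LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
			/* ... acquire bp and write/commit/invalidate it ... */
		}
		nfs_buf_itercomplete(np, &blist, NBI_DIRTY);
	}
	lck_mtx_unlock(nfs_buf_mutex);
#endif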
1393
1394 /*
1395 * Vnode op for read using bio
1396 * Any similarity to readip() is purely coincidental
1397 */
1398 int
1399 nfs_bioread(
1400 vnode_t vp,
1401 struct uio *uio,
1402 __unused int ioflag,
1403 kauth_cred_t cred,
1404 proc_t p)
1405 {
1406 struct nfsnode *np = VTONFS(vp);
1407 int biosize;
1408 off_t diff;
1409 struct nfsbuf *bp = NULL, *rabp;
1410 struct nfs_vattr nvattr;
1411 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1412 daddr64_t lbn, rabn, lastrabn = -1, tlbn;
1413 int bufsize;
1414 int nra, error = 0, n = 0, on = 0;
1415 caddr_t dp;
1416 struct dirent *direntp = NULL;
1417 enum vtype vtype;
1418 int nocachereadahead = 0;
1419
1420 FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1421
1422 #if DIAGNOSTIC
1423 if (uio->uio_rw != UIO_READ)
1424 panic("nfs_read mode");
1425 #endif
1426 if (uio_uio_resid(uio) == 0) {
1427 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1428 return (0);
1429 }
1430 if (uio->uio_offset < 0) {
1431 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1432 return (EINVAL);
1433 }
1434
1435 biosize = nmp->nm_biosize;
1436 if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
1437 nfs_fsinfo(nmp, vp, cred, p);
1438
1439 vtype = vnode_vtype(vp);
1440 /*
1441 * For nfs, cache consistency can only be maintained approximately.
1442 * Although RFC1094 does not specify the criteria, the following is
1443 * believed to be compatible with the reference port.
1444 * For nfs:
1445 * If the file's modify time on the server has changed since the
1446 * last read rpc or you have written to the file,
1447 * you may have lost data cache consistency with the
1448 * server, so flush all of the file's data out of the cache.
1449 * Then force a getattr rpc to ensure that you have up to date
1450 * attributes.
1451 * NB: This implies that cache data can be read when up to
1452 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1453 * current attributes this could be forced by calling
1454 * NATTRINVALIDATE() before the nfs_getattr() call.
1455 */
1456 if (np->n_flag & NNEEDINVALIDATE) {
1457 np->n_flag &= ~NNEEDINVALIDATE;
1458 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1459 }
1460 if (np->n_flag & NMODIFIED) {
1461 if (vtype != VREG) {
1462 if (vtype != VDIR)
1463 panic("nfs: bioread, not dir");
1464 nfs_invaldir(vp);
1465 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1466 if (error) {
1467 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1468 return (error);
1469 }
1470 }
1471 NATTRINVALIDATE(np);
1472 error = nfs_getattr(vp, &nvattr, cred, p);
1473 if (error) {
1474 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1475 return (error);
1476 }
1477 if (vtype == VDIR) {
1478 /* if directory changed, purge any name cache entries */
1479 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1480 cache_purge(vp);
1481 np->n_ncmtime = nvattr.nva_mtime;
1482 }
1483 np->n_mtime = nvattr.nva_mtime;
1484 } else {
1485 error = nfs_getattr(vp, &nvattr, cred, p);
1486 if (error) {
1487 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1488 return (error);
1489 }
1490 if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) {
1491 if (vtype == VDIR) {
1492 nfs_invaldir(vp);
1493 /* purge name cache entries */
1494 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1495 cache_purge(vp);
1496 }
1497 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1498 if (error) {
1499 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1500 return (error);
1501 }
1502 if (vtype == VDIR)
1503 np->n_ncmtime = nvattr.nva_mtime;
1504 np->n_mtime = nvattr.nva_mtime;
1505 }
1506 }
1507
1508 if (vnode_isnocache(vp)) {
1509 if (!(np->n_flag & NNOCACHE)) {
1510 if (NVALIDBUFS(np)) {
1511 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1512 if (error) {
1513 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1514 return (error);
1515 }
1516 }
1517 np->n_flag |= NNOCACHE;
1518 }
1519 } else if (np->n_flag & NNOCACHE) {
1520 np->n_flag &= ~NNOCACHE;
1521 }
1522
1523 do {
1524 if (np->n_flag & NNOCACHE) {
1525 switch (vtype) {
1526 case VREG:
1527 /*
1528 * If we have only a block or so to read,
1529 * just do the rpc directly.
1530 * If we have a couple blocks or more to read,
1531 * then we'll take advantage of readahead within
1532 * this loop to try to fetch all the data in parallel
1533 */
1534 if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) {
1535 error = nfs_readrpc(vp, uio, cred, p);
1536 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1537 return (error);
1538 }
1539 nocachereadahead = 1;
1540 break;
1541 case VLNK:
1542 error = nfs_readlinkrpc(vp, uio, cred, p);
1543 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1544 return (error);
1545 case VDIR:
1546 break;
1547 default:
1548 printf(" NFSNOCACHE: type %x unexpected\n", vtype);
1549 };
1550 }
1551 switch (vtype) {
1552 case VREG:
1553 lbn = uio->uio_offset / biosize;
1554
1555 /*
1556 * Copy directly from any cached pages without grabbing the bufs.
1557 *
1558 * Note: for "nocache" reads, we don't copy directly from UBC
1559 * because any cached pages will be for readahead buffers that
1560 * need to be invalidated anyway before we finish this request.
1561 */
1562 if (!(np->n_flag & NNOCACHE) &&
1563 (uio->uio_segflg == UIO_USERSPACE32 ||
1564 uio->uio_segflg == UIO_USERSPACE64 ||
1565 uio->uio_segflg == UIO_USERSPACE)) {
1566 // LP64todo - fix this!
1567 int io_resid = uio_uio_resid(uio);
1568 diff = np->n_size - uio->uio_offset;
1569 if (diff < io_resid)
1570 io_resid = diff;
1571 if (io_resid > 0) {
1572 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1573 if (error) {
1574 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1575 return (error);
1576 }
1577 }
1578 /* count any biocache reads that we just copied directly */
1579 if (lbn != uio->uio_offset / biosize) {
1580 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
1581 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1582 }
1583 }
1584
1585 lbn = uio->uio_offset / biosize;
1586 on = uio->uio_offset % biosize;
1587
1588 /*
1589 * Start the read ahead(s), as required.
1590 */
1591 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1592 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1593 rabn = lbn + 1 + nra;
1594 if (rabn <= lastrabn) {
1595 /* we've already (tried to) read this block */
1596 /* no need to try it again... */
1597 continue;
1598 }
1599 lastrabn = rabn;
1600 if ((off_t)rabn * biosize >= (off_t)np->n_size)
1601 break;
1602 if ((np->n_flag & NNOCACHE) &&
1603 (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio))))
1604 /* for uncached readahead, don't go beyond end of request */
1605 break;
1606 /* check if block exists and is valid. */
1607 error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1608 if (error) {
1609 FSDBG_BOT(514, vp, 0xd1e000b, 1, error);
1610 return (error);
1611 }
1612 if (!rabp)
1613 continue;
1614 if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) {
1615 nfs_buf_release(rabp, 1);
1616 continue;
1617 }
1618 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1619 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1620 if (nfs_asyncio(rabp, cred)) {
1621 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1622 rabp->nb_error = EIO;
1623 nfs_buf_release(rabp, 1);
1624 }
1625 } else
1626 nfs_buf_release(rabp, 1);
1627 }
1628 }
1629
1630 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
1631 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
1632 return (0);
1633 }
1634
1635 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
1636
1637 /*
1638 * If the block is in the cache and has the required data
1639 * in a valid region, just copy it out.
1640 * Otherwise, get the block and write back/read in,
1641 * as required.
1642 */
1643 again:
1644 bufsize = biosize;
1645 // LP64todo - fix this!
1646 n = min((unsigned)(bufsize - on), uio_uio_resid(uio));
1647 diff = np->n_size - uio->uio_offset;
1648 if (diff < n)
1649 n = diff;
1650
1651 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp);
1652 if (error) {
1653 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1654 return (EINTR);
1655 }
1656
1657 /* if any pages are valid... */
1658 if (bp->nb_valid) {
1659 /* ...check for any invalid pages in the read range */
1660 int pg, firstpg, lastpg, dirtypg;
1661 dirtypg = firstpg = lastpg = -1;
1662 pg = on/PAGE_SIZE;
1663 while (pg <= (on + n - 1)/PAGE_SIZE) {
1664 if (!NBPGVALID(bp,pg)) {
1665 if (firstpg < 0)
1666 firstpg = pg;
1667 lastpg = pg;
1668 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1669 dirtypg = pg;
1670 pg++;
1671 }
1672
1673 /* if there are no invalid pages, we're all set */
1674 if (firstpg < 0) {
1675 if (bp->nb_validoff < 0) {
1676 /* valid range isn't set up, so */
1677 /* set it to what we know is valid */
1678 bp->nb_validoff = trunc_page(on);
1679 bp->nb_validend = round_page(on+n);
1680 nfs_buf_normalize_valid_range(np, bp);
1681 }
1682 goto buffer_ready;
1683 }
1684
1685 /* there are invalid pages in the read range */
1686 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1687 /* there are also dirty page(s) in the range, */
1688 /* so write the buffer out and try again */
1689 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1690 SET(bp->nb_flags, NB_ASYNC);
1691 if (bp->nb_wcred == NOCRED) {
1692 kauth_cred_ref(cred);
1693 bp->nb_wcred = cred;
1694 }
1695 error = nfs_buf_write(bp);
1696 if (error) {
1697 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1698 return (error);
1699 }
1700 goto again;
1701 }
1702 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1703 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1704 /* we need to read in more than half the buffer and the */
1705 /* buffer's not dirty, so just fetch the whole buffer */
1706 bp->nb_valid = 0;
1707 } else {
1708 /* read the page range in */
1709 uio_t auio;
1710 char uio_buf[ UIO_SIZEOF(1) ];
1711
1712 NFS_BUF_MAP(bp);
1713 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
1714 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
1715 if (!auio) {
1716 error = ENOMEM;
1717 } else {
1718 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
1719 ((lastpg - firstpg + 1) * PAGE_SIZE));
1720 error = nfs_readrpc(vp, auio, cred, p);
1721 }
1722 if (error) {
1723 if (np->n_flag & NNOCACHE)
1724 SET(bp->nb_flags, NB_NOCACHE);
1725 nfs_buf_release(bp, 1);
1726 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1727 return (error);
1728 }
1729 /* Make sure that the valid range is set to cover this read. */
1730 bp->nb_validoff = trunc_page_32(on);
1731 bp->nb_validend = round_page_32(on+n);
1732 nfs_buf_normalize_valid_range(np, bp);
1733 if (uio_resid(auio) > 0) {
1734 /* if short read, must have hit EOF, */
1735 /* so zero the rest of the range */
1736 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
1737 }
1738 /* mark the pages (successfully read) as valid */
1739 for (pg=firstpg; pg <= lastpg; pg++)
1740 NBPGVALID_SET(bp,pg);
1741 }
1742 }
1743 /* if no pages are valid, read the whole block */
1744 if (!bp->nb_valid) {
1745 SET(bp->nb_flags, NB_READ);
1746 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1747 error = nfs_doio(bp, cred, p);
1748 if (error) {
1749 if (np->n_flag & NNOCACHE)
1750 SET(bp->nb_flags, NB_NOCACHE);
1751 nfs_buf_release(bp, 1);
1752 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1753 return (error);
1754 }
1755 }
1756 buffer_ready:
1757 /* validate read range against valid range and clip */
1758 if (bp->nb_validend > 0) {
1759 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1760 if (diff < n)
1761 n = diff;
1762 }
1763 if (n > 0)
1764 NFS_BUF_MAP(bp);
1765 break;
1766 case VLNK:
1767 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks);
1768 error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp);
1769 if (error) {
1770 FSDBG_BOT(514, vp, 0xd1e0010, 0, error);
1771 return (error);
1772 }
1773 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1774 SET(bp->nb_flags, NB_READ);
1775 error = nfs_doio(bp, cred, p);
1776 if (error) {
1777 SET(bp->nb_flags, NB_ERROR);
1778 nfs_buf_release(bp, 1);
1779 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1780 return (error);
1781 }
1782 }
1783 // LP64todo - fix this!
1784 n = min(uio_uio_resid(uio), bp->nb_validend);
1785 on = 0;
1786 break;
1787 case VDIR:
1788 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
1789 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1790 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1791 return (0);
1792 }
1793 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1794 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1795 error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1796 if (error) {
1797 FSDBG_BOT(514, vp, 0xd1e0012, 0, error);
1798 return (error);
1799 }
1800 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1801 SET(bp->nb_flags, NB_READ);
1802 error = nfs_doio(bp, cred, p);
1803 if (error) {
1804 nfs_buf_release(bp, 1);
1805 }
1806 while (error == NFSERR_BAD_COOKIE) {
1807 nfs_invaldir(vp);
1808 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1809 /*
1810 * Yuck! The directory has been modified on the
1811 * server. The only way to get the block is by
1812 * reading from the beginning to get all the
1813 * offset cookies.
1814 */
1815 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
1816 if (np->n_direofoffset
1817 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1818 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1819 return (0);
1820 }
1821 error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1822 if (error) {
1823 FSDBG_BOT(514, vp, 0xd1e0013, 0, error);
1824 return (error);
1825 }
1826 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1827 SET(bp->nb_flags, NB_READ);
1828 error = nfs_doio(bp, cred, p);
1829 /*
1830 * no error + NB_INVAL == directory EOF,
1831 * use the block.
1832 */
1833 if (error == 0 && (bp->nb_flags & NB_INVAL))
1834 break;
1835 }
1836 /*
1837 * An error will throw away the block and the
1838 * for loop will break out. If no error and this
1839 * is not the block we want, we throw away the
1840 * block and go for the next one via the for loop.
1841 */
1842 if (error || tlbn < lbn)
1843 nfs_buf_release(bp, 1);
1844 }
1845 }
1846 /*
1847 * The above while is repeated if we hit another cookie
1848 * error. If we hit an error and it wasn't a cookie error,
1849 * we give up.
1850 */
1851 if (error) {
1852 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1853 return (error);
1854 }
1855 }
1856
1857 /*
1858 * If not eof and read aheads are enabled, start one.
1859 * (You need the current block first, so that you have the
1860 * directory offset cookie of the next block.)
1861 */
1862 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1863 (np->n_direofoffset == 0 ||
1864 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1865 !nfs_buf_is_incore(vp, lbn + 1)) {
1866 error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1867 if (error) {
1868 FSDBG_BOT(514, vp, 0xd1e0015, 0, error);
1869 return (error);
1870 }
1871 if (rabp) {
1872 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1873 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1874 if (nfs_asyncio(rabp, cred)) {
1875 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1876 rabp->nb_error = EIO;
1877 nfs_buf_release(rabp, 1);
1878 }
1879 } else {
1880 nfs_buf_release(rabp, 1);
1881 }
1882 }
1883 }
1884 /*
1885 * Make sure we use a signed variant of min() since
1886 * the second term may be negative.
1887 */
1888 // LP64todo - fix this!
1889 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
1890 /*
1891 * We keep track of the directory eof in
1892 * np->n_direofoffset and chop it off as an
1893 * extra step right here.
1894 */
1895 if (np->n_direofoffset &&
1896 n > np->n_direofoffset - uio->uio_offset)
1897 n = np->n_direofoffset - uio->uio_offset;
1898 /*
1899 * Make sure that we return an integral number of entries so
1900 * that any subsequent calls will start copying from the start
1901 * of the next entry.
1902 *
1903 * If the current value of n has the last entry cut short,
1904 * set n to copy everything up to the last entry instead.
1905 */
1906 if (n > 0) {
1907 dp = bp->nb_data + on;
1908 while (dp < (bp->nb_data + on + n)) {
1909 direntp = (struct dirent *)dp;
1910 dp += direntp->d_reclen;
1911 }
1912 if (dp > (bp->nb_data + on + n))
1913 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1914 }
1915 break;
1916 default:
1917 printf("nfs_bioread: type %x unexpected\n", vtype);
1918 FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL);
1919 return (EINVAL);
1920 };
1921
1922 if (n > 0) {
1923 error = uiomove(bp->nb_data + on, (int)n, uio);
1924 }
1925 switch (vtype) {
1926 case VREG:
1927 if (np->n_flag & NNOCACHE)
1928 SET(bp->nb_flags, NB_NOCACHE);
1929 break;
1930 case VLNK:
1931 n = 0;
1932 break;
1933 case VDIR:
1934 break;
1935 default:
1936 break;
1937 }
1938 nfs_buf_release(bp, 1);
1939 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
1940 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1941 return (error);
1942 }
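/*
 * An illustrative sketch of the directory-entry clipping done in
 * nfs_bioread() above: given a block of struct dirent records and a byte
 * count n, trim n back so it ends on an entry boundary and the next call
 * starts copying at the start of a whole entry.  The helper name is
 * hypothetical and it is not called by the surrounding code.
 */
static int
nfs_clip_to_dirent_boundary_sketch(char *base, int on, int n)
{
	char *dp = base + on;
	struct dirent *direntp = NULL;

	/* walk whole records until we reach or pass the requested end */
	while (dp < (base + on + n)) {
		direntp = (struct dirent *)dp;
		dp += direntp->d_reclen;
	}
	/* if the last record runs past the requested end, back n up to its start */
	if (direntp && (dp > (base + on + n)))
		n = (dp - direntp->d_reclen) - (base + on);
	return (n);
}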
1943
1944
1945 /*
1946 * Vnode op for write using bio
1947 */
1948 int
1949 nfs_write(ap)
1950 struct vnop_write_args /* {
1951 struct vnodeop_desc *a_desc;
1952 vnode_t a_vp;
1953 struct uio *a_uio;
1954 int a_ioflag;
1955 vfs_context_t a_context;
1956 } */ *ap;
1957 {
1958 struct uio *uio = ap->a_uio;
1959 vnode_t vp = ap->a_vp;
1960 struct nfsnode *np = VTONFS(vp);
1961 proc_t p;
1962 kauth_cred_t cred;
1963 int ioflag = ap->a_ioflag;
1964 struct nfsbuf *bp;
1965 struct nfs_vattr nvattr;
1966 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1967 daddr64_t lbn;
1968 int biosize, bufsize;
1969 int n, on, error = 0;
1970 off_t boff, start, end, cureof;
1971 struct iovec_32 iov;
1972 struct uio auio;
1973
1974 FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1975
1976 #if DIAGNOSTIC
1977 if (uio->uio_rw != UIO_WRITE)
1978 panic("nfs_write mode");
1979 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
1980 panic("nfs_write proc");
1981 #endif
1982
1983 p = vfs_context_proc(ap->a_context);
1984 cred = vfs_context_ucred(ap->a_context);
1985
1986 if (vnode_vtype(vp) != VREG)
1987 return (EIO);
1988
1989 np->n_flag |= NWRBUSY;
1990
1991 if (np->n_flag & NNEEDINVALIDATE) {
1992 np->n_flag &= ~NNEEDINVALIDATE;
1993 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1994 }
1995 if (np->n_flag & NWRITEERR) {
1996 np->n_flag &= ~(NWRITEERR | NWRBUSY);
1997 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error);
1998 return (np->n_error);
1999 }
2000
2001 biosize = nmp->nm_biosize;
2002 if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
2003 nfs_fsinfo(nmp, vp, cred, p);
2004
2005 if (ioflag & (IO_APPEND | IO_SYNC)) {
2006 if (np->n_flag & NMODIFIED) {
2007 NATTRINVALIDATE(np);
2008 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2009 if (error) {
2010 np->n_flag &= ~NWRBUSY;
2011 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
2012 return (error);
2013 }
2014 }
2015 if (ioflag & IO_APPEND) {
2016 NATTRINVALIDATE(np);
2017 error = nfs_getattr(vp, &nvattr, cred, p);
2018 if (error) {
2019 np->n_flag &= ~NWRBUSY;
2020 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
2021 return (error);
2022 }
2023 uio->uio_offset = np->n_size;
2024 }
2025 }
2026 if (uio->uio_offset < 0) {
2027 np->n_flag &= ~NWRBUSY;
2028 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
2029 return (EINVAL);
2030 }
2031 if (uio_uio_resid(uio) == 0) {
2032 np->n_flag &= ~NWRBUSY;
2033 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2034 return (0);
2035 }
2036
2037 if (vnode_isnocache(vp)) {
2038 if (!(np->n_flag & NNOCACHE)) {
2039 if (NVALIDBUFS(np)) {
2040 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2041 if (error) {
2042 np->n_flag &= ~NWRBUSY;
2043 FSDBG_BOT(515, vp, 0, 0, error);
2044 return (error);
2045 }
2046 }
2047 np->n_flag |= NNOCACHE;
2048 }
2049 } else if (np->n_flag & NNOCACHE) {
2050 np->n_flag &= ~NNOCACHE;
2051 }
2052
2053 do {
2054 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes);
2055 lbn = uio->uio_offset / biosize;
2056 on = uio->uio_offset % biosize;
2057 // LP64todo - fix this
2058 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
2059 again:
2060 bufsize = biosize;
2061 /*
2062 * Get a cache block for writing. The range to be written is
2063 * (on..on+n) within the block. We ensure that the block
2064 * either has no dirty region or that the given range is
2065 * contiguous with the existing dirty region.
2066 */
2067 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp);
2068 if (error) {
2069 np->n_flag &= ~NWRBUSY;
2070 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2071 return (error);
2072 }
2073 /* map the block because we know we're going to write to it */
2074 NFS_BUF_MAP(bp);
2075
2076 if (np->n_flag & NNOCACHE)
2077 SET(bp->nb_flags, NB_NOCACHE);
2078
2079 if (bp->nb_wcred == NOCRED) {
2080 kauth_cred_ref(cred);
2081 bp->nb_wcred = cred;
2082 }
2083
2084 /*
2085 * If there's already a dirty range AND dirty pages in this block we
2086 * need to send a commit AND write the dirty pages before continuing.
2087 *
2088 * If there's already a dirty range OR dirty pages in this block
2089 * and the new write range is not contiguous with the existing range,
2090 * then force the buffer to be written out now.
2091 * (We used to just extend the dirty range to cover the valid,
2092 * but unwritten, data in between also. But writing ranges
2093 * of data that weren't actually written by an application
2094 * risks overwriting some other client's data with stale data
2095 * that's just masquerading as newly written data.)
2096 */
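/*
 * For example: with no dirty pages (nb_dirty == 0) and an existing dirty
 * range of [512..1024), a new write at on=1024, n=512 abuts that range
 * (on <= nb_dirtyend and on+n >= nb_dirtyoff), so the two are merged when
 * the dirty range is updated below.  A write at on=8192 would leave a gap,
 * so the buffer is pushed out synchronously first and the write retried.
 */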
2097 if (bp->nb_dirtyend > 0) {
2098 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
2099 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
2100 /* write/commit buffer "synchronously" */
2101 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2102 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2103 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2104 error = nfs_buf_write(bp);
2105 if (error) {
2106 np->n_flag &= ~NWRBUSY;
2107 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2108 return (error);
2109 }
2110 goto again;
2111 }
2112 } else if (bp->nb_dirty) {
2113 int firstpg, lastpg;
2114 u_int32_t pagemask;
2115 /* calculate write range pagemask */
2116 firstpg = on/PAGE_SIZE;
2117 lastpg = (on+n-1)/PAGE_SIZE;
2118 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
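/*
 * For example: with PAGE_SIZE 4096, a write at on=4096 of n=6000 bytes
 * touches pages 1 and 2, so firstpg=1, lastpg=2 and pagemask=0x6; any bit
 * left in (nb_dirty & ~pagemask) means dirty data outside the write range.
 */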
2119 /* check if there are dirty pages outside the write range */
2120 if (bp->nb_dirty & ~pagemask) {
2121 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
2122 /* write/commit buffer "synchronously" */
2123 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2124 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2125 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2126 error = nfs_buf_write(bp);
2127 if (error) {
2128 np->n_flag &= ~NWRBUSY;
2129 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2130 return (error);
2131 }
2132 goto again;
2133 }
2134 /* if the first or last pages are already dirty */
2135 /* make sure that the dirty range encompasses those pages */
2136 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
2137 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
2138 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
2139 if (NBPGDIRTY(bp,lastpg)) {
2140 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
2141 /* clip to EOF */
2142 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2143 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2144 } else
2145 bp->nb_dirtyend = on+n;
2146 }
2147 }
2148
2149 /*
2150 * Are we extending the size of the file with this write?
2151 * If so, update file size now that we have the block.
2152 * If there was a partial buf at the old eof, validate
2153 * and zero the new bytes.
2154 */
2155 cureof = (off_t)np->n_size;
2156 if (uio->uio_offset + n > (off_t)np->n_size) {
2157 struct nfsbuf *eofbp = NULL;
2158 daddr64_t eofbn = np->n_size / biosize;
2159 int eofoff = np->n_size % biosize;
2160 int neweofoff = (uio->uio_offset + n) % biosize;
2161
2162 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
2163
2164 if (eofoff && (eofbn < lbn)) {
2165 error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp);
2166 if (error) {
2167 np->n_flag &= ~NWRBUSY;
2168 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2169 return (error);
2170 }
2171 }
2172
2173 /* if we're extending within the same last block */
2174 /* and the block is flagged as being cached... */
2175 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2176 /* ...check that all pages in buffer are valid */
2177 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2178 u_int32_t pagemask;
2179 /* pagemask only has to extend to last page being written to */
2180 pagemask = (1 << (endpg+1)) - 1;
2181 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2182 if ((bp->nb_valid & pagemask) != pagemask) {
2183 /* zerofill any hole */
2184 if (on > bp->nb_validend) {
2185 int i;
2186 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2187 NBPGVALID_SET(bp, i);
2188 NFS_BUF_MAP(bp);
2189 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2190 bzero((char *)bp->nb_data + bp->nb_validend,
2191 on - bp->nb_validend);
2192 }
2193 /* zerofill any trailing data in the last page */
2194 if (neweofoff) {
2195 NFS_BUF_MAP(bp);
2196 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2197 bzero((char *)bp->nb_data + neweofoff,
2198 PAGE_SIZE - (neweofoff & PAGE_MASK));
2199 }
2200 }
2201 }
2202 np->n_flag |= NMODIFIED;
2203 np->n_size = uio->uio_offset + n;
2204 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
2205 if (eofbp) {
2206 /*
2207 * We may need to zero any previously invalid data
2208 * after the old EOF in the previous EOF buffer.
2209 *
2210 * For the old last page, don't zero bytes if there
2211 * are invalid bytes in that page (i.e. the page isn't
2212 * currently valid).
2213 * For pages after the old last page, zero them and
2214 * mark them as valid.
2215 */
2216 char *d;
2217 int i;
2218 if (np->n_flag & NNOCACHE)
2219 SET(eofbp->nb_flags, NB_NOCACHE);
2220 NFS_BUF_MAP(eofbp);
2221 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2222 d = eofbp->nb_data;
2223 i = eofoff/PAGE_SIZE;
2224 while (eofoff < biosize) {
2225 int poff = eofoff & PAGE_MASK;
2226 if (!poff || NBPGVALID(eofbp,i)) {
2227 bzero(d + eofoff, PAGE_SIZE - poff);
2228 NBPGVALID_SET(eofbp, i);
2229 }
2230 if (bp->nb_validend == eofoff)
2231 bp->nb_validend += PAGE_SIZE - poff;
2232 eofoff += PAGE_SIZE - poff;
2233 i++;
2234 }
2235 nfs_buf_release(eofbp, 1);
2236 }
2237 }
2238 /*
2239 * If dirtyend exceeds file size, chop it down. This should
2240 * not occur unless there is a race.
2241 */
2242 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2243 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2244 /*
2245 * UBC doesn't handle partial pages, so we need to make sure
2246 * that any pages left in the page cache are completely valid.
2247 *
2248 * Writes that are smaller than a block are delayed if they
2249 * don't extend to the end of the block.
2250 *
2251 * If the block isn't (completely) cached, we may need to read
2252 * in some parts of pages that aren't covered by the write.
2253 * If the write offset (on) isn't page aligned, we'll need to
2254 * read the start of the first page being written to. Likewise,
2255 * if the offset of the end of the write (on+n) isn't page aligned,
2256 * we'll need to read the end of the last page being written to.
2257 *
2258 * Notes:
2259 * We don't want to read anything we're just going to write over.
2260 * We don't want to issue multiple I/Os if we don't have to
2261 * (because they're synchronous rpcs).
2262 * We don't want to read anything we already have modified in the
2263 * page cache.
2264 */
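/*
 * For example: a 100-byte write at on=4000 into an uncached 8K buffer
 * leaves [0..4000) of page 0 and [4100..8192) of page 1 uncovered, so
 * those byte ranges may have to be read from the server below before the
 * buffer can be considered valid.
 */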
2265 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
2266 int firstpg, lastpg, dirtypg;
2267 int firstpgoff, lastpgoff;
2268 start = end = -1;
2269 firstpg = on/PAGE_SIZE;
2270 firstpgoff = on & PAGE_MASK;
2271 lastpg = (on+n-1)/PAGE_SIZE;
2272 lastpgoff = (on+n) & PAGE_MASK;
2273 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2274 /* need to read start of first page */
2275 start = firstpg * PAGE_SIZE;
2276 end = start + firstpgoff;
2277 }
2278 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2279 /* need to read end of last page */
2280 if (start < 0)
2281 start = (lastpg * PAGE_SIZE) + lastpgoff;
2282 end = (lastpg + 1) * PAGE_SIZE;
2283 }
2284 if (end > start) {
2285 /* need to read the data in range: start...end-1 */
2286
2287 /* first, check for dirty pages in between */
2288 /* if there are, we'll have to do two reads because */
2289 /* we don't want to overwrite the dirty pages. */
2290 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2291 if (NBPGDIRTY(bp,dirtypg))
2292 break;
2293
2294 /* if start is at beginning of page, try */
2295 /* to get any preceding pages as well. */
2296 if (!(start & PAGE_MASK)) {
2297 /* stop at next dirty/valid page or start of block */
2298 for (; start > 0; start-=PAGE_SIZE)
2299 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2300 break;
2301 }
2302
2303 NFS_BUF_MAP(bp);
2304 /* setup uio for read(s) */
2305 boff = NBOFF(bp);
2306 auio.uio_iovs.iov32p = &iov;
2307 auio.uio_iovcnt = 1;
2308 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2309 auio.uio_segflg = UIO_SYSSPACE;
2310 #else
2311 auio.uio_segflg = UIO_SYSSPACE32;
2312 #endif
2313 auio.uio_rw = UIO_READ;
2314
2315 if (dirtypg <= (end-1)/PAGE_SIZE) {
2316 /* there's a dirty page in the way, so just do two reads */
2317 /* we'll read the preceding data here */
2318 auio.uio_offset = boff + start;
2319 iov.iov_len = on - start;
2320 uio_uio_resid_set(&auio, iov.iov_len);
2321 iov.iov_base = (uintptr_t) bp->nb_data + start;
2322 error = nfs_readrpc(vp, &auio, cred, p);
2323 if (error) {
2324 bp->nb_error = error;
2325 SET(bp->nb_flags, NB_ERROR);
2326 printf("nfs_write: readrpc %d\n", error);
2327 }
2328 if (uio_uio_resid(&auio) > 0) {
2329 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01);
2330 // LP64todo - fix this
2331 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2332 }
2333 /* update validoff/validend if necessary */
2334 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2335 bp->nb_validoff = start;
2336 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2337 bp->nb_validend = on;
2338 if ((off_t)np->n_size > boff + bp->nb_validend)
2339 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2340 /* validate any pages before the write offset */
2341 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2342 NBPGVALID_SET(bp, start/PAGE_SIZE);
2343 /* adjust start to read any trailing data */
2344 start = on+n;
2345 }
2346
2347 /* if end is at end of page, try to */
2348 /* get any following pages as well. */
2349 if (!(end & PAGE_MASK)) {
2350 /* stop at next valid page or end of block */
2351 for (; end < bufsize; end+=PAGE_SIZE)
2352 if (NBPGVALID(bp,end/PAGE_SIZE))
2353 break;
2354 }
2355
2356 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2357 /*
2358 * Either this entire read is beyond the current EOF
2359 * or the range that we won't be modifying (on+n...end)
2360 * is all beyond the current EOF.
2361 * No need to make a trip across the network to
2362 * read nothing. So, just zero the buffer instead.
2363 */
2364 FSDBG(516, bp, start, end - start, 0xd00dee00);
2365 bzero(bp->nb_data + start, end - start);
2366 } else {
2367 /* now we'll read the (rest of the) data */
2368 auio.uio_offset = boff + start;
2369 iov.iov_len = end - start;
2370 uio_uio_resid_set(&auio, iov.iov_len);
2371 iov.iov_base = (uintptr_t) (bp->nb_data + start);
2372 error = nfs_readrpc(vp, &auio, cred, p);
2373 if (error) {
2374 bp->nb_error = error;
2375 SET(bp->nb_flags, NB_ERROR);
2376 printf("nfs_write: readrpc %d\n", error);
2377 }
2378 if (uio_uio_resid(&auio) > 0) {
2379 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02);
2380 // LP64todo - fix this
2381 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2382 }
2383 }
2384 /* update validoff/validend if necessary */
2385 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2386 bp->nb_validoff = start;
2387 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2388 bp->nb_validend = end;
2389 if ((off_t)np->n_size > boff + bp->nb_validend)
2390 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2391 /* validate any pages before the write offset's page */
2392 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2393 NBPGVALID_SET(bp, start/PAGE_SIZE);
2394 /* validate any pages after the range of pages being written to */
2395 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2396 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2397 /* Note: pages being written to will be validated when written */
2398 }
2399 }
2400
2401 if (ISSET(bp->nb_flags, NB_ERROR)) {
2402 error = bp->nb_error;
2403 nfs_buf_release(bp, 1);
2404 np->n_flag &= ~NWRBUSY;
2405 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2406 return (error);
2407 }
2408
2409 np->n_flag |= NMODIFIED;
2410
2411 NFS_BUF_MAP(bp);
2412 error = uiomove((char *)bp->nb_data + on, n, uio);
2413 if (error) {
2414 SET(bp->nb_flags, NB_ERROR);
2415 nfs_buf_release(bp, 1);
2416 np->n_flag &= ~NWRBUSY;
2417 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2418 return (error);
2419 }
2420
2421 /* validate any pages written to */
2422 start = on & ~PAGE_MASK;
2423 for (; start < on+n; start += PAGE_SIZE) {
2424 NBPGVALID_SET(bp, start/PAGE_SIZE);
2425 /*
2426 * This may seem a little weird, but we don't actually set the
2427 * dirty bits for writes. This is because we keep the dirty range
2428 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2429 * delayed writes, when we give the pages back to the VM we don't
2430 * want to keep them marked dirty, because when we later write the
2431 * buffer we won't be able to tell which pages were written dirty
2432 * and which pages were mmapped and dirtied.
2433 */
2434 }
2435 if (bp->nb_dirtyend > 0) {
2436 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2437 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2438 } else {
2439 bp->nb_dirtyoff = on;
2440 bp->nb_dirtyend = on + n;
2441 }
2442 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2443 bp->nb_validoff > bp->nb_dirtyend) {
2444 bp->nb_validoff = bp->nb_dirtyoff;
2445 bp->nb_validend = bp->nb_dirtyend;
2446 } else {
2447 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2448 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2449 }
2450 if (!ISSET(bp->nb_flags, NB_CACHE))
2451 nfs_buf_normalize_valid_range(np, bp);
2452
2453 /*
2454 * Since this block is being modified, it must be written
2455 * again and not just committed.
2456 */
2457 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2458 np->n_needcommitcnt--;
2459 CHECK_NEEDCOMMITCNT(np);
2460 }
2461 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2462
2463 if (ioflag & IO_SYNC) {
2464 bp->nb_proc = p;
2465 error = nfs_buf_write(bp);
2466 if (error) {
2467 np->n_flag &= ~NWRBUSY;
2468 FSDBG_BOT(515, vp, uio->uio_offset,
2469 uio_uio_resid(uio), error);
2470 return (error);
2471 }
2472 } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) {
2473 bp->nb_proc = NULL;
2474 SET(bp->nb_flags, NB_ASYNC);
2475 nfs_buf_write(bp);
2476 } else
2477 nfs_buf_write_delayed(bp, p);
2478
2479 if (np->n_needcommitcnt > (nfsbufcnt/16))
2480 nfs_flushcommits(vp, p, 1);
2481
2482 } while (uio_uio_resid(uio) > 0 && n > 0);
2483
2484 if (np->n_flag & NNOCACHE) {
2485 /* make sure all the buffers are flushed out */
2486 error = nfs_flush(vp, MNT_WAIT, cred, p, 0);
2487 }
2488
2489 np->n_flag &= ~NWRBUSY;
2490 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2491 return (error);
2492 }
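/*
 * An illustrative sketch of the dirty/valid range bookkeeping nfs_write()
 * performs after copying user data into a buffer at [on..on+n): the dirty
 * range grows to cover the write and the valid range grows to cover the
 * dirty range.  The struct and helper are hypothetical and are not used
 * by the surrounding code.
 */
struct nfs_range_sketch {
	int dirtyoff, dirtyend;
	int validoff, validend;
};

static void
nfs_merge_write_range_sketch(struct nfs_range_sketch *r, int on, int n)
{
	/* grow (or start) the dirty range to cover the new write */
	if (r->dirtyend > 0) {
		r->dirtyoff = min(on, r->dirtyoff);
		r->dirtyend = max(on + n, r->dirtyend);
	} else {
		r->dirtyoff = on;
		r->dirtyend = on + n;
	}
	/* make sure the valid range covers the dirty range */
	if (r->validend <= 0 || r->validend < r->dirtyoff ||
	    r->validoff > r->dirtyend) {
		r->validoff = r->dirtyoff;
		r->validend = r->dirtyend;
	} else {
		r->validoff = min(r->validoff, r->dirtyoff);
		r->validend = max(r->validend, r->dirtyend);
	}
}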
2493
2494 /*
2495 * Flush out and invalidate all buffers associated with a vnode.
2496 * Called with the underlying object locked.
2497 */
2498 static int
2499 nfs_vinvalbuf_internal(
2500 vnode_t vp,
2501 int flags,
2502 kauth_cred_t cred,
2503 proc_t p,
2504 int slpflag,
2505 int slptimeo)
2506 {
2507 struct nfsbuf *bp;
2508 struct nfsbuflists blist;
2509 int list, error = 0;
2510 struct nfsnode *np = VTONFS(vp);
2511
2512 if (flags & V_SAVE) {
2513 if ((error = nfs_flush(vp, MNT_WAIT, cred, p,
2514 (flags & V_IGNORE_WRITEERR))))
2515 return (error);
2516 if (!LIST_EMPTY(&np->n_dirtyblkhd))
2517 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2518 vp, LIST_FIRST(&np->n_dirtyblkhd));
2519 }
2520
2521 lck_mtx_lock(nfs_buf_mutex);
2522 for (;;) {
2523 list = NBI_CLEAN;
2524 if (nfs_buf_iterprepare(np, &blist, list)) {
2525 list = NBI_DIRTY;
2526 if (nfs_buf_iterprepare(np, &blist, list))
2527 break;
2528 }
2529 while ((bp = LIST_FIRST(&blist))) {
2530 LIST_REMOVE(bp, nb_vnbufs);
2531 if (list == NBI_CLEAN)
2532 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2533 else
2534 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2535 nfs_buf_refget(bp);
2536 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2537 FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags);
2538 if (error != EAGAIN) {
2539 FSDBG(554, vp, bp, -1, error);
2540 nfs_buf_refrele(bp);
2541 nfs_buf_itercomplete(np, &blist, list);
2542 lck_mtx_unlock(nfs_buf_mutex);
2543 return (error);
2544 }
2545 }
2546 nfs_buf_refrele(bp);
2547 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2548 lck_mtx_unlock(nfs_buf_mutex);
2549 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp &&
2550 (NBOFF(bp) < (off_t)np->n_size)) {
2551 /* XXX extra paranoia: make sure we're not */
2552 /* somehow leaving any dirty data around */
2553 int mustwrite = 0;
2554 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
2555 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
2556 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2557 error = nfs_buf_upl_setup(bp);
2558 if (error == EINVAL) {
2559 /* vm object must no longer exist */
2560 /* hopefully we don't need to do */
2561 /* anything for this buffer */
2562 } else if (error)
2563 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
2564 bp->nb_valid = bp->nb_dirty = 0;
2565 }
2566 nfs_buf_upl_check(bp);
2567 /* check for any dirty data before the EOF */
2568 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2569 /* clip dirty range to EOF */
2570 if (bp->nb_dirtyend > end)
2571 bp->nb_dirtyend = end;
2572 mustwrite++;
2573 }
2574 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2575 /* also make sure we'll have a credential to do the write */
2576 if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) {
2577 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
2578 mustwrite = 0;
2579 }
2580 if (mustwrite) {
2581 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2582 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2583 panic("nfs_vinvalbuf: dirty buffer without upl");
2584 /* gotta write out dirty data before invalidating */
2585 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2586 /* (NB_NOCACHE indicates buffer should be discarded) */
2587 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2588 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2589 if (bp->nb_wcred == NOCRED) {
2590 kauth_cred_ref(cred);
2591 bp->nb_wcred = cred;
2592 }
2593 error = nfs_buf_write(bp);
2594 // Note: bp has been released
2595 if (error) {
2596 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2597 np->n_error = error;
2598 np->n_flag |= NWRITEERR;
2599 /*
2600 * There was a write error and we need to
2601 * invalidate attrs to sync with server.
2602 * (if this write was extending the file,
2603 * we may no longer know the correct size)
2604 */
2605 NATTRINVALIDATE(np);
2606 error = 0;
2607 }
2608 lck_mtx_lock(nfs_buf_mutex);
2609 continue;
2610 }
2611 }
2612 SET(bp->nb_flags, NB_INVAL);
2613 // hold off on FREEUPs until we're done here
2614 nfs_buf_release(bp, 0);
2615 lck_mtx_lock(nfs_buf_mutex);
2616 }
2617 nfs_buf_itercomplete(np, &blist, list);
2618 }
2619 lck_mtx_unlock(nfs_buf_mutex);
2620 NFS_BUF_FREEUP();
2621 if (NVALIDBUFS(np))
2622 panic("nfs_vinvalbuf: flush failed");
2623 return (0);
2624 }
2625
2626
2627 /*
2628 * Flush and invalidate all dirty buffers. If another process is already
2629 * doing the flush, just wait for completion.
2630 */
2631 int
2632 nfs_vinvalbuf(
2633 vnode_t vp,
2634 int flags,
2635 kauth_cred_t cred,
2636 proc_t p,
2637 int intrflg)
2638 {
2639 struct nfsnode *np = VTONFS(vp);
2640 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
2641 int error = 0, slpflag, slptimeo;
2642 off_t size;
2643
2644 FSDBG_TOP(554, vp, flags, intrflg, 0);
2645
2646 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2647 intrflg = 0;
2648 if (intrflg) {
2649 slpflag = PCATCH;
2650 slptimeo = 2 * hz;
2651 } else {
2652 slpflag = 0;
2653 slptimeo = 0;
2654 }
2655 /*
2656 * First wait for any other process doing a flush to complete.
2657 */
2658 while (np->n_flag & NFLUSHINPROG) {
2659 np->n_flag |= NFLUSHWANT;
2660 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2661 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2662 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2663 if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
2664 FSDBG_BOT(554, vp, flags, intrflg, error);
2665 return (error);
2666 }
2667 }
2668
2669 /*
2670 * Now, flush as required.
2671 */
2672 np->n_flag |= NFLUSHINPROG;
2673 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2674 while (error) {
2675 FSDBG(554, vp, 0, 0, error);
2676 error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p);
2677 if (error) {
2678 np->n_flag &= ~NFLUSHINPROG;
2679 if (np->n_flag & NFLUSHWANT) {
2680 np->n_flag &= ~NFLUSHWANT;
2681 wakeup((caddr_t)&np->n_flag);
2682 }
2683 FSDBG_BOT(554, vp, flags, intrflg, error);
2684 return (error);
2685 }
2686 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2687 }
2688 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2689 if (np->n_flag & NFLUSHWANT) {
2690 np->n_flag &= ~NFLUSHWANT;
2691 wakeup((caddr_t)&np->n_flag);
2692 }
2693 /*
2694 * get the pages out of vm also
2695 */
2696 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
2697 int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE);
2698 if (!rv)
2699 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
2700 }
2701
2702 FSDBG_BOT(554, vp, flags, intrflg, 0);
2703 return (0);
2704 }
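/*
 * An illustrative sketch of the NFLUSHINPROG/NFLUSHWANT handshake used by
 * nfs_vinvalbuf() above: only one flusher runs at a time; later callers
 * sleep on &np->n_flag and are woken when the owner clears the flag.
 * Signal/error handling is omitted and the helpers are hypothetical.
 */
static void
nfs_flush_enter_sketch(struct nfsnode *np)
{
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 0);
	}
	np->n_flag |= NFLUSHINPROG;
}

static void
nfs_flush_exit_sketch(struct nfsnode *np)
{
	np->n_flag &= ~NFLUSHINPROG;
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
}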
2705
2706 /*
2707 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2708 * This is mainly to avoid queueing async I/O requests when the nfsiods
2709 * are all hung on a dead server.
2710 */
2711 int
2712 nfs_asyncio(bp, cred)
2713 struct nfsbuf *bp;
2714 kauth_cred_t cred;
2715 {
2716 struct nfsmount *nmp;
2717 int i;
2718 int gotiod;
2719 int slpflag = 0;
2720 int slptimeo = 0;
2721 int error, error2;
2722 void *wakeme = NULL;
2723 struct timespec ts;
2724
2725 if (nfs_numasync == 0)
2726 return (EIO);
2727
2728 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2729
2730 nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL);
2731 again:
2732 if (nmp && nmp->nm_flag & NFSMNT_INT)
2733 slpflag = PCATCH;
2734 gotiod = FALSE;
2735
2736 lck_mtx_lock(nfs_iod_mutex);
2737
2738 /* no nfsbuf means tell nfsiod to process delwri list */
2739 if (!bp)
2740 nfs_ioddelwri = 1;
2741
2742 /*
2743 * Find a free iod to process this request.
2744 */
2745 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2746 if (nfs_iodwant[i]) {
2747 /*
2748 * Found one, so wake it up and tell it which
2749 * mount to process.
2750 */
2751 nfs_iodwant[i] = NULL;
2752 nfs_iodmount[i] = nmp;
2753 if (nmp)
2754 nmp->nm_bufqiods++;
2755 wakeme = &nfs_iodwant[i];
2756 gotiod = TRUE;
2757 break;
2758 }
2759
2760 /* if we're just poking the delwri list, we're done */
2761 if (!bp) {
2762 lck_mtx_unlock(nfs_iod_mutex);
2763 if (wakeme)
2764 wakeup(wakeme);
2765 FSDBG_BOT(552, bp, 0x10101010, wakeme, 0);
2766 return (0);
2767 }
2768
2769 /*
2770 * If none are free, we may already have an iod working on this mount
2771 * point. If so, it will process our request.
2772 */
2773 if (!gotiod) {
2774 if (nmp->nm_bufqiods > 0) {
2775 gotiod = TRUE;
2776 }
2777 }
2778
2779 /*
2780 * If we have an iod which can process the request, then queue
2781 * the buffer.
2782 */
2783 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2784 if (gotiod) {
2785 /*
2786 * Ensure that the queue never grows too large.
2787 */
2788 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2789 if (ISSET(bp->nb_flags, NB_IOD)) {
2790 /* An nfsiod is attempting this async operation so */
2791 /* we must not fall asleep on the bufq because we */
2792 /* could be waiting on ourself. Just return error */
2793 /* and we'll do this operation synchronously. */
2794 goto out;
2795 }
2796 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2797 nmp->nm_bufqwant = TRUE;
2798
2799 ts.tv_sec = (slptimeo/100);
2800 /* hz is 100, so each tick is 10ms */
2801 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
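/*
 * For example: slptimeo is in hz ticks, so slptimeo = 2*hz = 200 gives
 * ts = {2s, 0ns}, and slptimeo = 150 gives ts = {1s, 500000000ns}
 * (50 ticks * 10ms).
 */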
2802
2803 error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO,
2804 "nfsaio", &ts);
2805 if (error) {
2806 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2807 if (error2) {
2808 lck_mtx_unlock(nfs_iod_mutex);
2809 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2810 return (error2);
2811 }
2812 if (slpflag == PCATCH) {
2813 slpflag = 0;
2814 slptimeo = 2 * hz;
2815 }
2816 }
2817 /*
2818 * We might have lost our iod while sleeping,
2819 * so check and loop if necessary.
2820 */
2821 if (nmp->nm_bufqiods == 0) {
2822 lck_mtx_unlock(nfs_iod_mutex);
2823 goto again;
2824 }
2825 }
2826
2827 if (ISSET(bp->nb_flags, NB_READ)) {
2828 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2829 kauth_cred_ref(cred);
2830 bp->nb_rcred = cred;
2831 }
2832 } else {
2833 SET(bp->nb_flags, NB_WRITEINPROG);
2834 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2835 kauth_cred_ref(cred);
2836 bp->nb_wcred = cred;
2837 }
2838 }
2839
2840 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2841 nmp->nm_bufqlen++;
2842 lck_mtx_unlock(nfs_iod_mutex);
2843 if (wakeme)
2844 wakeup(wakeme);
2845 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2846 return (0);
2847 }
2848
2849 out:
2850 lck_mtx_unlock(nfs_iod_mutex);
2851 /*
2852 * All the iods are busy on other mounts, so return EIO to
2853 * force the caller to process the i/o synchronously.
2854 */
2855 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2856 return (EIO);
2857 }
2858
2859 /*
2860 * Do an I/O operation to/from a cache block. This may be called
2861 * synchronously or from an nfsiod.
2862 */
2863 int
2864 nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p)
2865 {
2866 struct uio *uiop;
2867 vnode_t vp;
2868 struct nfsnode *np;
2869 struct nfsmount *nmp;
2870 int error = 0, diff, len, iomode, must_commit = 0, invalidate = 0;
2871 struct uio uio;
2872 struct iovec_32 io;
2873 enum vtype vtype;
2874
2875 vp = bp->nb_vp;
2876 vtype = vnode_vtype(vp);
2877 np = VTONFS(vp);
2878 nmp = VFSTONFS(vnode_mount(vp));
2879 uiop = &uio;
2880 uiop->uio_iovs.iov32p = &io;
2881 uiop->uio_iovcnt = 1;
2882 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2883 uiop->uio_segflg = UIO_SYSSPACE;
2884 #else
2885 uiop->uio_segflg = UIO_SYSSPACE32;
2886 #endif
2887
2888 /*
2889 * we've decided to perform I/O for this block,
2890 * so it can't possibly be NB_DONE. So, clear it.
2891 */
2892 if (ISSET(bp->nb_flags, NB_DONE)) {
2893 if (!ISSET(bp->nb_flags, NB_ASYNC))
2894 panic("nfs_doio: done and not async");
2895 CLR(bp->nb_flags, NB_DONE);
2896 }
2897 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2898 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2899 bp->nb_dirtyend);
2900
2901 if (ISSET(bp->nb_flags, NB_READ)) {
2902 if (vtype == VREG)
2903 NFS_BUF_MAP(bp);
2904 io.iov_len = bp->nb_bufsize;
2905 uio_uio_resid_set(uiop, io.iov_len);
2906 io.iov_base = (uintptr_t) bp->nb_data;
2907 uiop->uio_rw = UIO_READ;
2908 switch (vtype) {
2909 case VREG:
2910 uiop->uio_offset = NBOFF(bp);
2911 OSAddAtomic(1, (SInt32*)&nfsstats.read_bios);
2912 error = nfs_readrpc(vp, uiop, cr, p);
2913 FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error);
2914 if (!error) {
2915 /* update valid range */
2916 bp->nb_validoff = 0;
2917 if (uio_uio_resid(uiop) != 0) {
2918 /*
2919 * If len > 0, there is a hole in the file and
2920 * no writes after the hole have been pushed to
2921 * the server yet.
2922 * Just zero fill the rest of the valid area.
2923 */
2924 // LP64todo - fix this
2925 diff = bp->nb_bufsize - uio_uio_resid(uiop);
2926 len = np->n_size - (NBOFF(bp) + diff);
2927 if (len > 0) {
2928 // LP64todo - fix this
2929 len = min(len, uio_uio_resid(uiop));
2930 bzero((char *)bp->nb_data + diff, len);
2931 bp->nb_validend = diff + len;
2932 FSDBG(258, diff, len, 0, 1);
2933 } else
2934 bp->nb_validend = diff;
2935 } else
2936 bp->nb_validend = bp->nb_bufsize;
2937 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
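/*
 * For example: if valid data ends 10000 bytes into a 32K buffer
 * (nb_validend = 10000), round_page_32(10000)/PAGE_SIZE = 3, so
 * nb_valid = (1 << 3) - 1 = 0x7 (pages 0-2 valid); since 10000 is not
 * page aligned, the invalid tail of the buffer is zeroed just below.
 */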
2938 if (bp->nb_validend & PAGE_MASK) {
2939 /* valid range ends in the middle of a page so we */
2940 /* need to zero-fill any invalid data at the end */
2941 /* of the last page */
2942 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2943 bp->nb_bufsize - bp->nb_validend);
2944 FSDBG(258, bp->nb_validend,
2945 bp->nb_bufsize - bp->nb_validend, 0, 2);
2946 }
2947 }
2948 break;
2949 case VLNK:
2950 uiop->uio_offset = (off_t)0;
2951 OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios);
2952 error = nfs_readlinkrpc(vp, uiop, cr, p);
2953 if (!error) {
2954 bp->nb_validoff = 0;
2955 bp->nb_validend = uiop->uio_offset;
2956 }
2957 break;
2958 case VDIR:
2959 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
2960 uiop->uio_offset = NBOFF(bp);
2961 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2962 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2963 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2964 error = nfs_readdirplusrpc(vp, uiop, cr, p);
2965 if (error == NFSERR_NOTSUPP)
2966 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2967 }
2968 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2969 error = nfs_readdirrpc(vp, uiop, cr, p);
2970 if (!error) {
2971 bp->nb_validoff = 0;
2972 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2973 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2974 }
2975 break;
2976 default:
2977 printf("nfs_doio: type %x unexpected\n", vtype);
2978 break;
2979 };
2980 if (error) {
2981 SET(bp->nb_flags, NB_ERROR);
2982 bp->nb_error = error;
2983 }
2984
2985 } else {
2986 /* we're doing a write */
2987 int doff, dend = 0;
2988
2989 /* We need to make sure the pages are locked before doing I/O. */
2990 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) {
2991 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2992 error = nfs_buf_upl_setup(bp);
2993 if (error) {
2994 printf("nfs_doio: upl create failed %d\n", error);
2995 SET(bp->nb_flags, NB_ERROR);
2996 bp->nb_error = EIO;
2997 return (EIO);
2998 }
2999 nfs_buf_upl_check(bp);
3000 }
3001 }
3002
3003 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
3004 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
3005 /*
3006 * There are pages marked dirty that need to be written out.
3007 *
3008 * We don't want to just combine the write range with the
3009 * range of pages that are dirty because that could cause us
3010 * to write data that wasn't actually written to.
3011 * We also don't want to write data more than once.
3012 *
3013 * If the dirty range just needs to be committed, we do that.
3014 * Otherwise, we write the dirty range and clear the dirty bits
3015 * for any COMPLETE pages covered by that range.
3016 * If there are dirty pages left after that, we write out the
3017 * parts that we haven't written yet.
3018 */
3019 }
3020
3021 /*
3022 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
3023 * an actual write will have to be done.
3024 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
3025 */
3026 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
3027 doff = NBOFF(bp) + bp->nb_dirtyoff;
3028 SET(bp->nb_flags, NB_WRITEINPROG);
3029 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
3030 bp->nb_wcred, bp->nb_proc);
3031 CLR(bp->nb_flags, NB_WRITEINPROG);
3032 if (!error) {
3033 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3034 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3035 np->n_needcommitcnt--;
3036 CHECK_NEEDCOMMITCNT(np);
3037 } else if (error == NFSERR_STALEWRITEVERF)
3038 nfs_clearcommit(vnode_mount(vp));
3039 }
3040
3041 if (!error && bp->nb_dirtyend > 0) {
3042 /* there's a dirty range that needs to be written out */
3043 u_int32_t pagemask;
3044 int firstpg, lastpg;
3045
3046 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
3047 bp->nb_dirtyend = np->n_size - NBOFF(bp);
3048
3049 NFS_BUF_MAP(bp);
3050
3051 doff = bp->nb_dirtyoff;
3052 dend = bp->nb_dirtyend;
3053
3054 /* if doff page is dirty, move doff to start of page */
3055 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
3056 doff -= doff & PAGE_MASK;
3057 /* try to expand write range to include preceding dirty pages */
3058 if (!(doff & PAGE_MASK))
3059 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
3060 doff -= PAGE_SIZE;
3061 /* if dend page is dirty, move dend to start of next page */
3062 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
3063 dend = round_page_32(dend);
3064 /* try to expand write range to include trailing dirty pages */
3065 if (!(dend & PAGE_MASK))
3066 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
3067 dend += PAGE_SIZE;
3068 /* make sure to keep dend clipped to EOF */
3069 if (NBOFF(bp) + dend > (off_t)np->n_size)
3070 dend = np->n_size - NBOFF(bp);
3071 /* calculate range of complete pages being written */
3072 firstpg = round_page_32(doff) / PAGE_SIZE;
3073 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
3074 /* calculate mask for that page range */
3075 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
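/*
 * For example: with PAGE_SIZE 4096 and the dirty range expanded to
 * doff=4096, dend=12288, firstpg=1 and lastpg=2, so pagemask=0x6 covers
 * the two complete pages being written.  Had doff been 5000 instead,
 * round_page_32(5000)/PAGE_SIZE = 2, so the partially-written page 1
 * would not be counted as complete.
 */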
3076
3077 /* compare page mask to nb_dirty; if there are other dirty pages */
3078 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
3079 /* not needcommit/stable; otherwise write FILESYNC */
3080 if (bp->nb_dirty & ~pagemask)
3081 iomode = NFSV3WRITE_FILESYNC;
3082 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_STABLE)) == NB_ASYNC)
3083 iomode = NFSV3WRITE_UNSTABLE;
3084 else
3085 iomode = NFSV3WRITE_FILESYNC;
3086
3087 /* write the dirty range */
3088 io.iov_len = dend - doff;
3089 uio_uio_resid_set(uiop, io.iov_len);
3090 uiop->uio_offset = NBOFF(bp) + doff;
3091 io.iov_base = (uintptr_t) bp->nb_data + doff;
3092 uiop->uio_rw = UIO_WRITE;
3093
3094 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3095
3096 SET(bp->nb_flags, NB_WRITEINPROG);
3097 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3098 if (must_commit)
3099 nfs_clearcommit(vnode_mount(vp));
3100 /* clear dirty bits for pages we've written */
3101 if (!error)
3102 bp->nb_dirty &= ~pagemask;
3103 /* set/clear needcommit flag */
3104 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
3105 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3106 np->n_needcommitcnt++;
3107 SET(bp->nb_flags, NB_NEEDCOMMIT);
3108 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
3109 bp->nb_dirtyoff = doff;
3110 bp->nb_dirtyend = dend;
3111 } else {
3112 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3113 np->n_needcommitcnt--;
3114 CHECK_NEEDCOMMITCNT(np);
3115 }
3116 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3117 }
3118 CLR(bp->nb_flags, NB_WRITEINPROG);
3119 /*
3120 * For an interrupted write, the buffer is still valid and the write
3121 * hasn't been pushed to the server yet, so we can't set NB_ERROR;
3122 * instead we report the interruption by setting NB_EINTR. For the
3123 * NB_ASYNC case, NB_EINTR is not relevant.
3124 *
3125 * For the case of a V3 write rpc not being committed to stable
3126 * storage, the block is still dirty and requires either a commit rpc
3127 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
3128 * block is reused. This is indicated by setting the NB_DELWRI and
3129 * NB_NEEDCOMMIT flags.
3130 */
3131 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
3132 CLR(bp->nb_flags, NB_INVAL);
3133 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3134 SET(bp->nb_flags, NB_DELWRI);
3135 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
3136 NFSBUFCNTCHK(0);
3137 }
3138 FSDBG(261, bp->nb_validoff, bp->nb_validend,
3139 bp->nb_bufsize, 0);
3140 /*
3141 * Since for the NB_ASYNC case, nfs_bwrite() has
3142 * reassigned the buffer to the clean list, we have to
3143 * reassign it back to the dirty one. Ugh.
3144 */
3145 if (ISSET(bp->nb_flags, NB_ASYNC)) {
3146 /* move to dirty list */
3147 lck_mtx_lock(nfs_buf_mutex);
3148 if (bp->nb_vnbufs.le_next != NFSNOLIST)
3149 LIST_REMOVE(bp, nb_vnbufs);
3150 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3151 lck_mtx_unlock(nfs_buf_mutex);
3152 } else {
3153 SET(bp->nb_flags, NB_EINTR);
3154 }
3155 } else {
3156 /* either there's an error or we don't need to commit */
3157 if (error) {
3158 SET(bp->nb_flags, NB_ERROR);
3159 bp->nb_error = np->n_error = error;
3160 np->n_flag |= NWRITEERR;
3161 /*
3162 * There was a write error and we need to
3163 * invalidate attrs and flush buffers in
3164 * order to sync up with the server.
3165 * (if this write was extending the file,
3166 * we may no longer know the correct size)
3167 *
3168 * But we can't call vinvalbuf while holding
3169 * this buffer busy. Set a flag to do it after
3170 * releasing the buffer.
3171 *
3172 * Note we can only invalidate in this function
3173 * if this is an async write and so the iodone
3174 * below will release the buffer. Also, we
3175 * shouldn't call vinvalbuf from nfsiod because
3176 * that may deadlock waiting for the completion
3177 * of writes that are queued up behind this one.
3178 */
3179 if (ISSET(bp->nb_flags, NB_ASYNC) &&
3180 !ISSET(bp->nb_flags, NB_IOD)) {
3181 invalidate = 1;
3182 } else {
3183 /* invalidate later */
3184 np->n_flag |= NNEEDINVALIDATE;
3185 }
3186 NATTRINVALIDATE(np);
3187 }
3188 /* clear the dirty range */
3189 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3190 }
3191 }
3192
3193 if (!error && bp->nb_dirty) {
3194 /* there are pages marked dirty that need to be written out */
3195 int pg, count, npages, off;
3196
3197 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3198
3199 NFS_BUF_MAP(bp);
3200
3201 /*
3202 * we do these writes synchronously because we can't really
3203 * support the unstable/needcommit method. We could write
3204 * them unstable, clear the dirty bits, and then commit the
3205 * whole block later, but if we need to rewrite the data, we
3206 * won't have any idea which pages were written because that
3207 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
3208 * also can't leave the dirty bits set because then we wouldn't
3209 * be able to tell if the pages were re-dirtied between the end
3210 * of the write and the commit.
3211 */
3212 iomode = NFSV3WRITE_FILESYNC;
3213 uiop->uio_rw = UIO_WRITE;
3214
3215 SET(bp->nb_flags, NB_WRITEINPROG);
3216 npages = bp->nb_bufsize/PAGE_SIZE;
3217 for (pg=0; pg < npages; pg++) {
3218 if (!NBPGDIRTY(bp,pg))
3219 continue;
3220 count = 1;
3221 while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count))
3222 count++;
3223 /* write count pages starting with page pg */
3224 off = pg * PAGE_SIZE;
3225 len = count * PAGE_SIZE;
3226
3227 /* clip writes to EOF */
3228 if (NBOFF(bp) + off + len > (off_t)np->n_size)
3229 len -= (NBOFF(bp) + off + len) - np->n_size;
3230 if (len > 0) {
3231 io.iov_len = len;
3232 uio_uio_resid_set(uiop, io.iov_len);
3233 uiop->uio_offset = NBOFF(bp) + off;
3234 io.iov_base = (uintptr_t) bp->nb_data + off;
3235 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3236 if (must_commit)
3237 nfs_clearcommit(vnode_mount(vp));
3238 if (error)
3239 break;
3240 }
3241 /* clear dirty bits */
3242 while (count--) {
3243 bp->nb_dirty &= ~(1 << pg);
3244 /* leave pg on last page */
3245 if (count) pg++;
3246 }
3247 }
3248 if (!error) {
3249 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3250 np->n_needcommitcnt--;
3251 CHECK_NEEDCOMMITCNT(np);
3252 }
3253 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3254 }
3255 CLR(bp->nb_flags, NB_WRITEINPROG);
3256 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
3257 np->n_size);
3258 }
3259
3260 if (error) {
3261 SET(bp->nb_flags, NB_ERROR);
3262 bp->nb_error = error;
3263 }
3264 }
3265
3266 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
3267
3268 nfs_buf_iodone(bp);
3269
3270 if (invalidate) {
3271 /*
3272 * There was a write error and we need to
3273 * invalidate attrs and flush buffers in
3274 * order to sync up with the server.
3275 * (if this write was extending the file,
3276 * we may no longer know the correct size)
3277 *
3278 * But we couldn't call vinvalbuf while holding
3279 * the buffer busy. So we call vinvalbuf() after
3280 * releasing the buffer.
3281 *
3282 * Note: we don't bother calling nfs_vinvalbuf() if
3283 * there's already a flush in progress.
3284 */
3285 if (!(np->n_flag & NFLUSHINPROG))
3286 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1);
3287 }
3288
3289 return (error);
3290 }
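/*
 * An illustrative sketch of how nfs_doio() widens the dirty byte range
 * [doff..dend) to swallow whole dirty pages before issuing the write RPC
 * (EOF clipping omitted).  The dirty pages are abstracted as a bitmap and
 * the helper is hypothetical; it is not called by the surrounding code.
 */
static void
nfs_expand_dirty_range_sketch(u_int32_t dirtypages, int bufsize, int *doffp, int *dendp)
{
	int doff = *doffp, dend = *dendp;

	/* if the page containing doff is dirty, pull doff back to the page start */
	if ((dirtypages >> (doff / PAGE_SIZE)) & 1)
		doff -= doff & PAGE_MASK;
	/* then keep absorbing preceding dirty pages */
	if (!(doff & PAGE_MASK))
		while ((doff > 0) && ((dirtypages >> ((doff - 1) / PAGE_SIZE)) & 1))
			doff -= PAGE_SIZE;
	/* if the page containing dend is dirty, push dend to the next page boundary */
	if ((dend & PAGE_MASK) && ((dirtypages >> (dend / PAGE_SIZE)) & 1))
		dend = round_page_32(dend);
	/* then keep absorbing trailing dirty pages */
	if (!(dend & PAGE_MASK))
		while ((dend < bufsize) && ((dirtypages >> (dend / PAGE_SIZE)) & 1))
			dend += PAGE_SIZE;

	*doffp = doff;
	*dendp = dend;
}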