1 /*
2 * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
31 /*
32 * Copyright (c) 1989, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * This code is derived from software contributed to Berkeley by
36 * Rick Macklem at The University of Guelph.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
67 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
68 */
69 #include <sys/param.h>
70 #include <sys/systm.h>
71 #include <sys/resourcevar.h>
72 #include <sys/signalvar.h>
73 #include <sys/proc_internal.h>
74 #include <sys/kauth.h>
75 #include <sys/malloc.h>
76 #include <sys/vnode.h>
77 #include <sys/dirent.h>
78 #include <sys/mount_internal.h>
79 #include <sys/kernel.h>
80 #include <sys/sysctl.h>
81 #include <sys/ubc_internal.h>
82 #include <sys/uio_internal.h>
83
84 #include <sys/vm.h>
85 #include <sys/vmparam.h>
86
87 #include <sys/time.h>
88 #include <kern/clock.h>
89 #include <libkern/OSAtomic.h>
90 #include <kern/kalloc.h>
91
92 #include <nfs/rpcv2.h>
93 #include <nfs/nfsproto.h>
94 #include <nfs/nfs.h>
95 #include <nfs/nfsmount.h>
96 #include <nfs/nfsnode.h>
97 #include <sys/buf_internal.h>
98
99 #include <sys/kdebug.h>
100
101 #define FSDBG(A, B, C, D, E) \
102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
103 (int)(B), (int)(C), (int)(D), (int)(E), 0)
104 #define FSDBG_TOP(A, B, C, D, E) \
105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
106 (int)(B), (int)(C), (int)(D), (int)(E), 0)
107 #define FSDBG_BOT(A, B, C, D, E) \
108 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
109 (int)(B), (int)(C), (int)(D), (int)(E), 0)
110
111 extern int nfs_numasync;
112 extern int nfs_ioddelwri;
113 extern struct nfsstats nfsstats;
114
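/* hash an (nfsnode, logical block number) pair to a chain head in nfsbufhashtbl */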
115 #define NFSBUFHASH(np, lbn) \
116 (&nfsbufhashtbl[((long)(np) / sizeof(*(np)) + (int)(lbn)) & nfsbufhash])
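/* global nfsbuf hash table, free/delayed-write lists, and accounting counters (manipulated while holding nfs_buf_mutex) */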
117 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
118 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
119 u_long nfsbufhash;
120 int nfsbufcnt, nfsbufmin, nfsbufmax, nfsbufmetacnt, nfsbufmetamax;
121 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
122 int nfs_nbdwrite;
123 time_t nfsbuffreeuptimestamp;
124
125 lck_grp_t *nfs_buf_lck_grp;
126 lck_grp_attr_t *nfs_buf_lck_grp_attr;
127 lck_attr_t *nfs_buf_lck_attr;
128 lck_mtx_t *nfs_buf_mutex;
129
130 #define NFSBUFWRITE_THROTTLE 9
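/* ages (in seconds) after which an unused buffer on the LRU/meta free list is considered stale */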
131 #define NFSBUF_LRU_STALE 120
132 #define NFSBUF_META_STALE 240
133
134 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
135 #define LRU_TO_FREEUP 6
136 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
137 #define META_TO_FREEUP 3
138 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
139 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
140 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
141 #define LRU_FREEUP_FRAC_ON_TIMER 8
142 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
143 #define META_FREEUP_FRAC_ON_TIMER 16
144 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
145 #define LRU_FREEUP_MIN_FRAC 4
146 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
147 #define META_FREEUP_MIN_FRAC 2
148
149 #define NFS_BUF_FREEUP() \
150 do { \
151 /* only call nfs_buf_freeup() if it has work to do: */ \
152 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
153 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
154 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
155 nfs_buf_freeup(0); \
156 } while (0)
157
158 /*
159 * Initialize nfsbuf lists
160 */
161 void
162 nfs_nbinit(void)
163 {
164 nfs_buf_lck_grp_attr = lck_grp_attr_alloc_init();
165 lck_grp_attr_setstat(nfs_buf_lck_grp_attr);
166 nfs_buf_lck_grp = lck_grp_alloc_init("nfs_buf", nfs_buf_lck_grp_attr);
167
168 nfs_buf_lck_attr = lck_attr_alloc_init();
169
170 nfs_buf_mutex = lck_mtx_alloc_init(nfs_buf_lck_grp, nfs_buf_lck_attr);
171
172 nfsbufcnt = nfsbufmetacnt =
173 nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
174 nfsbufmin = 128;
175 nfsbufmax = (sane_size >> PAGE_SHIFT) / 4;
176 nfsbufmetamax = (sane_size >> PAGE_SHIFT) / 16;
177 nfsneedbuffer = 0;
178 nfs_nbdwrite = 0;
179 nfsbuffreeuptimestamp = 0;
180
181 nfsbufhashtbl = hashinit(nfsbufmax/4, M_TEMP, &nfsbufhash);
182 TAILQ_INIT(&nfsbuffree);
183 TAILQ_INIT(&nfsbuffreemeta);
184 TAILQ_INIT(&nfsbufdelwri);
185
186 }
187
188 /*
189 * try to free up some excess, unused nfsbufs
190 */
191 void
192 nfs_buf_freeup(int timer)
193 {
194 struct nfsbuf *fbp;
195 struct timeval now;
196 int count;
197 struct nfsbuffreehead nfsbuffreeup;
198
199 TAILQ_INIT(&nfsbuffreeup);
200
201 lck_mtx_lock(nfs_buf_mutex);
202
203 microuptime(&now);
204 nfsbuffreeuptimestamp = now.tv_sec;
205
206 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
207
208 count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
209 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
210 fbp = TAILQ_FIRST(&nfsbuffree);
211 if (!fbp)
212 break;
213 if (fbp->nb_refs)
214 break;
215 if (NBUFSTAMPVALID(fbp) &&
216 (fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
217 break;
218 nfs_buf_remfree(fbp);
219 /* disassociate buffer from any vnode */
220 if (fbp->nb_vp) {
221 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
222 LIST_REMOVE(fbp, nb_vnbufs);
223 fbp->nb_vnbufs.le_next = NFSNOLIST;
224 }
225 fbp->nb_vp = NULL;
226 }
227 LIST_REMOVE(fbp, nb_hash);
228 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
229 nfsbufcnt--;
230 }
231
232 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
233 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
234 fbp = TAILQ_FIRST(&nfsbuffreemeta);
235 if (!fbp)
236 break;
237 if (fbp->nb_refs)
238 break;
239 if (NBUFSTAMPVALID(fbp) &&
240 (fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
241 break;
242 nfs_buf_remfree(fbp);
243 /* disassociate buffer from any vnode */
244 if (fbp->nb_vp) {
245 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
246 LIST_REMOVE(fbp, nb_vnbufs);
247 fbp->nb_vnbufs.le_next = NFSNOLIST;
248 }
249 fbp->nb_vp = NULL;
250 }
251 LIST_REMOVE(fbp, nb_hash);
252 TAILQ_INSERT_TAIL(&nfsbuffreeup, fbp, nb_free);
253 nfsbufcnt--;
254 nfsbufmetacnt--;
255 }
256
257 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, 0);
258 NFSBUFCNTCHK(1);
259
260 lck_mtx_unlock(nfs_buf_mutex);
261
262 while ((fbp = TAILQ_FIRST(&nfsbuffreeup))) {
263 TAILQ_REMOVE(&nfsbuffreeup, fbp, nb_free);
264 /* nuke any creds */
265 if (fbp->nb_rcred != NOCRED) {
266 kauth_cred_rele(fbp->nb_rcred);
267 fbp->nb_rcred = NOCRED;
268 }
269 if (fbp->nb_wcred != NOCRED) {
270 kauth_cred_rele(fbp->nb_wcred);
271 fbp->nb_wcred = NOCRED;
272 }
273 /* if buf was NB_META, dump buffer */
274 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data)
275 kfree(fbp->nb_data, fbp->nb_bufsize);
276 FREE(fbp, M_TEMP);
277 }
278
279 }
280
281 /*
282 * remove a buffer from the freelist
283 * (must be called with nfs_buf_mutex held)
284 */
285 void
286 nfs_buf_remfree(struct nfsbuf *bp)
287 {
288 if (bp->nb_free.tqe_next == NFSNOLIST)
289 panic("nfsbuf not on free list");
290 if (ISSET(bp->nb_flags, NB_DELWRI)) {
291 nfsbufdelwricnt--;
292 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
293 } else if (ISSET(bp->nb_flags, NB_META)) {
294 nfsbuffreemetacnt--;
295 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
296 } else {
297 nfsbuffreecnt--;
298 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
299 }
300 bp->nb_free.tqe_next = NFSNOLIST;
301 NFSBUFCNTCHK(1);
302 }
303
304 /*
305 * check for existence of nfsbuf in cache
306 */
307 boolean_t
308 nfs_buf_is_incore(vnode_t vp, daddr64_t blkno)
309 {
310 boolean_t rv;
311 lck_mtx_lock(nfs_buf_mutex);
312 if (nfs_buf_incore(vp, blkno))
313 rv = TRUE;
314 else
315 rv = FALSE;
316 lck_mtx_unlock(nfs_buf_mutex);
317 return (rv);
318 }
319
320 /*
321 * return incore buffer (must be called with nfs_buf_mutex held)
322 */
323 struct nfsbuf *
324 nfs_buf_incore(vnode_t vp, daddr64_t blkno)
325 {
326 /* Search hash chain */
327 struct nfsbuf * bp = NFSBUFHASH(VTONFS(vp), blkno)->lh_first;
328 for (; bp != NULL; bp = bp->nb_hash.le_next)
329 if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
330 if (!ISSET(bp->nb_flags, NB_INVAL)) {
331 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
332 return (bp);
333 }
334 }
335 return (NULL);
336 }
337
338 /*
339 * Check if it's OK to drop a page.
340 *
341 * Called by vnode_pager() on pageout request of non-dirty page.
342 * We need to make sure that it's not part of a delayed write.
343 * If it is, we can't let the VM drop it because we may need it
344 * later when/if we need to write the data (again).
345 */
346 int
347 nfs_buf_page_inval(vnode_t vp, off_t offset)
348 {
349 struct nfsbuf *bp;
350 int error = 0;
351
352 lck_mtx_lock(nfs_buf_mutex);
353 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
354 if (!bp)
355 goto out;
356 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
357 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
358 error = EBUSY;
359 goto out;
360 }
361 /*
362 * If there's a dirty range in the buffer, check to
363 * see if this page intersects with the dirty range.
364 * If it does, we can't let the pager drop the page.
365 */
366 if (bp->nb_dirtyend > 0) {
367 int start = offset - NBOFF(bp);
368 if (bp->nb_dirtyend <= start ||
369 bp->nb_dirtyoff >= (start + PAGE_SIZE))
370 error = 0;
371 else
372 error = EBUSY;
373 }
374 out:
375 lck_mtx_unlock(nfs_buf_mutex);
376 return (error);
377 }
378
379 /*
380 * set up the UPL for a buffer
381 * (must NOT be called with nfs_buf_mutex held)
382 */
383 int
384 nfs_buf_upl_setup(struct nfsbuf *bp)
385 {
386 kern_return_t kret;
387 upl_t upl;
388 int upl_flags;
389
390 if (ISSET(bp->nb_flags, NB_PAGELIST))
391 return (0);
392
393 upl_flags = UPL_PRECIOUS;
394 if (! ISSET(bp->nb_flags, NB_READ)) {
395 /*
396 * We're doing a "write", so we intend to modify
397 * the pages we're gathering.
398 */
399 upl_flags |= UPL_WILL_MODIFY;
400 }
401 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
402 &upl, NULL, upl_flags);
403 if (kret == KERN_INVALID_ARGUMENT) {
404 /* vm object probably doesn't exist any more */
405 bp->nb_pagelist = NULL;
406 return (EINVAL);
407 }
408 if (kret != KERN_SUCCESS) {
409 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
410 bp->nb_pagelist = NULL;
411 return (EIO);
412 }
413
414 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
415
416 bp->nb_pagelist = upl;
417 SET(bp->nb_flags, NB_PAGELIST);
418 return (0);
419 }
420
421 /*
422 * update buffer's valid/dirty info from UBC
423 * (must NOT be called with nfs_buf_mutex held)
424 */
425 void
426 nfs_buf_upl_check(struct nfsbuf *bp)
427 {
428 upl_page_info_t *pl;
429 off_t filesize, fileoffset;
430 int i, npages;
431
432 if (!ISSET(bp->nb_flags, NB_PAGELIST))
433 return;
434
435 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
436 filesize = ubc_getsize(bp->nb_vp);
437 fileoffset = NBOFF(bp);
438 if (fileoffset < filesize)
439 SET(bp->nb_flags, NB_CACHE);
440 else
441 CLR(bp->nb_flags, NB_CACHE);
442
443 pl = ubc_upl_pageinfo(bp->nb_pagelist);
444 bp->nb_valid = bp->nb_dirty = 0;
445
446 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
447 /* anything beyond the end of the file is not valid or dirty */
448 if (fileoffset >= filesize)
449 break;
450 if (!upl_valid_page(pl, i)) {
451 CLR(bp->nb_flags, NB_CACHE);
452 continue;
453 }
454 NBPGVALID_SET(bp,i);
455 if (upl_dirty_page(pl, i)) {
456 NBPGDIRTY_SET(bp, i);
457 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
458 SET(bp->nb_flags, NB_WASDIRTY);
459 }
460 }
461 fileoffset = NBOFF(bp);
462 if (ISSET(bp->nb_flags, NB_CACHE)) {
463 bp->nb_validoff = 0;
464 bp->nb_validend = bp->nb_bufsize;
465 if (fileoffset + bp->nb_validend > filesize)
466 bp->nb_validend = filesize - fileoffset;
467 } else {
468 bp->nb_validoff = bp->nb_validend = -1;
469 }
470 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
471 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
472 }
473
474 /*
475 * make sure that a buffer is mapped
476 * (must NOT be called with nfs_buf_mutex held)
477 */
478 static int
479 nfs_buf_map(struct nfsbuf *bp)
480 {
481 kern_return_t kret;
482
483 if (bp->nb_data)
484 return (0);
485 if (!ISSET(bp->nb_flags, NB_PAGELIST))
486 return (EINVAL);
487
488 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
489 if (kret != KERN_SUCCESS)
490 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
491 if (bp->nb_data == 0)
492 panic("ubc_upl_map mapped 0");
493 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
494 return (0);
495 }
496
497 /*
498 * check range of pages in nfsbuf's UPL for validity
499 */
500 static int
501 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
502 {
503 off_t fileoffset, filesize;
504 int pg, lastpg;
505 upl_page_info_t *pl;
506
507 if (!ISSET(bp->nb_flags, NB_PAGELIST))
508 return (0);
509 pl = ubc_upl_pageinfo(bp->nb_pagelist);
510
511 size += off & PAGE_MASK;
512 off &= ~PAGE_MASK;
513 fileoffset = NBOFF(bp);
514 filesize = VTONFS(bp->nb_vp)->n_size;
515 if ((fileoffset + off + size) > filesize)
516 size = filesize - (fileoffset + off);
517
518 pg = off/PAGE_SIZE;
519 lastpg = (off + size - 1)/PAGE_SIZE;
520 while (pg <= lastpg) {
521 if (!upl_valid_page(pl, pg))
522 return (0);
523 pg++;
524 }
525 return (1);
526 }
527
528 /*
529 * normalize an nfsbuf's valid range
530 *
531 * the read/write code guarantees that we'll always have a valid
532 * region that is an integral number of pages. If either end
533 * of the valid range isn't page-aligned, it gets corrected
534 * here as we extend the valid range through all of the
535 * contiguous valid pages.
536 */
537 static void
538 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
539 {
540 int pg, npg;
541 /* pull validoff back to start of contiguous valid page range */
542 pg = bp->nb_validoff/PAGE_SIZE;
543 while (pg >= 0 && NBPGVALID(bp,pg))
544 pg--;
545 bp->nb_validoff = (pg+1) * PAGE_SIZE;
546 /* push validend forward to end of contiguous valid page range */
547 npg = bp->nb_bufsize/PAGE_SIZE;
548 pg = bp->nb_validend/PAGE_SIZE;
549 while (pg < npg && NBPGVALID(bp,pg))
550 pg++;
551 bp->nb_validend = pg * PAGE_SIZE;
552 /* clip to EOF */
553 if (NBOFF(bp) + bp->nb_validend > (off_t)np->n_size)
554 bp->nb_validend = np->n_size % bp->nb_bufsize;
555 }
556
557 /*
558 * try to push out some delayed/uncommitted writes
559 * ("locked" indicates whether nfs_buf_mutex is already held)
560 */
561 static void
562 nfs_buf_delwri_push(int locked)
563 {
564 struct nfsbuf *bp;
565 int i, error;
566
567 if (TAILQ_EMPTY(&nfsbufdelwri))
568 return;
569
570 /* first try to tell the nfsiods to do it */
571 if (nfs_asyncio(NULL, NULL) == 0)
572 return;
573
574 /* otherwise, try to do some of the work ourselves */
575 i = 0;
576 if (!locked)
577 lck_mtx_lock(nfs_buf_mutex);
578 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
579 struct nfsnode *np = VTONFS(bp->nb_vp);
580 nfs_buf_remfree(bp);
581 nfs_buf_refget(bp);
582 while ((error = nfs_buf_acquire(bp, 0, 0, 0)) == EAGAIN);
583 nfs_buf_refrele(bp);
584 if (error)
585 break;
586 if (!bp->nb_vp) {
587 /* buffer is no longer valid */
588 nfs_buf_drop(bp);
589 continue;
590 }
591 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
592 /* put buffer at end of delwri list */
593 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
594 nfsbufdelwricnt++;
595 nfs_buf_drop(bp);
596 lck_mtx_unlock(nfs_buf_mutex);
597 nfs_flushcommits(np->n_vnode, NULL, 1);
598 } else {
599 SET(bp->nb_flags, NB_ASYNC);
600 lck_mtx_unlock(nfs_buf_mutex);
601 nfs_buf_write(bp);
602 }
603 i++;
604 lck_mtx_lock(nfs_buf_mutex);
605 }
606 if (!locked)
607 lck_mtx_unlock(nfs_buf_mutex);
608 }
609
610 /*
611 * Get an nfs buffer.
612 *
613 * Returns errno on error, 0 otherwise.
614  * The buffer, if any, is returned in *bpp.
615 *
616 * If NBLK_ONLYVALID is set, only return buffer if found in cache.
617 * If NBLK_NOWAIT is set, don't wait for the buffer if it's marked BUSY.
618 *
619 * Check for existence of buffer in cache.
620 * Or attempt to reuse a buffer from one of the free lists.
621 * Or allocate a new buffer if we haven't already hit max allocation.
622 * Or wait for a free buffer.
623 *
624 * If available buffer found, prepare it, and return it.
625 *
626 * If the calling process is interrupted by a signal for
627 * an interruptible mount point, return EINTR.
628 */
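/*
 * Typical caller pattern (see nfs_bioread below):
 *
 *	error = nfs_buf_get(vp, lbn, biosize, p, NBLK_READ, &bp);
 *	if (error)
 *		return (error);
 *	... read or fill the buffer's data (e.g. via nfs_doio()) ...
 *	nfs_buf_release(bp, 1);
 */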
629 int
630 nfs_buf_get(
631 vnode_t vp,
632 daddr64_t blkno,
633 int size,
634 proc_t p,
635 int flags,
636 struct nfsbuf **bpp)
637 {
638 struct nfsnode *np = VTONFS(vp);
639 struct nfsbuf *bp;
640 int biosize, bufsize;
641 kauth_cred_t cred;
642 int slpflag = PCATCH;
643 int operation = (flags & NBLK_OPMASK);
644 int error = 0;
645 struct timespec ts;
646
647 FSDBG_TOP(541, vp, blkno, size, flags);
648 *bpp = NULL;
649
650 bufsize = size;
651 if (bufsize > MAXBSIZE)
652 panic("nfs_buf_get: buffer larger than MAXBSIZE requested");
653
654 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
655
656 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp)) {
657 operation = NBLK_META;
658 } else if (bufsize < biosize) {
659 /* reg files should always have biosize blocks */
660 bufsize = biosize;
661 }
662
663 /* if NBLK_WRITE, check for too many delayed/uncommitted writes */
664 if ((operation == NBLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
665 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
666
667 /* poke the delwri list */
668 nfs_buf_delwri_push(0);
669
670 /* sleep to let other threads run... */
671 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
672 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
673 }
674
675 loop:
676 lck_mtx_lock(nfs_buf_mutex);
677
678 /* check for existence of nfsbuf in cache */
679 if ((bp = nfs_buf_incore(vp, blkno))) {
680 /* if busy, set wanted and wait */
681 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
682 if (flags & NBLK_NOWAIT) {
683 lck_mtx_unlock(nfs_buf_mutex);
684 FSDBG_BOT(541, vp, blkno, bp, 0xbcbcbcbc);
685 return (0);
686 }
687 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
688 SET(bp->nb_lflags, NBL_WANTED);
689
690 ts.tv_sec = 2;
691 ts.tv_nsec = 0;
692 msleep(bp, nfs_buf_mutex, slpflag|(PRIBIO+1)|PDROP,
693 "nfsbufget", (slpflag == PCATCH) ? 0 : &ts);
694 slpflag = 0;
695 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
696 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
697 FSDBG_BOT(541, vp, blkno, 0, error);
698 return (error);
699 }
700 goto loop;
701 }
702 if (bp->nb_bufsize != bufsize)
703 panic("nfsbuf size mismatch");
704 SET(bp->nb_lflags, NBL_BUSY);
705 SET(bp->nb_flags, NB_CACHE);
706 nfs_buf_remfree(bp);
707 /* additional paranoia: */
708 if (ISSET(bp->nb_flags, NB_PAGELIST))
709 panic("pagelist buffer was not busy");
710 goto buffer_setup;
711 }
712
713 if (flags & NBLK_ONLYVALID) {
714 lck_mtx_unlock(nfs_buf_mutex);
715 FSDBG_BOT(541, vp, blkno, 0, 0x0000cace);
716 return (0);
717 }
718
719 /*
720 * where to get a free buffer:
721 * - if meta and maxmeta reached, must reuse meta
722 * - alloc new if we haven't reached min bufs
723 * - if free lists are NOT empty
724 * - if free list is stale, use it
725 * - else if freemeta list is stale, use it
726 * - else if max bufs allocated, use least-time-to-stale
727 * - alloc new if we haven't reached max allowed
728 * - start clearing out delwri list and try again
729 */
730
731 if ((operation == NBLK_META) && (nfsbufmetacnt >= nfsbufmetamax)) {
732 /* if we've hit max meta buffers, must reuse a meta buffer */
733 bp = TAILQ_FIRST(&nfsbuffreemeta);
734 } else if ((nfsbufcnt > nfsbufmin) &&
735 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
736 /* try to pull an nfsbuf off a free list */
737 struct nfsbuf *lrubp, *metabp;
738 struct timeval now;
739 microuptime(&now);
740
741 /* if the next LRU or META buffer is invalid or stale, use it */
742 lrubp = TAILQ_FIRST(&nfsbuffree);
743 if (lrubp && (!NBUFSTAMPVALID(lrubp) ||
744 ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec)))
745 bp = lrubp;
746 metabp = TAILQ_FIRST(&nfsbuffreemeta);
747 if (!bp && metabp && (!NBUFSTAMPVALID(metabp) ||
748 ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec)))
749 bp = metabp;
750
751 if (!bp && (nfsbufcnt >= nfsbufmax)) {
752 /* we've already allocated all bufs, so */
753 /* choose the buffer that'll go stale first */
754 if (!metabp)
755 bp = lrubp;
756 else if (!lrubp)
757 bp = metabp;
758 else {
759 int32_t lru_stale_time, meta_stale_time;
760 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
761 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
762 if (lru_stale_time <= meta_stale_time)
763 bp = lrubp;
764 else
765 bp = metabp;
766 }
767 }
768 }
769
770 if (bp) {
771 /* we have a buffer to reuse */
772 FSDBG(544, vp, blkno, bp, bp->nb_flags);
773 nfs_buf_remfree(bp);
774 if (ISSET(bp->nb_flags, NB_DELWRI))
775 panic("nfs_buf_get: delwri");
776 SET(bp->nb_lflags, NBL_BUSY);
777 /* disassociate buffer from previous vnode */
778 if (bp->nb_vp) {
779 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
780 LIST_REMOVE(bp, nb_vnbufs);
781 bp->nb_vnbufs.le_next = NFSNOLIST;
782 }
783 bp->nb_vp = NULL;
784 }
785 LIST_REMOVE(bp, nb_hash);
786 /* nuke any creds we're holding */
787 cred = bp->nb_rcred;
788 if (cred != NOCRED) {
789 bp->nb_rcred = NOCRED;
790 kauth_cred_rele(cred);
791 }
792 cred = bp->nb_wcred;
793 if (cred != NOCRED) {
794 bp->nb_wcred = NOCRED;
795 kauth_cred_rele(cred);
796 }
797 /* if buf will no longer be NB_META, dump old buffer */
798 if (operation == NBLK_META) {
799 if (!ISSET(bp->nb_flags, NB_META))
800 nfsbufmetacnt++;
801 } else if (ISSET(bp->nb_flags, NB_META)) {
802 if (bp->nb_data) {
803 kfree(bp->nb_data, bp->nb_bufsize);
804 bp->nb_data = NULL;
805 }
806 nfsbufmetacnt--;
807 }
808 /* re-init buf fields */
809 bp->nb_error = 0;
810 bp->nb_validoff = bp->nb_validend = -1;
811 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
812 bp->nb_valid = 0;
813 bp->nb_dirty = 0;
814 } else {
815 /* no buffer to reuse */
816 if ((nfsbufcnt < nfsbufmax) &&
817 ((operation != NBLK_META) || (nfsbufmetacnt < nfsbufmetamax))) {
818 /* just alloc a new one */
819 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
820 if (!bp) {
821 lck_mtx_unlock(nfs_buf_mutex);
822 FSDBG_BOT(541, vp, blkno, 0, error);
823 return (ENOMEM);
824 }
825 nfsbufcnt++;
826 if (operation == NBLK_META)
827 nfsbufmetacnt++;
828 NFSBUFCNTCHK(1);
829 /* init nfsbuf */
830 bzero(bp, sizeof(*bp));
831 bp->nb_free.tqe_next = NFSNOLIST;
832 bp->nb_validoff = bp->nb_validend = -1;
833 FSDBG(545, vp, blkno, bp, 0);
834 } else {
835 /* too many bufs... wait for buffers to free up */
836 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
837
838 /* poke the delwri list */
839 nfs_buf_delwri_push(1);
840
841 nfsneedbuffer = 1;
842 msleep(&nfsneedbuffer, nfs_buf_mutex, PCATCH|PDROP,
843 "nfsbufget", 0);
844 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
845 if ((error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
846 FSDBG_BOT(541, vp, blkno, 0, error);
847 return (error);
848 }
849 goto loop;
850 }
851 }
852
853 /* setup nfsbuf */
854 bp->nb_lflags = NBL_BUSY;
855 bp->nb_flags = 0;
856 bp->nb_lblkno = blkno;
857 /* insert buf in hash */
858 LIST_INSERT_HEAD(NFSBUFHASH(np, blkno), bp, nb_hash);
859 /* associate buffer with new vnode */
860 bp->nb_vp = vp;
861 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
862
863 buffer_setup:
864
865 /* unlock hash */
866 lck_mtx_unlock(nfs_buf_mutex);
867
868 switch (operation) {
869 case NBLK_META:
870 SET(bp->nb_flags, NB_META);
871 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
872 kfree(bp->nb_data, bp->nb_bufsize);
873 bp->nb_data = NULL;
874 bp->nb_validoff = bp->nb_validend = -1;
875 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
876 bp->nb_valid = 0;
877 bp->nb_dirty = 0;
878 CLR(bp->nb_flags, NB_CACHE);
879 }
880 if (!bp->nb_data)
881 bp->nb_data = kalloc(bufsize);
882 if (!bp->nb_data) {
883 /* Ack! couldn't allocate the data buffer! */
884 /* cleanup buffer and return error */
885 lck_mtx_lock(nfs_buf_mutex);
886 LIST_REMOVE(bp, nb_vnbufs);
887 bp->nb_vnbufs.le_next = NFSNOLIST;
888 bp->nb_vp = NULL;
889 /* invalidate usage timestamp to allow immediate freeing */
890 NBUFSTAMPINVALIDATE(bp);
891 if (bp->nb_free.tqe_next != NFSNOLIST)
892 panic("nfsbuf on freelist");
893 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
894 nfsbuffreecnt++;
895 lck_mtx_unlock(nfs_buf_mutex);
896 FSDBG_BOT(541, vp, blkno, 0xb00, ENOMEM);
897 return (ENOMEM);
898 }
899 bp->nb_bufsize = bufsize;
900 break;
901
902 case NBLK_READ:
903 case NBLK_WRITE:
904 /*
905 * Set or clear NB_READ now to let the UPL subsystem know
906 * if we intend to modify the pages or not.
907 */
908 if (operation == NBLK_READ) {
909 SET(bp->nb_flags, NB_READ);
910 } else {
911 CLR(bp->nb_flags, NB_READ);
912 }
913 if (bufsize < PAGE_SIZE)
914 bufsize = PAGE_SIZE;
915 bp->nb_bufsize = bufsize;
916 bp->nb_validoff = bp->nb_validend = -1;
917
918 if (UBCINFOEXISTS(vp)) {
919 /* setup upl */
920 if (nfs_buf_upl_setup(bp)) {
921 /* unable to create upl */
922 /* vm object must no longer exist */
923 /* cleanup buffer and return error */
924 lck_mtx_lock(nfs_buf_mutex);
925 LIST_REMOVE(bp, nb_vnbufs);
926 bp->nb_vnbufs.le_next = NFSNOLIST;
927 bp->nb_vp = NULL;
928 /* invalidate usage timestamp to allow immediate freeing */
929 NBUFSTAMPINVALIDATE(bp);
930 if (bp->nb_free.tqe_next != NFSNOLIST)
931 panic("nfsbuf on freelist");
932 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
933 nfsbuffreecnt++;
934 lck_mtx_unlock(nfs_buf_mutex);
935 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
936 return (EIO);
937 }
938 nfs_buf_upl_check(bp);
939 }
940 break;
941
942 default:
943 panic("nfs_buf_get: %d unknown operation", operation);
944 }
945
946 *bpp = bp;
947
948 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
949
950 return (0);
951 }
952
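/*
 * Release a buffer: commit or abort its UPL pages as appropriate,
 * then return it to the proper free or delayed-write list and wake
 * up anyone waiting for a buffer. If "freeup" is set, excess free
 * buffers may be reclaimed via NFS_BUF_FREEUP().
 */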
953 void
954 nfs_buf_release(struct nfsbuf *bp, int freeup)
955 {
956 vnode_t vp = bp->nb_vp;
957 struct timeval now;
958 int wakeup_needbuffer, wakeup_buffer, wakeup_nbdwrite;
959
960 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
961 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
962 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
963
964 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
965 int upl_flags;
966 upl_t upl;
967 int i, rv;
968
969 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
970 rv = nfs_buf_upl_setup(bp);
971 if (rv)
972 printf("nfs_buf_release: upl create failed %d\n", rv);
973 else
974 nfs_buf_upl_check(bp);
975 }
976 upl = bp->nb_pagelist;
977 if (!upl)
978 goto pagelist_cleanup_done;
979 if (bp->nb_data) {
980 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
981 panic("ubc_upl_unmap failed");
982 bp->nb_data = NULL;
983 }
984 if (bp->nb_flags & (NB_ERROR | NB_INVAL | NB_NOCACHE)) {
985 if (bp->nb_flags & (NB_READ | NB_INVAL | NB_NOCACHE))
986 upl_flags = UPL_ABORT_DUMP_PAGES;
987 else
988 upl_flags = 0;
989 ubc_upl_abort(upl, upl_flags);
990 goto pagelist_cleanup_done;
991 }
992 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
993 if (!NBPGVALID(bp,i))
994 ubc_upl_abort_range(upl,
995 i*PAGE_SIZE, PAGE_SIZE,
996 UPL_ABORT_DUMP_PAGES |
997 UPL_ABORT_FREE_ON_EMPTY);
998 else {
999 if (NBPGDIRTY(bp,i))
1000 upl_flags = UPL_COMMIT_SET_DIRTY;
1001 else
1002 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
1003 ubc_upl_commit_range(upl,
1004 i*PAGE_SIZE, PAGE_SIZE,
1005 upl_flags |
1006 UPL_COMMIT_INACTIVATE |
1007 UPL_COMMIT_FREE_ON_EMPTY);
1008 }
1009 }
1010 pagelist_cleanup_done:
1011 /* was this the last buffer in the file? */
1012 if (NBOFF(bp) + bp->nb_bufsize > (off_t)(VTONFS(vp)->n_size)) {
1013 /* if so, invalidate all pages of last buffer past EOF */
1014 int biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
1015 off_t start, end;
1016 start = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
1017 end = trunc_page_64(NBOFF(bp) + biosize);
1018 if (end > start) {
1019 if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE)))
1020 printf("nfs_buf_release(): ubc_sync_range failed!\n");
1021 }
1022 }
1023 CLR(bp->nb_flags, NB_PAGELIST);
1024 bp->nb_pagelist = NULL;
1025 }
1026
1027 lck_mtx_lock(nfs_buf_mutex);
1028
1029 wakeup_needbuffer = wakeup_buffer = wakeup_nbdwrite = 0;
1030
1031 /* Wake up any processes waiting for any buffer to become free. */
1032 if (nfsneedbuffer) {
1033 nfsneedbuffer = 0;
1034 wakeup_needbuffer = 1;
1035 }
1036 /* Wake up any processes waiting for _this_ buffer to become free. */
1037 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1038 CLR(bp->nb_lflags, NBL_WANTED);
1039 wakeup_buffer = 1;
1040 }
1041
1042 /* If it's not cacheable, or an error, mark it invalid. */
1043 if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
1044 SET(bp->nb_flags, NB_INVAL);
1045
1046 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
1047 /* If it's invalid or empty, dissociate it from its vnode */
1048 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
1049 LIST_REMOVE(bp, nb_vnbufs);
1050 bp->nb_vnbufs.le_next = NFSNOLIST;
1051 }
1052 bp->nb_vp = NULL;
1053 /* if this was a delayed write, wakeup anyone */
1054 /* waiting for delayed writes to complete */
1055 if (ISSET(bp->nb_flags, NB_DELWRI)) {
1056 CLR(bp->nb_flags, NB_DELWRI);
1057 OSAddAtomic(-1, (SInt32*)&nfs_nbdwrite);
1058 NFSBUFCNTCHK(1);
1059 wakeup_nbdwrite = 1;
1060 }
1061 /* invalidate usage timestamp to allow immediate freeing */
1062 NBUFSTAMPINVALIDATE(bp);
1063 /* put buffer at head of free list */
1064 if (bp->nb_free.tqe_next != NFSNOLIST)
1065 panic("nfsbuf on freelist");
1066 SET(bp->nb_flags, NB_INVAL);
1067 if (ISSET(bp->nb_flags, NB_META)) {
1068 TAILQ_INSERT_HEAD(&nfsbuffreemeta, bp, nb_free);
1069 nfsbuffreemetacnt++;
1070 } else {
1071 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
1072 nfsbuffreecnt++;
1073 }
1074 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
1075 /* put buffer at end of delwri list */
1076 if (bp->nb_free.tqe_next != NFSNOLIST)
1077 panic("nfsbuf on freelist");
1078 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
1079 nfsbufdelwricnt++;
1080 freeup = 0;
1081 } else {
1082 /* update usage timestamp */
1083 microuptime(&now);
1084 bp->nb_timestamp = now.tv_sec;
1085 /* put buffer at end of free list */
1086 if (bp->nb_free.tqe_next != NFSNOLIST)
1087 panic("nfsbuf on freelist");
1088 if (ISSET(bp->nb_flags, NB_META)) {
1089 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
1090 nfsbuffreemetacnt++;
1091 } else {
1092 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
1093 nfsbuffreecnt++;
1094 }
1095 }
1096
1097 NFSBUFCNTCHK(1);
1098
1099 /* Unlock the buffer. */
1100 CLR(bp->nb_flags, (NB_ASYNC | NB_NOCACHE | NB_STABLE | NB_IOD));
1101 CLR(bp->nb_lflags, NBL_BUSY);
1102
1103 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
1104
1105 lck_mtx_unlock(nfs_buf_mutex);
1106
1107 if (wakeup_needbuffer)
1108 wakeup(&nfsneedbuffer);
1109 if (wakeup_buffer)
1110 wakeup(bp);
1111 if (wakeup_nbdwrite)
1112 wakeup(&nfs_nbdwrite);
1113 if (freeup)
1114 NFS_BUF_FREEUP();
1115 }
1116
1117 /*
1118 * Wait for operations on the buffer to complete.
1119 * When they do, extract and return the I/O's error value.
1120 */
1121 int
1122 nfs_buf_iowait(struct nfsbuf *bp)
1123 {
1124 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1125
1126 lck_mtx_lock(nfs_buf_mutex);
1127
1128 while (!ISSET(bp->nb_flags, NB_DONE))
1129 msleep(bp, nfs_buf_mutex, PRIBIO + 1, "nfs_buf_iowait", 0);
1130
1131 lck_mtx_unlock(nfs_buf_mutex);
1132
1133 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1134
1135 /* check for interruption of I/O, then errors. */
1136 if (ISSET(bp->nb_flags, NB_EINTR)) {
1137 CLR(bp->nb_flags, NB_EINTR);
1138 return (EINTR);
1139 } else if (ISSET(bp->nb_flags, NB_ERROR))
1140 return (bp->nb_error ? bp->nb_error : EIO);
1141 return (0);
1142 }
1143
1144 /*
1145 * Mark I/O complete on a buffer.
1146 */
1147 void
1148 nfs_buf_iodone(struct nfsbuf *bp)
1149 {
1150
1151 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1152
1153 if (ISSET(bp->nb_flags, NB_DONE))
1154 panic("nfs_buf_iodone already");
1155 /*
1156 * I/O was done, so don't believe
1157 * the DIRTY state from VM anymore
1158 */
1159 CLR(bp->nb_flags, NB_WASDIRTY);
1160
1161 if (!ISSET(bp->nb_flags, NB_READ)) {
1162 CLR(bp->nb_flags, NB_WRITEINPROG);
1163 /*
1164 * vnode_writedone() takes care of waking up
1165 * any throttled write operations
1166 */
1167 vnode_writedone(bp->nb_vp);
1168 }
1169 if (ISSET(bp->nb_flags, NB_ASYNC)) { /* if async, release it */
1170 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1171 nfs_buf_release(bp, 1);
1172 } else { /* or just wakeup the buffer */
1173 lck_mtx_lock(nfs_buf_mutex);
1174 SET(bp->nb_flags, NB_DONE); /* note that it's done */
1175 CLR(bp->nb_lflags, NBL_WANTED);
1176 lck_mtx_unlock(nfs_buf_mutex);
1177 wakeup(bp);
1178 }
1179
1180 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1181 }
1182
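/*
 * Perform a delayed write: mark the buffer dirty, account for the
 * write, move it to the vnode's dirty list, and release it without
 * issuing the write RPC now. If too many delayed writes are already
 * outstanding, fall back to issuing an async write instead.
 */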
1183 void
1184 nfs_buf_write_delayed(struct nfsbuf *bp, proc_t p)
1185 {
1186 vnode_t vp = bp->nb_vp;
1187
1188 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1189 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1190
1191 /*
1192 * If the block hasn't been seen before:
1193 * (1) Mark it as having been seen,
1194 * (2) Charge for the write.
1195 * (3) Make sure it's on its vnode's correct block list,
1196 */
1197 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1198 SET(bp->nb_flags, NB_DELWRI);
1199 if (p && p->p_stats)
1200 p->p_stats->p_ru.ru_oublock++; /* XXX */
1201 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
1202 NFSBUFCNTCHK(0);
1203 /* move to dirty list */
1204 lck_mtx_lock(nfs_buf_mutex);
1205 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1206 LIST_REMOVE(bp, nb_vnbufs);
1207 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
1208 lck_mtx_unlock(nfs_buf_mutex);
1209 }
1210
1211 /*
1212  * If the vnode has "too many" write operations in progress,
1213  * wait for them to finish the I/O
1214 */
1215 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "nfs_buf_write_delayed");
1216
1217 /*
1218 * If we have too many delayed write buffers,
1219 * more than we can "safely" handle, just fall back to
1220 * doing the async write
1221 */
1222 if (nfs_nbdwrite < 0)
1223 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1224
1225 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
1226 /* issue async write */
1227 SET(bp->nb_flags, NB_ASYNC);
1228 nfs_buf_write(bp);
1229 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1230 return;
1231 }
1232
1233 /* Otherwise, the "write" is done, so mark and release the buffer. */
1234 SET(bp->nb_flags, NB_DONE);
1235 nfs_buf_release(bp, 1);
1236 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1237 return;
1238 }
1239
1240 /*
1241 * add a reference to a buffer so it doesn't disappear while being used
1242 * (must be called with nfs_buf_mutex held)
1243 */
1244 void
1245 nfs_buf_refget(struct nfsbuf *bp)
1246 {
1247 bp->nb_refs++;
1248 }
1249 /*
1250 * release a reference on a buffer
1251 * (must be called with nfs_buf_mutex held)
1252 */
1253 void
1254 nfs_buf_refrele(struct nfsbuf *bp)
1255 {
1256 bp->nb_refs--;
1257 }
1258
1259 /*
1260 * mark a particular buffer as BUSY
1261 * (must be called with nfs_buf_mutex held)
1262 */
1263 errno_t
1264 nfs_buf_acquire(struct nfsbuf *bp, int flags, int slpflag, int slptimeo)
1265 {
1266 errno_t error;
1267 struct timespec ts;
1268
1269 if (ISSET(bp->nb_lflags, NBL_BUSY)) {
1270 /*
1271 * since the mutex_lock may block, the buffer
1272 * may become BUSY, so we need to recheck for
1273 * a NOWAIT request
1274 */
1275 if (flags & NBAC_NOWAIT)
1276 return (EBUSY);
1277 SET(bp->nb_lflags, NBL_WANTED);
1278
1279 ts.tv_sec = (slptimeo/100);
1280 	/* the hz value is 100, so each tick is 10ms */
1281 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
1282
1283 error = msleep(bp, nfs_buf_mutex, slpflag | (PRIBIO + 1),
1284 "nfs_buf_acquire", &ts);
1285 if (error)
1286 return (error);
1287 return (EAGAIN);
1288 }
1289 if (flags & NBAC_REMOVE)
1290 nfs_buf_remfree(bp);
1291 SET(bp->nb_lflags, NBL_BUSY);
1292
1293 return (0);
1294 }
1295
1296 /*
1297 * simply drop the BUSY status of a buffer
1298 * (must be called with nfs_buf_mutex held)
1299 */
1300 void
1301 nfs_buf_drop(struct nfsbuf *bp)
1302 {
1303 int need_wakeup = 0;
1304
1305 if (!ISSET(bp->nb_lflags, NBL_BUSY))
1306 panic("nfs_buf_drop: buffer not busy!");
1307 if (ISSET(bp->nb_lflags, NBL_WANTED)) {
1308 /*
1309 * delay the actual wakeup until after we
1310 * clear NBL_BUSY and we've dropped nfs_buf_mutex
1311 */
1312 need_wakeup = 1;
1313 }
1314 /* Unlock the buffer. */
1315 CLR(bp->nb_lflags, (NBL_BUSY | NBL_WANTED));
1316
1317 if (need_wakeup)
1318 wakeup(bp);
1319 }
1320
1321 /*
1322 * prepare for iterating over an nfsnode's buffer list
1323 * this lock protects the queue manipulation
1324 * (must be called with nfs_buf_mutex held)
1325 */
1326 int
1327 nfs_buf_iterprepare(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1328 {
1329 struct nfsbuflists *listheadp;
1330
1331 if (flags & NBI_DIRTY)
1332 listheadp = &np->n_dirtyblkhd;
1333 else
1334 listheadp = &np->n_cleanblkhd;
1335
1336 if ((flags & NBI_NOWAIT) && (np->n_bufiterflags & NBI_ITER)) {
1337 LIST_INIT(iterheadp);
1338 return(EWOULDBLOCK);
1339 }
1340
1341 while (np->n_bufiterflags & NBI_ITER) {
1342 np->n_bufiterflags |= NBI_ITERWANT;
1343 msleep(&np->n_bufiterflags, nfs_buf_mutex, 0, "nfs_buf_iterprepare", 0);
1344 }
1345 if (LIST_EMPTY(listheadp)) {
1346 LIST_INIT(iterheadp);
1347 return(EINVAL);
1348 }
1349 np->n_bufiterflags |= NBI_ITER;
1350
1351 iterheadp->lh_first = listheadp->lh_first;
1352 listheadp->lh_first->nb_vnbufs.le_prev = &iterheadp->lh_first;
1353 LIST_INIT(listheadp);
1354
1355 return(0);
1356 }
1357
1358 /*
1359 * cleanup after iterating over an nfsnode's buffer list
1360 * this lock protects the queue manipulation
1361 * (must be called with nfs_buf_mutex held)
1362 */
1363 void
1364 nfs_buf_itercomplete(struct nfsnode *np, struct nfsbuflists *iterheadp, int flags)
1365 {
1366 struct nfsbuflists * listheadp;
1367 struct nfsbuf *bp;
1368
1369 if (flags & NBI_DIRTY)
1370 listheadp = &np->n_dirtyblkhd;
1371 else
1372 listheadp = &np->n_cleanblkhd;
1373
1374 while (!LIST_EMPTY(iterheadp)) {
1375 bp = LIST_FIRST(iterheadp);
1376 LIST_REMOVE(bp, nb_vnbufs);
1377 LIST_INSERT_HEAD(listheadp, bp, nb_vnbufs);
1378 }
1379
1380 np->n_bufiterflags &= ~NBI_ITER;
1381 if (np->n_bufiterflags & NBI_ITERWANT) {
1382 np->n_bufiterflags &= ~NBI_ITERWANT;
1383 wakeup(&np->n_bufiterflags);
1384 }
1385 }
1386
1387
1388 /*
1389 * Vnode op for read using bio
1390 * Any similarity to readip() is purely coincidental
1391 */
1392 int
1393 nfs_bioread(
1394 vnode_t vp,
1395 struct uio *uio,
1396 __unused int ioflag,
1397 kauth_cred_t cred,
1398 proc_t p)
1399 {
1400 struct nfsnode *np = VTONFS(vp);
1401 int biosize;
1402 off_t diff;
1403 struct nfsbuf *bp = NULL, *rabp;
1404 struct nfs_vattr nvattr;
1405 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1406 daddr64_t lbn, rabn, lastrabn = -1, tlbn;
1407 int bufsize;
1408 int nra, error = 0, n = 0, on = 0;
1409 caddr_t dp;
1410 struct dirent *direntp = NULL;
1411 enum vtype vtype;
1412 int nocachereadahead = 0;
1413
1414 FSDBG_TOP(514, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1415
1416 #if DIAGNOSTIC
1417 if (uio->uio_rw != UIO_READ)
1418 panic("nfs_read mode");
1419 #endif
1420 if (uio_uio_resid(uio) == 0) {
1421 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1422 return (0);
1423 }
1424 if (uio->uio_offset < 0) {
1425 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1426 return (EINVAL);
1427 }
1428
1429 if ((nmp->nm_flag & NFSMNT_NFSV3) && !(nmp->nm_state & NFSSTA_GOTFSINFO))
1430 nfs_fsinfo(nmp, vp, cred, p);
1431 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
1432 vtype = vnode_vtype(vp);
1433 /*
1434 * For nfs, cache consistency can only be maintained approximately.
1435 * Although RFC1094 does not specify the criteria, the following is
1436 * believed to be compatible with the reference port.
1437 * For nfs:
1438 * If the file's modify time on the server has changed since the
1439 * last read rpc or you have written to the file,
1440 * you may have lost data cache consistency with the
1441 * server, so flush all of the file's data out of the cache.
1442 * Then force a getattr rpc to ensure that you have up to date
1443 * attributes.
1444 * NB: This implies that cache data can be read when up to
1445 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1446  * current attributes, this can be forced by calling
1447  * NATTRINVALIDATE() before the nfs_getattr() call.
1448 */
1449 if (np->n_flag & NNEEDINVALIDATE) {
1450 np->n_flag &= ~NNEEDINVALIDATE;
1451 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1452 }
1453 if (np->n_flag & NMODIFIED) {
1454 if (vtype != VREG) {
1455 if (vtype != VDIR)
1456 panic("nfs: bioread, not dir");
1457 nfs_invaldir(vp);
1458 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1459 if (error) {
1460 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1461 return (error);
1462 }
1463 }
1464 NATTRINVALIDATE(np);
1465 error = nfs_getattr(vp, &nvattr, cred, p);
1466 if (error) {
1467 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1468 return (error);
1469 }
1470 if (vtype == VDIR) {
1471 /* if directory changed, purge any name cache entries */
1472 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1473 cache_purge(vp);
1474 np->n_ncmtime = nvattr.nva_mtime;
1475 }
1476 np->n_mtime = nvattr.nva_mtime;
1477 } else {
1478 error = nfs_getattr(vp, &nvattr, cred, p);
1479 if (error) {
1480 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1481 return (error);
1482 }
1483 if (nfstimespeccmp(&np->n_mtime, &nvattr.nva_mtime, !=)) {
1484 if (vtype == VDIR) {
1485 nfs_invaldir(vp);
1486 /* purge name cache entries */
1487 if (nfstimespeccmp(&np->n_ncmtime, &nvattr.nva_mtime, !=))
1488 cache_purge(vp);
1489 }
1490 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1491 if (error) {
1492 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1493 return (error);
1494 }
1495 if (vtype == VDIR)
1496 np->n_ncmtime = nvattr.nva_mtime;
1497 np->n_mtime = nvattr.nva_mtime;
1498 }
1499 }
1500
1501 if (vnode_isnocache(vp)) {
1502 if (!(np->n_flag & NNOCACHE)) {
1503 if (NVALIDBUFS(np)) {
1504 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1505 if (error) {
1506 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1507 return (error);
1508 }
1509 }
1510 np->n_flag |= NNOCACHE;
1511 }
1512 } else if (np->n_flag & NNOCACHE) {
1513 np->n_flag &= ~NNOCACHE;
1514 }
1515
1516 do {
1517 if (np->n_flag & NNOCACHE) {
1518 switch (vtype) {
1519 case VREG:
1520 /*
1521 * If we have only a block or so to read,
1522 * just do the rpc directly.
1523 * If we have a couple blocks or more to read,
1524 * then we'll take advantage of readahead within
1525 * this loop to try to fetch all the data in parallel
1526 */
1527 if (!nocachereadahead && (uio_uio_resid(uio) < 2*biosize)) {
1528 error = nfs_readrpc(vp, uio, cred, p);
1529 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1530 return (error);
1531 }
1532 nocachereadahead = 1;
1533 break;
1534 case VLNK:
1535 error = nfs_readlinkrpc(vp, uio, cred, p);
1536 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1537 return (error);
1538 case VDIR:
1539 break;
1540 default:
1541 printf(" NFSNOCACHE: type %x unexpected\n", vtype);
1542 };
1543 }
1544 switch (vtype) {
1545 case VREG:
1546 lbn = uio->uio_offset / biosize;
1547
1548 /*
1549 * Copy directly from any cached pages without grabbing the bufs.
1550 *
1551 * Note: for "nocache" reads, we don't copy directly from UBC
1552 * because any cached pages will be for readahead buffers that
1553 * need to be invalidated anyway before we finish this request.
1554 */
1555 if (!(np->n_flag & NNOCACHE) &&
1556 (uio->uio_segflg == UIO_USERSPACE32 ||
1557 uio->uio_segflg == UIO_USERSPACE64 ||
1558 uio->uio_segflg == UIO_USERSPACE)) {
1559 // LP64todo - fix this!
1560 int io_resid = uio_uio_resid(uio);
1561 diff = np->n_size - uio->uio_offset;
1562 if (diff < io_resid)
1563 io_resid = diff;
1564 if (io_resid > 0) {
1565 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1566 if (error) {
1567 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1568 return (error);
1569 }
1570 }
1571 /* count any biocache reads that we just copied directly */
1572 if (lbn != uio->uio_offset / biosize) {
1573 OSAddAtomic((uio->uio_offset / biosize) - lbn, (SInt32*)&nfsstats.biocache_reads);
1574 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1575 }
1576 }
1577
1578 lbn = uio->uio_offset / biosize;
1579 on = uio->uio_offset % biosize;
1580
1581 /*
1582 * Start the read ahead(s), as required.
1583 */
1584 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1585 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1586 rabn = lbn + 1 + nra;
1587 if (rabn <= lastrabn) {
1588 /* we've already (tried to) read this block */
1589 /* no need to try it again... */
1590 continue;
1591 }
1592 lastrabn = rabn;
1593 if ((off_t)rabn * biosize >= (off_t)np->n_size)
1594 break;
1595 if ((np->n_flag & NNOCACHE) &&
1596 (((off_t)rabn * biosize) >= (uio->uio_offset + uio_uio_resid(uio))))
1597 /* for uncached readahead, don't go beyond end of request */
1598 break;
1599 /* check if block exists and is valid. */
1600 error = nfs_buf_get(vp, rabn, biosize, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1601 if (error) {
1602 FSDBG_BOT(514, vp, 0xd1e000b, 1, error);
1603 return (error);
1604 }
1605 if (!rabp)
1606 continue;
1607 if (nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize)) {
1608 nfs_buf_release(rabp, 1);
1609 continue;
1610 }
1611 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1612 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1613 if (nfs_asyncio(rabp, cred)) {
1614 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1615 rabp->nb_error = EIO;
1616 nfs_buf_release(rabp, 1);
1617 }
1618 } else
1619 nfs_buf_release(rabp, 1);
1620 }
1621 }
1622
1623 if ((uio_uio_resid(uio) <= 0) || (uio->uio_offset >= (off_t)np->n_size)) {
1624 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), 0xaaaaaaaa);
1625 return (0);
1626 }
1627
1628 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_reads);
1629
1630 /*
1631 * If the block is in the cache and has the required data
1632 * in a valid region, just copy it out.
1633 * Otherwise, get the block and write back/read in,
1634 * as required.
1635 */
1636 again:
1637 bufsize = biosize;
1638 // LP64todo - fix this!
1639 n = min((unsigned)(bufsize - on), uio_uio_resid(uio));
1640 diff = np->n_size - uio->uio_offset;
1641 if (diff < n)
1642 n = diff;
1643
1644 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_READ, &bp);
1645 if (error) {
1646 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1647 return (EINTR);
1648 }
1649
1650 /* if any pages are valid... */
1651 if (bp->nb_valid) {
1652 /* ...check for any invalid pages in the read range */
1653 int pg, firstpg, lastpg, dirtypg;
1654 dirtypg = firstpg = lastpg = -1;
1655 pg = on/PAGE_SIZE;
1656 while (pg <= (on + n - 1)/PAGE_SIZE) {
1657 if (!NBPGVALID(bp,pg)) {
1658 if (firstpg < 0)
1659 firstpg = pg;
1660 lastpg = pg;
1661 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1662 dirtypg = pg;
1663 pg++;
1664 }
1665
1666 /* if there are no invalid pages, we're all set */
1667 if (firstpg < 0) {
1668 if (bp->nb_validoff < 0) {
1669 /* valid range isn't set up, so */
1670 /* set it to what we know is valid */
1671 bp->nb_validoff = trunc_page(on);
1672 bp->nb_validend = round_page(on+n);
1673 nfs_buf_normalize_valid_range(np, bp);
1674 }
1675 goto buffer_ready;
1676 }
1677
1678 /* there are invalid pages in the read range */
1679 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1680 /* there are also dirty page(s) in the range, */
1681 /* so write the buffer out and try again */
1682 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1683 SET(bp->nb_flags, NB_ASYNC);
1684 if (bp->nb_wcred == NOCRED) {
1685 kauth_cred_ref(cred);
1686 bp->nb_wcred = cred;
1687 }
1688 error = nfs_buf_write(bp);
1689 if (error) {
1690 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1691 return (error);
1692 }
1693 goto again;
1694 }
1695 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1696 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1697 /* we need to read in more than half the buffer and the */
1698 /* buffer's not dirty, so just fetch the whole buffer */
1699 bp->nb_valid = 0;
1700 } else {
1701 /* read the page range in */
1702 uio_t auio;
1703 char uio_buf[ UIO_SIZEOF(1) ];
1704
1705 NFS_BUF_MAP(bp);
1706 auio = uio_createwithbuffer(1, (NBOFF(bp) + firstpg * PAGE_SIZE_64),
1707 UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf));
1708 if (!auio) {
1709 error = ENOMEM;
1710 } else {
1711 uio_addiov(auio, CAST_USER_ADDR_T((bp->nb_data + firstpg * PAGE_SIZE)),
1712 ((lastpg - firstpg + 1) * PAGE_SIZE));
1713 error = nfs_readrpc(vp, auio, cred, p);
1714 }
1715 if (error) {
1716 if (np->n_flag & NNOCACHE)
1717 SET(bp->nb_flags, NB_NOCACHE);
1718 nfs_buf_release(bp, 1);
1719 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1720 return (error);
1721 }
1722 /* Make sure that the valid range is set to cover this read. */
1723 bp->nb_validoff = trunc_page_32(on);
1724 bp->nb_validend = round_page_32(on+n);
1725 nfs_buf_normalize_valid_range(np, bp);
1726 if (uio_resid(auio) > 0) {
1727 /* if short read, must have hit EOF, */
1728 /* so zero the rest of the range */
1729 bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio));
1730 }
1731 /* mark the pages (successfully read) as valid */
1732 for (pg=firstpg; pg <= lastpg; pg++)
1733 NBPGVALID_SET(bp,pg);
1734 }
1735 }
1736 /* if no pages are valid, read the whole block */
1737 if (!bp->nb_valid) {
1738 SET(bp->nb_flags, NB_READ);
1739 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1740 error = nfs_doio(bp, cred, p);
1741 if (error) {
1742 if (np->n_flag & NNOCACHE)
1743 SET(bp->nb_flags, NB_NOCACHE);
1744 nfs_buf_release(bp, 1);
1745 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1746 return (error);
1747 }
1748 }
1749 buffer_ready:
1750 /* validate read range against valid range and clip */
1751 if (bp->nb_validend > 0) {
1752 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1753 if (diff < n)
1754 n = diff;
1755 }
1756 if (n > 0)
1757 NFS_BUF_MAP(bp);
1758 break;
1759 case VLNK:
1760 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readlinks);
1761 error = nfs_buf_get(vp, 0, NFS_MAXPATHLEN, p, NBLK_READ, &bp);
1762 if (error) {
1763 FSDBG_BOT(514, vp, 0xd1e0010, 0, error);
1764 return (error);
1765 }
1766 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1767 SET(bp->nb_flags, NB_READ);
1768 error = nfs_doio(bp, cred, p);
1769 if (error) {
1770 SET(bp->nb_flags, NB_ERROR);
1771 nfs_buf_release(bp, 1);
1772 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1773 return (error);
1774 }
1775 }
1776 // LP64todo - fix this!
1777 n = min(uio_uio_resid(uio), bp->nb_validend);
1778 on = 0;
1779 break;
1780 case VDIR:
1781 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_readdirs);
1782 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1783 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1784 return (0);
1785 }
1786 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1787 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1788 error = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1789 if (error) {
1790 FSDBG_BOT(514, vp, 0xd1e0012, 0, error);
1791 return (error);
1792 }
1793 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1794 SET(bp->nb_flags, NB_READ);
1795 error = nfs_doio(bp, cred, p);
1796 if (error) {
1797 nfs_buf_release(bp, 1);
1798 }
1799 while (error == NFSERR_BAD_COOKIE) {
1800 nfs_invaldir(vp);
1801 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1802 /*
1803 * Yuck! The directory has been modified on the
1804 * server. The only way to get the block is by
1805 * reading from the beginning to get all the
1806 * offset cookies.
1807 */
1808 for (tlbn = 0; tlbn <= lbn && !error; tlbn++) {
1809 if (np->n_direofoffset
1810 && (tlbn * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1811 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1812 return (0);
1813 }
1814 error = nfs_buf_get(vp, tlbn, NFS_DIRBLKSIZ, p, NBLK_READ, &bp);
1815 if (error) {
1816 FSDBG_BOT(514, vp, 0xd1e0013, 0, error);
1817 return (error);
1818 }
1819 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1820 SET(bp->nb_flags, NB_READ);
1821 error = nfs_doio(bp, cred, p);
1822 /*
1823 * no error + NB_INVAL == directory EOF,
1824 * use the block.
1825 */
1826 if (error == 0 && (bp->nb_flags & NB_INVAL))
1827 break;
1828 }
1829 /*
1830 * An error will throw away the block and the
1831 * for loop will break out. If no error and this
1832 * is not the block we want, we throw away the
1833 * block and go for the next one via the for loop.
1834 */
1835 if (error || tlbn < lbn)
1836 nfs_buf_release(bp, 1);
1837 }
1838 }
1839 /*
1840 * The above while is repeated if we hit another cookie
1841 * error. If we hit an error and it wasn't a cookie error,
1842 * we give up.
1843 */
1844 if (error) {
1845 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1846 return (error);
1847 }
1848 }
1849
1850 /*
1851 * If not eof and read aheads are enabled, start one.
1852 * (You need the current block first, so that you have the
1853 * directory offset cookie of the next block.)
1854 */
1855 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1856 (np->n_direofoffset == 0 ||
1857 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1858 !nfs_buf_is_incore(vp, lbn + 1)) {
1859 error = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p, NBLK_READ|NBLK_NOWAIT, &rabp);
1860 if (error) {
1861 FSDBG_BOT(514, vp, 0xd1e0015, 0, error);
1862 return (error);
1863 }
1864 if (rabp) {
1865 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1866 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1867 if (nfs_asyncio(rabp, cred)) {
1868 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1869 rabp->nb_error = EIO;
1870 nfs_buf_release(rabp, 1);
1871 }
1872 } else {
1873 nfs_buf_release(rabp, 1);
1874 }
1875 }
1876 }
1877 /*
1878 * Make sure we use a signed variant of min() since
1879 * the second term may be negative.
1880 */
1881 // LP64todo - fix this!
1882 n = lmin(uio_uio_resid(uio), bp->nb_validend - on);
1883 /*
1884 * We keep track of the directory eof in
1885 * np->n_direofoffset and chop it off as an
1886 * extra step right here.
1887 */
1888 if (np->n_direofoffset &&
1889 n > np->n_direofoffset - uio->uio_offset)
1890 n = np->n_direofoffset - uio->uio_offset;
1891 /*
1892 * Make sure that we return an integral number of entries so
1893 * that any subsequent calls will start copying from the start
1894 * of the next entry.
1895 *
1896 * If the current value of n has the last entry cut short,
1897 * set n to copy everything up to the last entry instead.
1898 */
1899 if (n > 0) {
1900 dp = bp->nb_data + on;
1901 while (dp < (bp->nb_data + on + n)) {
1902 direntp = (struct dirent *)dp;
1903 dp += direntp->d_reclen;
1904 }
1905 if (dp > (bp->nb_data + on + n))
1906 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1907 }
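/* e.g. if the records at "on" are 24, 32, and 40 bytes long and n lands 10 bytes into
 * the third record, n is pulled back to 56 so the next read starts on a record boundary */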
1908 break;
1909 default:
1910 printf("nfs_bioread: type %x unexpected\n", vtype);
1911 FSDBG_BOT(514, vp, 0xd1e0016, 0, EINVAL);
1912 return (EINVAL);
1913 }
1914
1915 if (n > 0) {
1916 error = uiomove(bp->nb_data + on, (int)n, uio);
1917 }
1918 switch (vtype) {
1919 case VREG:
1920 if (np->n_flag & NNOCACHE)
1921 SET(bp->nb_flags, NB_NOCACHE);
1922 break;
1923 case VLNK:
1924 n = 0;
1925 break;
1926 case VDIR:
1927 break;
1928 default:
1929 break;
1930 }
1931 nfs_buf_release(bp, 1);
1932 } while (error == 0 && uio_uio_resid(uio) > 0 && n > 0);
1933 FSDBG_BOT(514, vp, uio->uio_offset, uio_uio_resid(uio), error);
1934 return (error);
1935 }
1936
1937
1938 /*
1939 * Vnode op for write using bio
1940 */
1941 int
1942 nfs_write(ap)
1943 struct vnop_write_args /* {
1944 struct vnodeop_desc *a_desc;
1945 vnode_t a_vp;
1946 struct uio *a_uio;
1947 int a_ioflag;
1948 vfs_context_t a_context;
1949 } */ *ap;
1950 {
1951 struct uio *uio = ap->a_uio;
1952 vnode_t vp = ap->a_vp;
1953 struct nfsnode *np = VTONFS(vp);
1954 proc_t p;
1955 kauth_cred_t cred;
1956 int ioflag = ap->a_ioflag;
1957 struct nfsbuf *bp;
1958 struct nfs_vattr nvattr;
1959 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
1960 daddr64_t lbn;
1961 int biosize, bufsize;
1962 int n, on, error = 0;
1963 off_t boff, start, end, cureof;
1964 struct iovec_32 iov;
1965 struct uio auio;
1966
1967 FSDBG_TOP(515, vp, uio->uio_offset, uio_uio_resid(uio), ioflag);
1968
1969 #if DIAGNOSTIC
1970 if (uio->uio_rw != UIO_WRITE)
1971 panic("nfs_write mode");
1972 if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
1973 panic("nfs_write proc");
1974 #endif
1975
1976 p = vfs_context_proc(ap->a_context);
1977 cred = vfs_context_ucred(ap->a_context);
1978
1979 if (vnode_vtype(vp) != VREG)
1980 return (EIO);
1981
1982 np->n_flag |= NWRBUSY;
1983
1984 if (np->n_flag & NNEEDINVALIDATE) {
1985 np->n_flag &= ~NNEEDINVALIDATE;
1986 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cred, p, 1);
1987 }
1988 if (np->n_flag & NWRITEERR) {
1989 np->n_flag &= ~(NWRITEERR | NWRBUSY);
1990 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), np->n_error);
1991 return (np->n_error);
1992 }
1993 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1994 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1995 (void)nfs_fsinfo(nmp, vp, cred, p);
1996 if (ioflag & (IO_APPEND | IO_SYNC)) {
1997 if (np->n_flag & NMODIFIED) {
1998 NATTRINVALIDATE(np);
1999 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2000 if (error) {
2001 np->n_flag &= ~NWRBUSY;
2002 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
2003 return (error);
2004 }
2005 }
2006 if (ioflag & IO_APPEND) {
2007 NATTRINVALIDATE(np);
2008 error = nfs_getattr(vp, &nvattr, cred, p);
2009 if (error) {
2010 np->n_flag &= ~NWRBUSY;
2011 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
2012 return (error);
2013 }
2014 uio->uio_offset = np->n_size;
2015 }
2016 }
2017 if (uio->uio_offset < 0) {
2018 np->n_flag &= ~NWRBUSY;
2019 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
2020 return (EINVAL);
2021 }
2022 if (uio_uio_resid(uio) == 0) {
2023 np->n_flag &= ~NWRBUSY;
2024 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2025 return (0);
2026 }
2027
2028 biosize = vfs_statfs(vnode_mount(vp))->f_iosize;
2029
2030 if (vnode_isnocache(vp)) {
2031 if (!(np->n_flag & NNOCACHE)) {
2032 if (NVALIDBUFS(np)) {
2033 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2034 if (error) {
2035 np->n_flag &= ~NWRBUSY;
2036 FSDBG_BOT(515, vp, 0, 0, error);
2037 return (error);
2038 }
2039 }
2040 np->n_flag |= NNOCACHE;
2041 }
2042 } else if (np->n_flag & NNOCACHE) {
2043 np->n_flag &= ~NNOCACHE;
2044 }
2045
2046 do {
2047 OSAddAtomic(1, (SInt32*)&nfsstats.biocache_writes);
2048 lbn = uio->uio_offset / biosize;
2049 on = uio->uio_offset % biosize;
2050 // LP64todo - fix this
2051 n = min((unsigned)(biosize - on), uio_uio_resid(uio));
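/* e.g. with biosize 32768, a write at offset 40000 maps to lbn 1, on = 7232,
 * and at most 25536 bytes are handled in this pass */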
2052 again:
2053 bufsize = biosize;
2054 /*
2055 * Get a cache block for writing. The range to be written is
2056 * (off..off+n) within the block. We ensure that the block
2057 * either has no dirty region or that the given range is
2058 * contiguous with the existing dirty region.
2059 */
2060 error = nfs_buf_get(vp, lbn, bufsize, p, NBLK_WRITE, &bp);
2061 if (error) {
2062 np->n_flag &= ~NWRBUSY;
2063 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2064 return (error);
2065 }
2066 /* map the block because we know we're going to write to it */
2067 NFS_BUF_MAP(bp);
2068
2069 if (np->n_flag & NNOCACHE)
2070 SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE));
2071
2072 if (bp->nb_wcred == NOCRED) {
2073 kauth_cred_ref(cred);
2074 bp->nb_wcred = cred;
2075 }
2076
2077 /*
2078 * If there's already a dirty range AND dirty pages in this block we
2079 * need to send a commit AND write the dirty pages before continuing.
2080 *
2081 * If there's already a dirty range OR dirty pages in this block
2082 * and the new write range is not contiguous with the existing range,
2083 * then force the buffer to be written out now.
2084 * (We used to just extend the dirty range to cover the valid,
2085 * but unwritten, data in between also. But writing ranges
2086 * of data that weren't actually written by an application
2087 * risks overwriting some other client's data with stale data
2088 * that's just masquerading as new written data.)
2089 */
2090 if (bp->nb_dirtyend > 0) {
2091 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
2092 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
2093 /* write/commit buffer "synchronously" */
2094 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2095 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2096 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2097 error = nfs_buf_write(bp);
2098 if (error) {
2099 np->n_flag &= ~NWRBUSY;
2100 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2101 return (error);
2102 }
2103 goto again;
2104 }
2105 } else if (bp->nb_dirty) {
2106 int firstpg, lastpg;
2107 u_int32_t pagemask;
2108 /* calculate write range pagemask */
2109 firstpg = on/PAGE_SIZE;
2110 lastpg = (on+n-1)/PAGE_SIZE;
2111 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
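/* e.g. with 4KB pages, a write at on = 4096, n = 8192 gives firstpg 1, lastpg 2,
 * pagemask 0x6 (pages 1 and 2 are the pages touched by this write) */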
2112 /* check if there are dirty pages outside the write range */
2113 if (bp->nb_dirty & ~pagemask) {
2114 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
2115 /* write/commit buffer "synchronously" */
2116 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2117 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
2118 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
2119 error = nfs_buf_write(bp);
2120 if (error) {
2121 np->n_flag &= ~NWRBUSY;
2122 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2123 return (error);
2124 }
2125 goto again;
2126 }
2127 /* if the first or last pages are already dirty */
2128 /* make sure that the dirty range encompasses those pages */
2129 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
2130 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
2131 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
2132 if (NBPGDIRTY(bp,lastpg)) {
2133 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
2134 /* clip to EOF */
2135 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2136 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2137 } else
2138 bp->nb_dirtyend = on+n;
2139 }
2140 }
2141
2142 /*
2143 * Are we extending the size of the file with this write?
2144 * If so, update file size now that we have the block.
2145 * If there was a partial buf at the old eof, validate
2146 * and zero the new bytes.
2147 */
2148 cureof = (off_t)np->n_size;
2149 if (uio->uio_offset + n > (off_t)np->n_size) {
2150 struct nfsbuf *eofbp = NULL;
2151 daddr64_t eofbn = np->n_size / biosize;
2152 int eofoff = np->n_size % biosize;
2153 int neweofoff = (uio->uio_offset + n) % biosize;
2154
2155 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
2156
2157 if (eofoff && (eofbn < lbn)) {
2158 error = nfs_buf_get(vp, eofbn, biosize, p, NBLK_WRITE|NBLK_ONLYVALID, &eofbp);
2159 if (error) {
2160 np->n_flag &= ~NWRBUSY;
2161 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2162 return (error);
2163 }
2164 }
2165
2166 /* if we're extending within the same last block */
2167 /* and the block is flagged as being cached... */
2168 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
2169 /* ...check that all pages in buffer are valid */
2170 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
2171 u_int32_t pagemask;
2172 /* pagemask only has to extend to last page being written to */
2173 pagemask = (1 << (endpg+1)) - 1;
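/* e.g. with 4KB pages and neweofoff = 6000, endpg is 1 and pagemask is 0x3,
 * i.e. pages 0 and 1 must both be valid to skip the zero-fill below */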
2174 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
2175 if ((bp->nb_valid & pagemask) != pagemask) {
2176 /* zerofill any hole */
2177 if (on > bp->nb_validend) {
2178 int i;
2179 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
2180 NBPGVALID_SET(bp, i);
2181 NFS_BUF_MAP(bp);
2182 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
2183 bzero((char *)bp->nb_data + bp->nb_validend,
2184 on - bp->nb_validend);
2185 }
2186 /* zerofill any trailing data in the last page */
2187 if (neweofoff) {
2188 NFS_BUF_MAP(bp);
2189 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
2190 bzero((char *)bp->nb_data + neweofoff,
2191 PAGE_SIZE - (neweofoff & PAGE_MASK));
2192 }
2193 }
2194 }
2195 np->n_flag |= NMODIFIED;
2196 np->n_size = uio->uio_offset + n;
2197 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
2198 if (eofbp) {
2199 /*
2200 * We may need to zero any previously invalid data
2201 * after the old EOF in the previous EOF buffer.
2202 *
2203 * For the old last page, don't zero bytes if there
2204 * are invalid bytes in that page (i.e. the page isn't
2205 * currently valid).
2206 * For pages after the old last page, zero them and
2207 * mark them as valid.
2208 */
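/* e.g. with 4KB pages and the old EOF at offset 5000 in this buffer: bytes 5000-8191
 * are zeroed only if page 1 was already valid; every later page is zeroed in full
 * and marked valid */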
2209 char *d;
2210 int i;
2211 if (np->n_flag & NNOCACHE)
2212 SET(eofbp->nb_flags, (NB_NOCACHE|NB_STABLE));
2213 NFS_BUF_MAP(eofbp);
2214 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
2215 d = eofbp->nb_data;
2216 i = eofoff/PAGE_SIZE;
2217 while (eofoff < biosize) {
2218 int poff = eofoff & PAGE_MASK;
2219 if (!poff || NBPGVALID(eofbp,i)) {
2220 bzero(d + eofoff, PAGE_SIZE - poff);
2221 NBPGVALID_SET(eofbp, i);
2222 }
2223 if (bp->nb_validend == eofoff)
2224 bp->nb_validend += PAGE_SIZE - poff;
2225 eofoff += PAGE_SIZE - poff;
2226 i++;
2227 }
2228 nfs_buf_release(eofbp, 1);
2229 }
2230 }
2231 /*
2232 * If dirtyend exceeds file size, chop it down. This should
2233 * not occur unless there is a race.
2234 */
2235 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
2236 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2237 /*
2238 * UBC doesn't handle partial pages, so we need to make sure
2239 * that any pages left in the page cache are completely valid.
2240 *
2241 * Writes that are smaller than a block are delayed if they
2242 * don't extend to the end of the block.
2243 *
2244 * If the block isn't (completely) cached, we may need to read
2245 * in some parts of pages that aren't covered by the write.
2246 * If the write offset (on) isn't page aligned, we'll need to
2247 * read the start of the first page being written to. Likewise,
2248 * if the offset of the end of the write (on+n) isn't page aligned,
2249 * we'll need to read the end of the last page being written to.
2250 *
2251 * Notes:
2252 * We don't want to read anything we're just going to write over.
2253 * We don't want to issue multiple I/Os if we don't have to
2254 * (because they're synchronous rpcs).
2255 * We don't want to read anything we already have modified in the
2256 * page cache.
2257 */
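/* e.g. with 4KB pages, a 1024-byte write at on = 512 into an uncached block ends up
 * with start = 0 and end >= 4096, so the head and tail of the page are filled by a
 * single read RPC (or zeroed, if that range lies beyond the current EOF) */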
2258 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
2259 int firstpg, lastpg, dirtypg;
2260 int firstpgoff, lastpgoff;
2261 start = end = -1;
2262 firstpg = on/PAGE_SIZE;
2263 firstpgoff = on & PAGE_MASK;
2264 lastpg = (on+n-1)/PAGE_SIZE;
2265 lastpgoff = (on+n) & PAGE_MASK;
2266 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
2267 /* need to read start of first page */
2268 start = firstpg * PAGE_SIZE;
2269 end = start + firstpgoff;
2270 }
2271 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
2272 /* need to read end of last page */
2273 if (start < 0)
2274 start = (lastpg * PAGE_SIZE) + lastpgoff;
2275 end = (lastpg + 1) * PAGE_SIZE;
2276 }
2277 if (end > start) {
2278 /* need to read the data in range: start...end-1 */
2279
2280 /* first, check for dirty pages in between */
2281 /* if there are, we'll have to do two reads because */
2282 /* we don't want to overwrite the dirty pages. */
2283 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
2284 if (NBPGDIRTY(bp,dirtypg))
2285 break;
2286
2287 /* if start is at beginning of page, try */
2288 /* to get any preceding pages as well. */
2289 if (!(start & PAGE_MASK)) {
2290 /* stop at next dirty/valid page or start of block */
2291 for (; start > 0; start-=PAGE_SIZE)
2292 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
2293 break;
2294 }
2295
2296 NFS_BUF_MAP(bp);
2297 /* setup uio for read(s) */
2298 boff = NBOFF(bp);
2299 auio.uio_iovs.iov32p = &iov;
2300 auio.uio_iovcnt = 1;
2301 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2302 auio.uio_segflg = UIO_SYSSPACE;
2303 #else
2304 auio.uio_segflg = UIO_SYSSPACE32;
2305 #endif
2306 auio.uio_rw = UIO_READ;
2307
2308 if (dirtypg <= (end-1)/PAGE_SIZE) {
2309 /* there's a dirty page in the way, so just do two reads */
2310 /* we'll read the preceding data here */
2311 auio.uio_offset = boff + start;
2312 iov.iov_len = on - start;
2313 uio_uio_resid_set(&auio, iov.iov_len);
2314 iov.iov_base = (uintptr_t) bp->nb_data + start;
2315 error = nfs_readrpc(vp, &auio, cred, p);
2316 if (error) {
2317 bp->nb_error = error;
2318 SET(bp->nb_flags, NB_ERROR);
2319 printf("nfs_write: readrpc %d", error);
2320 }
2321 if (uio_uio_resid(&auio) > 0) {
2322 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee01);
2323 // LP64todo - fix this
2324 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2325 }
2326 /* update validoff/validend if necessary */
2327 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2328 bp->nb_validoff = start;
2329 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2330 bp->nb_validend = on;
2331 if ((off_t)np->n_size > boff + bp->nb_validend)
2332 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2333 /* validate any pages before the write offset */
2334 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
2335 NBPGVALID_SET(bp, start/PAGE_SIZE);
2336 /* adjust start to read any trailing data */
2337 start = on+n;
2338 }
2339
2340 /* if end is at end of page, try to */
2341 /* get any following pages as well. */
2342 if (!(end & PAGE_MASK)) {
2343 /* stop at next valid page or end of block */
2344 for (; end < bufsize; end+=PAGE_SIZE)
2345 if (NBPGVALID(bp,end/PAGE_SIZE))
2346 break;
2347 }
2348
2349 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2350 /*
2351 * Either this entire read is beyond the current EOF
2352 * or the range that we won't be modifying (on+n...end)
2353 * is all beyond the current EOF.
2354 * No need to make a trip across the network to
2355 * read nothing. So, just zero the buffer instead.
2356 */
2357 FSDBG(516, bp, start, end - start, 0xd00dee00);
2358 bzero(bp->nb_data + start, end - start);
2359 } else {
2360 /* now we'll read the (rest of the) data */
2361 auio.uio_offset = boff + start;
2362 iov.iov_len = end - start;
2363 uio_uio_resid_set(&auio, iov.iov_len);
2364 iov.iov_base = (uintptr_t) (bp->nb_data + start);
2365 error = nfs_readrpc(vp, &auio, cred, p);
2366 if (error) {
2367 bp->nb_error = error;
2368 SET(bp->nb_flags, NB_ERROR);
2369 printf("nfs_write: readrpc %d", error);
2370 }
2371 if (uio_uio_resid(&auio) > 0) {
2372 FSDBG(516, bp, iov.iov_base - bp->nb_data, uio_uio_resid(&auio), 0xd00dee02);
2373 // LP64todo - fix this
2374 bzero((caddr_t)iov.iov_base, uio_uio_resid(&auio));
2375 }
2376 }
2377 /* update validoff/validend if necessary */
2378 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2379 bp->nb_validoff = start;
2380 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2381 bp->nb_validend = end;
2382 if ((off_t)np->n_size > boff + bp->nb_validend)
2383 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2384 /* validate any pages before the write offset's page */
2385 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2386 NBPGVALID_SET(bp, start/PAGE_SIZE);
2387 /* validate any pages after the range of pages being written to */
2388 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2389 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2390 /* Note: pages being written to will be validated when written */
2391 }
2392 }
2393
2394 if (ISSET(bp->nb_flags, NB_ERROR)) {
2395 error = bp->nb_error;
2396 nfs_buf_release(bp, 1);
2397 np->n_flag &= ~NWRBUSY;
2398 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2399 return (error);
2400 }
2401
2402 np->n_flag |= NMODIFIED;
2403
2404 NFS_BUF_MAP(bp);
2405 error = uiomove((char *)bp->nb_data + on, n, uio);
2406 if (error) {
2407 SET(bp->nb_flags, NB_ERROR);
2408 nfs_buf_release(bp, 1);
2409 np->n_flag &= ~NWRBUSY;
2410 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), error);
2411 return (error);
2412 }
2413
2414 /* validate any pages written to */
2415 start = on & ~PAGE_MASK;
2416 for (; start < on+n; start += PAGE_SIZE) {
2417 NBPGVALID_SET(bp, start/PAGE_SIZE);
2418 /*
2419 * This may seem a little weird, but we don't actually set the
2420 * dirty bits for writes. This is because we keep the dirty range
2421 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2422 * delayed writes, when we give the pages back to the VM we don't
2423 * want to keep them marked dirty, because when we later write the
2424 * buffer we won't be able to tell which pages were written dirty
2425 * and which pages were mmapped and dirtied.
2426 */
2427 }
2428 if (bp->nb_dirtyend > 0) {
2429 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2430 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2431 } else {
2432 bp->nb_dirtyoff = on;
2433 bp->nb_dirtyend = on + n;
2434 }
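/* e.g. an existing dirty range of 1500-2000 merged with a write at on = 1000, n = 500
 * (contiguous, as enforced above) becomes 1000-2000 */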
2435 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2436 bp->nb_validoff > bp->nb_dirtyend) {
2437 bp->nb_validoff = bp->nb_dirtyoff;
2438 bp->nb_validend = bp->nb_dirtyend;
2439 } else {
2440 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2441 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2442 }
2443 if (!ISSET(bp->nb_flags, NB_CACHE))
2444 nfs_buf_normalize_valid_range(np, bp);
2445
2446 /*
2447 * Since this block is being modified, it must be written
2448 * again and not just committed.
2449 */
2450 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2451 np->n_needcommitcnt--;
2452 CHECK_NEEDCOMMITCNT(np);
2453 }
2454 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2455
2456 if (ioflag & IO_SYNC) {
2457 bp->nb_proc = p;
2458 error = nfs_buf_write(bp);
2459 if (error) {
2460 np->n_flag &= ~NWRBUSY;
2461 FSDBG_BOT(515, vp, uio->uio_offset,
2462 uio_uio_resid(uio), error);
2463 return (error);
2464 }
2465 } else if (((n + on) == biosize) || (np->n_flag & NNOCACHE)) {
2466 bp->nb_proc = NULL;
2467 SET(bp->nb_flags, NB_ASYNC);
2468 nfs_buf_write(bp);
2469 } else
2470 nfs_buf_write_delayed(bp, p);
2471
2472 if (np->n_needcommitcnt > (nfsbufcnt/16))
2473 nfs_flushcommits(vp, p, 1);
2474
2475 } while (uio_uio_resid(uio) > 0 && n > 0);
2476
2477 np->n_flag &= ~NWRBUSY;
2478 FSDBG_BOT(515, vp, uio->uio_offset, uio_uio_resid(uio), 0);
2479 return (0);
2480 }
2481
2482 /*
2483 * Flush out and invalidate all buffers associated with a vnode.
2484 * Called with the underlying object locked.
2485 */
2486 static int
2487 nfs_vinvalbuf_internal(
2488 vnode_t vp,
2489 int flags,
2490 kauth_cred_t cred,
2491 proc_t p,
2492 int slpflag,
2493 int slptimeo)
2494 {
2495 struct nfsbuf *bp;
2496 struct nfsbuflists blist;
2497 int list, error = 0;
2498 struct nfsnode *np = VTONFS(vp);
2499
2500 if (flags & V_SAVE) {
2501 if ((error = nfs_flush(vp, MNT_WAIT, cred, p,
2502 (flags & V_IGNORE_WRITEERR))))
2503 return (error);
2504 if (!LIST_EMPTY(&np->n_dirtyblkhd))
2505 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2506 vp, LIST_FIRST(&np->n_dirtyblkhd));
2507 }
2508
2509 lck_mtx_lock(nfs_buf_mutex);
2510 for (;;) {
2511 list = NBI_CLEAN;
2512 if (nfs_buf_iterprepare(np, &blist, list)) {
2513 list = NBI_DIRTY;
2514 if (nfs_buf_iterprepare(np, &blist, list))
2515 break;
2516 }
2517 while ((bp = LIST_FIRST(&blist))) {
2518 LIST_REMOVE(bp, nb_vnbufs);
2519 if (list == NBI_CLEAN)
2520 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
2521 else
2522 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2523 nfs_buf_refget(bp);
2524 while ((error = nfs_buf_acquire(bp, NBAC_REMOVE, slpflag, slptimeo))) {
2525 FSDBG(556, vp, bp, NBOFF(bp), bp->nb_flags);
2526 if (error != EAGAIN) {
2527 FSDBG(554, vp, bp, -1, error);
2528 nfs_buf_refrele(bp);
2529 nfs_buf_itercomplete(np, &blist, list);
2530 lck_mtx_unlock(nfs_buf_mutex);
2531 return (error);
2532 }
2533 }
2534 nfs_buf_refrele(bp);
2535 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2536 lck_mtx_unlock(nfs_buf_mutex);
2537 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && bp->nb_vp &&
2538 (NBOFF(bp) < (off_t)np->n_size)) {
2539 /* XXX extra paranoia: make sure we're not */
2540 /* somehow leaving any dirty data around */
2541 int mustwrite = 0;
2542 int end = (NBOFF(bp) + bp->nb_bufsize > (off_t)np->n_size) ?
2543 ((off_t)np->n_size - NBOFF(bp)) : bp->nb_bufsize;
2544 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2545 error = nfs_buf_upl_setup(bp);
2546 if (error == EINVAL) {
2547 /* vm object must no longer exist */
2548 /* hopefully we don't need to do */
2549 /* anything for this buffer */
2550 } else if (error)
2551 printf("nfs_vinvalbuf: upl setup failed %d\n", error);
2552 bp->nb_valid = bp->nb_dirty = 0;
2553 }
2554 nfs_buf_upl_check(bp);
2555 /* check for any dirty data before the EOF */
2556 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2557 /* clip dirty range to EOF */
2558 if (bp->nb_dirtyend > end)
2559 bp->nb_dirtyend = end;
2560 mustwrite++;
2561 }
2562 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
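/* e.g. with 4KB pages and end = 5000, the mask is 0x3: dirty bits survive only for
 * pages 0 and 1, the pages that still hold data below the EOF */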
2563 /* also make sure we'll have a credential to do the write */
2564 if (mustwrite && (bp->nb_wcred == NOCRED) && (cred == NOCRED)) {
2565 printf("nfs_vinvalbuf: found dirty buffer with no write creds\n");
2566 mustwrite = 0;
2567 }
2568 if (mustwrite) {
2569 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2570 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2571 panic("nfs_vinvalbuf: dirty buffer without upl");
2572 /* gotta write out dirty data before invalidating */
2573 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2574 /* (NB_NOCACHE indicates buffer should be discarded) */
2575 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2576 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2577 if (bp->nb_wcred == NOCRED) {
2578 kauth_cred_ref(cred);
2579 bp->nb_wcred = cred;
2580 }
2581 error = nfs_buf_write(bp);
2582 // Note: bp has been released
2583 if (error) {
2584 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2585 np->n_error = error;
2586 np->n_flag |= NWRITEERR;
2587 /*
2588 * There was a write error and we need to
2589 * invalidate attrs to sync with server.
2590 * (if this write was extending the file,
2591 * we may no longer know the correct size)
2592 */
2593 NATTRINVALIDATE(np);
2594 error = 0;
2595 }
2596 lck_mtx_lock(nfs_buf_mutex);
2597 continue;
2598 }
2599 }
2600 SET(bp->nb_flags, NB_INVAL);
2601 // hold off on FREEUPs until we're done here
2602 nfs_buf_release(bp, 0);
2603 lck_mtx_lock(nfs_buf_mutex);
2604 }
2605 nfs_buf_itercomplete(np, &blist, list);
2606 }
2607 lck_mtx_unlock(nfs_buf_mutex);
2608 NFS_BUF_FREEUP();
2609 if (NVALIDBUFS(np))
2610 panic("nfs_vinvalbuf: flush failed");
2611 return (0);
2612 }
2613
2614
2615 /*
2616 * Flush and invalidate all dirty buffers. If another process is already
2617 * doing the flush, just wait for completion.
2618 */
2619 int
2620 nfs_vinvalbuf(
2621 vnode_t vp,
2622 int flags,
2623 kauth_cred_t cred,
2624 proc_t p,
2625 int intrflg)
2626 {
2627 struct nfsnode *np = VTONFS(vp);
2628 struct nfsmount *nmp = VFSTONFS(vnode_mount(vp));
2629 int error = 0, slpflag, slptimeo;
2630 off_t size;
2631
2632 FSDBG_TOP(554, vp, flags, intrflg, 0);
2633
2634 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2635 intrflg = 0;
2636 if (intrflg) {
2637 slpflag = PCATCH;
2638 slptimeo = 2 * hz;
2639 } else {
2640 slpflag = 0;
2641 slptimeo = 0;
2642 }
2643 /*
2644 * First wait for any other process doing a flush to complete.
2645 */
2646 while (np->n_flag & NFLUSHINPROG) {
2647 np->n_flag |= NFLUSHWANT;
2648 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2649 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2650 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2651 if (error && (error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p))) {
2652 FSDBG_BOT(554, vp, flags, intrflg, error);
2653 return (error);
2654 }
2655 }
2656
2657 /*
2658 * Now, flush as required.
2659 */
2660 np->n_flag |= NFLUSHINPROG;
2661 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2662 while (error) {
2663 FSDBG(554, vp, 0, 0, error);
2664 error = nfs_sigintr(VFSTONFS(vnode_mount(vp)), NULL, p);
2665 if (error) {
2666 np->n_flag &= ~NFLUSHINPROG;
2667 if (np->n_flag & NFLUSHWANT) {
2668 np->n_flag &= ~NFLUSHWANT;
2669 wakeup((caddr_t)&np->n_flag);
2670 }
2671 FSDBG_BOT(554, vp, flags, intrflg, error);
2672 return (error);
2673 }
2674 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2675 }
2676 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2677 if (np->n_flag & NFLUSHWANT) {
2678 np->n_flag &= ~NFLUSHWANT;
2679 wakeup((caddr_t)&np->n_flag);
2680 }
2681 /*
2682 * get the pages out of vm also
2683 */
2684 if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) {
2685 int rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_INVALIDATE);
2686 if (!rv)
2687 panic("nfs_vinvalbuf(): ubc_sync_range failed!");
2688 }
2689
2690 FSDBG_BOT(554, vp, flags, intrflg, 0);
2691 return (0);
2692 }
2693
2694 /*
2695 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2696 * This is mainly to avoid queueing async I/O requests when the nfsiods
2697 * are all hung on a dead server.
2698 */
2699 int
2700 nfs_asyncio(bp, cred)
2701 struct nfsbuf *bp;
2702 kauth_cred_t cred;
2703 {
2704 struct nfsmount *nmp;
2705 int i;
2706 int gotiod;
2707 int slpflag = 0;
2708 int slptimeo = 0;
2709 int error, error2;
2710 void *wakeme = NULL;
2711 struct timespec ts;
2712
2713 if (nfs_numasync == 0)
2714 return (EIO);
2715
2716 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2717
2718 nmp = ((bp != NULL) ? VFSTONFS(vnode_mount(bp->nb_vp)) : NULL);
2719 again:
2720 if (nmp && nmp->nm_flag & NFSMNT_INT)
2721 slpflag = PCATCH;
2722 gotiod = FALSE;
2723
2724 lck_mtx_lock(nfs_iod_mutex);
2725
2726 /* no nfsbuf means tell nfsiod to process delwri list */
2727 if (!bp)
2728 nfs_ioddelwri = 1;
2729
2730 /*
2731 * Find a free iod to process this request.
2732 */
2733 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2734 if (nfs_iodwant[i]) {
2735 /*
2736 * Found one, so wake it up and tell it which
2737 * mount to process.
2738 */
2739 nfs_iodwant[i] = NULL;
2740 nfs_iodmount[i] = nmp;
2741 if (nmp)
2742 nmp->nm_bufqiods++;
2743 wakeme = &nfs_iodwant[i];
2744 gotiod = TRUE;
2745 break;
2746 }
2747
2748 /* if we're just poking the delwri list, we're done */
2749 if (!bp) {
2750 lck_mtx_unlock(nfs_iod_mutex);
2751 if (wakeme)
2752 wakeup(wakeme);
2753 FSDBG_BOT(552, bp, 0x10101010, wakeme, 0);
2754 return (0);
2755 }
2756
2757 /*
2758 * If none are free, we may already have an iod working on this mount
2759 * point. If so, it will process our request.
2760 */
2761 if (!gotiod) {
2762 if (nmp->nm_bufqiods > 0) {
2763 gotiod = TRUE;
2764 }
2765 }
2766
2767 /*
2768 * If we have an iod which can process the request, then queue
2769 * the buffer.
2770 */
2771 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2772 if (gotiod) {
2773 /*
2774 * Ensure that the queue never grows too large.
2775 */
2776 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2777 if (ISSET(bp->nb_flags, NB_IOD)) {
2778 /* An nfsiod is attempting this async operation so */
2779 /* we must not fall asleep on the bufq because we */
2780 /* could be waiting on ourself. Just return error */
2781 /* and we'll do this operation synchronously. */
2782 goto out;
2783 }
2784 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2785 nmp->nm_bufqwant = TRUE;
2786
2787 ts.tv_sec = (slptimeo/100);
2788 /* hz is 100, so each slptimeo tick is 10ms */
2789 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
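/* e.g. slptimeo = 150 ticks yields ts = { 1 sec, 500000000 nsec }, i.e. 1.5 seconds */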
2790
2791 error = msleep(&nmp->nm_bufq, nfs_iod_mutex, slpflag | PRIBIO,
2792 "nfsaio", &ts);
2793 if (error) {
2794 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2795 if (error2) {
2796 lck_mtx_unlock(nfs_iod_mutex);
2797 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2798 return (error2);
2799 }
2800 if (slpflag == PCATCH) {
2801 slpflag = 0;
2802 slptimeo = 2 * hz;
2803 }
2804 }
2805 /*
2806 * We might have lost our iod while sleeping,
2807 * so check and loop if necessary.
2808 */
2809 if (nmp->nm_bufqiods == 0) {
2810 lck_mtx_unlock(nfs_iod_mutex);
2811 goto again;
2812 }
2813 }
2814
2815 if (ISSET(bp->nb_flags, NB_READ)) {
2816 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2817 kauth_cred_ref(cred);
2818 bp->nb_rcred = cred;
2819 }
2820 } else {
2821 SET(bp->nb_flags, NB_WRITEINPROG);
2822 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2823 kauth_cred_ref(cred);
2824 bp->nb_wcred = cred;
2825 }
2826 }
2827
2828 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2829 nmp->nm_bufqlen++;
2830 lck_mtx_unlock(nfs_iod_mutex);
2831 if (wakeme)
2832 wakeup(wakeme);
2833 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2834 return (0);
2835 }
2836
2837 out:
2838 lck_mtx_unlock(nfs_iod_mutex);
2839 /*
2840 * All the iods are busy on other mounts, so return EIO to
2841 * force the caller to process the i/o synchronously.
2842 */
2843 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2844 return (EIO);
2845 }
2846
2847 /*
2848 * Do an I/O operation to/from a cache block. This may be called
2849 * synchronously or from an nfsiod.
2850 */
2851 int
2852 nfs_doio(struct nfsbuf *bp, kauth_cred_t cr, proc_t p)
2853 {
2854 struct uio *uiop;
2855 vnode_t vp;
2856 struct nfsnode *np;
2857 struct nfsmount *nmp;
2858 int error = 0, diff, len, iomode, must_commit = 0, invalidate = 0;
2859 struct uio uio;
2860 struct iovec_32 io;
2861 enum vtype vtype;
2862
2863 vp = bp->nb_vp;
2864 vtype = vnode_vtype(vp);
2865 np = VTONFS(vp);
2866 nmp = VFSTONFS(vnode_mount(vp));
2867 uiop = &uio;
2868 uiop->uio_iovs.iov32p = &io;
2869 uiop->uio_iovcnt = 1;
2870 #if 1 /* LP64todo - can't use new segment flags until the drivers are ready */
2871 uiop->uio_segflg = UIO_SYSSPACE;
2872 #else
2873 uiop->uio_segflg = UIO_SYSSPACE32;
2874 #endif
2875
2876 /*
2877 * we've decided to perform I/O for this block,
2878 * so it can't possibly be NB_DONE. So, clear it.
2879 */
2880 if (ISSET(bp->nb_flags, NB_DONE)) {
2881 if (!ISSET(bp->nb_flags, NB_ASYNC))
2882 panic("nfs_doio: done and not async");
2883 CLR(bp->nb_flags, NB_DONE);
2884 }
2885 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2886 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2887 bp->nb_dirtyend);
2888
2889 if (ISSET(bp->nb_flags, NB_READ)) {
2890 if (vtype == VREG)
2891 NFS_BUF_MAP(bp);
2892 io.iov_len = bp->nb_bufsize;
2893 uio_uio_resid_set(uiop, io.iov_len);
2894 io.iov_base = (uintptr_t) bp->nb_data;
2895 uiop->uio_rw = UIO_READ;
2896 switch (vtype) {
2897 case VREG:
2898 uiop->uio_offset = NBOFF(bp);
2899 OSAddAtomic(1, (SInt32*)&nfsstats.read_bios);
2900 error = nfs_readrpc(vp, uiop, cr, p);
2901 FSDBG(262, np->n_size, NBOFF(bp), uio_uio_resid(uiop), error);
2902 if (!error) {
2903 /* update valid range */
2904 bp->nb_validoff = 0;
2905 if (uio_uio_resid(uiop) != 0) {
2906 /*
2907 * If len > 0, there is a hole in the file and
2908 * no writes after the hole have been pushed to
2909 * the server yet.
2910 * Just zero fill the rest of the valid area.
2911 */
2912 // LP64todo - fix this
2913 diff = bp->nb_bufsize - uio_uio_resid(uiop);
2914 len = np->n_size - (NBOFF(bp) + diff);
2915 if (len > 0) {
2916 // LP64todo - fix this
2917 len = min(len, uio_uio_resid(uiop));
2918 bzero((char *)bp->nb_data + diff, len);
2919 bp->nb_validend = diff + len;
2920 FSDBG(258, diff, len, 0, 1);
2921 } else
2922 bp->nb_validend = diff;
2923 } else
2924 bp->nb_validend = bp->nb_bufsize;
2925 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
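/* e.g. with 4KB pages, nb_validend = 6000 rounds up to 2 pages, so nb_valid = 0x3 */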
2926 if (bp->nb_validend & PAGE_MASK) {
2927 /* valid range ends in the middle of a page so we */
2928 /* need to zero-fill any invalid data at the end */
2929 /* of the last page */
2930 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2931 bp->nb_bufsize - bp->nb_validend);
2932 FSDBG(258, bp->nb_validend,
2933 bp->nb_bufsize - bp->nb_validend, 0, 2);
2934 }
2935 }
2936 break;
2937 case VLNK:
2938 uiop->uio_offset = (off_t)0;
2939 OSAddAtomic(1, (SInt32*)&nfsstats.readlink_bios);
2940 error = nfs_readlinkrpc(vp, uiop, cr, p);
2941 if (!error) {
2942 bp->nb_validoff = 0;
2943 bp->nb_validend = uiop->uio_offset;
2944 }
2945 break;
2946 case VDIR:
2947 OSAddAtomic(1, (SInt32*)&nfsstats.readdir_bios);
2948 uiop->uio_offset = NBOFF(bp);
2949 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2950 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2951 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2952 error = nfs_readdirplusrpc(vp, uiop, cr, p);
2953 if (error == NFSERR_NOTSUPP)
2954 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2955 }
2956 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2957 error = nfs_readdirrpc(vp, uiop, cr, p);
2958 if (!error) {
2959 bp->nb_validoff = 0;
2960 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2961 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2962 }
2963 break;
2964 default:
2965 printf("nfs_doio: type %x unexpected\n", vtype);
2966 break;
2967 }
2968 if (error) {
2969 SET(bp->nb_flags, NB_ERROR);
2970 bp->nb_error = error;
2971 }
2972
2973 } else {
2974 /* we're doing a write */
2975 int doff, dend = 0;
2976
2977 /* We need to make sure the pages are locked before doing I/O. */
2978 if (!ISSET(bp->nb_flags, NB_META) && UBCINFOEXISTS(vp)) {
2979 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2980 error = nfs_buf_upl_setup(bp);
2981 if (error) {
2982 printf("nfs_doio: upl create failed %d\n", error);
2983 SET(bp->nb_flags, NB_ERROR);
2984 bp->nb_error = EIO;
2985 return (EIO);
2986 }
2987 nfs_buf_upl_check(bp);
2988 }
2989 }
2990
2991 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2992 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2993 /*
2994 * There are pages marked dirty that need to be written out.
2995 *
2996 * We don't want to just combine the write range with the
2997 * range of pages that are dirty because that could cause us
2998 * to write data that wasn't actually written to.
2999 * We also don't want to write data more than once.
3000 *
3001 * If the dirty range just needs to be committed, we do that.
3002 * Otherwise, we write the dirty range and clear the dirty bits
3003 * for any COMPLETE pages covered by that range.
3004 * If there are dirty pages left after that, we write out the
3005 * parts that we haven't written yet.
3006 */
3007 }
3008
3009 /*
3010 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not,
3011 * an actual write will have to be done.
3012 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
3013 */
3014 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
3015 doff = NBOFF(bp) + bp->nb_dirtyoff;
3016 SET(bp->nb_flags, NB_WRITEINPROG);
3017 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
3018 bp->nb_wcred, bp->nb_proc);
3019 CLR(bp->nb_flags, NB_WRITEINPROG);
3020 if (!error) {
3021 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3022 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3023 np->n_needcommitcnt--;
3024 CHECK_NEEDCOMMITCNT(np);
3025 } else if (error == NFSERR_STALEWRITEVERF)
3026 nfs_clearcommit(vnode_mount(vp));
3027 }
3028
3029 if (!error && bp->nb_dirtyend > 0) {
3030 /* there's a dirty range that needs to be written out */
3031 u_int32_t pagemask;
3032 int firstpg, lastpg;
3033
3034 if (NBOFF(bp) + bp->nb_dirtyend > (off_t)np->n_size)
3035 bp->nb_dirtyend = np->n_size - NBOFF(bp);
3036
3037 NFS_BUF_MAP(bp);
3038
3039 doff = bp->nb_dirtyoff;
3040 dend = bp->nb_dirtyend;
3041
3042 /* if doff page is dirty, move doff to start of page */
3043 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
3044 doff -= doff & PAGE_MASK;
3045 /* try to expand write range to include preceding dirty pages */
3046 if (!(doff & PAGE_MASK))
3047 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
3048 doff -= PAGE_SIZE;
3049 /* if dend page is dirty, move dend to start of next page */
3050 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
3051 dend = round_page_32(dend);
3052 /* try to expand write range to include trailing dirty pages */
3053 if (!(dend & PAGE_MASK))
3054 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
3055 dend += PAGE_SIZE;
3056 /* make sure to keep dend clipped to EOF */
3057 if (NBOFF(bp) + dend > (off_t)np->n_size)
3058 dend = np->n_size - NBOFF(bp);
3059 /* calculate range of complete pages being written */
3060 firstpg = round_page_32(doff) / PAGE_SIZE;
3061 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
3062 /* calculate mask for that page range */
3063 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
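/* e.g. with 4KB pages, doff = 100 and dend = 8192 give firstpg 1, lastpg 1,
 * pagemask 0x2: only page 1 is completely covered by this write */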
3064
3065 /* compare page mask to nb_dirty; if there are other dirty pages */
3066 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
3067 /* not needcommit/nocache/stable; otherwise write FILESYNC */
3068 if (bp->nb_dirty & ~pagemask)
3069 iomode = NFSV3WRITE_FILESYNC;
3070 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
3071 iomode = NFSV3WRITE_UNSTABLE;
3072 else
3073 iomode = NFSV3WRITE_FILESYNC;
3074
3075 /* write the dirty range */
3076 io.iov_len = dend - doff;
3077 uio_uio_resid_set(uiop, io.iov_len);
3078 uiop->uio_offset = NBOFF(bp) + doff;
3079 io.iov_base = (uintptr_t) bp->nb_data + doff;
3080 uiop->uio_rw = UIO_WRITE;
3081
3082 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3083
3084 SET(bp->nb_flags, NB_WRITEINPROG);
3085 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3086 if (must_commit)
3087 nfs_clearcommit(vnode_mount(vp));
3088 /* clear dirty bits for pages we've written */
3089 if (!error)
3090 bp->nb_dirty &= ~pagemask;
3091 /* set/clear needcommit flag */
3092 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
3093 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
3094 np->n_needcommitcnt++;
3095 SET(bp->nb_flags, NB_NEEDCOMMIT);
3096 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
3097 bp->nb_dirtyoff = doff;
3098 bp->nb_dirtyend = dend;
3099 } else {
3100 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3101 np->n_needcommitcnt--;
3102 CHECK_NEEDCOMMITCNT(np);
3103 }
3104 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3105 }
3106 CLR(bp->nb_flags, NB_WRITEINPROG);
3107 /*
3108 * For an interrupted write, the buffer is still valid and the write
3109 * hasn't been pushed to the server yet, so we can't set NB_ERROR and
3110 * report the interruption by setting NB_EINTR. For the NB_ASYNC case,
3111 * NB_EINTR is not relevant.
3112 *
3113 * For the case of a V3 write rpc not being committed to stable
3114 * storage, the block is still dirty and requires either a commit rpc
3115 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
3116 * block is reused. This is indicated by setting the NB_DELWRI and
3117 * NB_NEEDCOMMIT flags.
3118 */
3119 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
3120 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
3121 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
3122 SET(bp->nb_flags, NB_DELWRI);
3123 OSAddAtomic(1, (SInt32*)&nfs_nbdwrite);
3124 NFSBUFCNTCHK(0);
3125 }
3126 FSDBG(261, bp->nb_validoff, bp->nb_validend,
3127 bp->nb_bufsize, 0);
3128 /*
3129 * Since for the NB_ASYNC case, nfs_bwrite() has
3130 * reassigned the buffer to the clean list, we have to
3131 * reassign it back to the dirty one. Ugh.
3132 */
3133 if (ISSET(bp->nb_flags, NB_ASYNC)) {
3134 /* move to dirty list */
3135 lck_mtx_lock(nfs_buf_mutex);
3136 if (bp->nb_vnbufs.le_next != NFSNOLIST)
3137 LIST_REMOVE(bp, nb_vnbufs);
3138 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
3139 lck_mtx_unlock(nfs_buf_mutex);
3140 } else {
3141 SET(bp->nb_flags, NB_EINTR);
3142 }
3143 } else {
3144 /* either there's an error or we don't need to commit */
3145 if (error) {
3146 SET(bp->nb_flags, NB_ERROR);
3147 bp->nb_error = np->n_error = error;
3148 np->n_flag |= NWRITEERR;
3149 /*
3150 * There was a write error and we need to
3151 * invalidate attrs and flush buffers in
3152 * order to sync up with the server.
3153 * (if this write was extending the file,
3154 * we may no longer know the correct size)
3155 *
3156 * But we can't call vinvalbuf while holding
3157 * this buffer busy. Set a flag to do it after
3158 * releasing the buffer.
3159 *
3160 * Note we can only invalidate in this function
3161 * if this is an async write and so the iodone
3162 * below will release the buffer. Also, we
3163 * shouldn't call vinvalbuf from nfsiod because
3164 * that may deadlock waiting for the completion
3165 * of writes that are queued up behind this one.
3166 */
3167 if (ISSET(bp->nb_flags, NB_ASYNC) &&
3168 !ISSET(bp->nb_flags, NB_IOD)) {
3169 invalidate = 1;
3170 } else {
3171 /* invalidate later */
3172 np->n_flag |= NNEEDINVALIDATE;
3173 }
3174 NATTRINVALIDATE(np);
3175 }
3176 /* clear the dirty range */
3177 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
3178 }
3179 }
3180
3181 if (!error && bp->nb_dirty) {
3182 /* there are pages marked dirty that need to be written out */
3183 int pg, count, npages, off;
3184
3185 OSAddAtomic(1, (SInt32*)&nfsstats.write_bios);
3186
3187 NFS_BUF_MAP(bp);
3188
3189 /*
3190 * we do these writes synchronously because we can't really
3191 * support the unstable/needcommit method. We could write
3192 * them unstable, clear the dirty bits, and then commit the
3193 * whole block later, but if we need to rewrite the data, we
3194 * won't have any idea which pages were written because that
3195 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
3196 * also can't leave the dirty bits set because then we wouldn't
3197 * be able to tell if the pages were re-dirtied between the end
3198 * of the write and the commit.
3199 */
3200 iomode = NFSV3WRITE_FILESYNC;
3201 uiop->uio_rw = UIO_WRITE;
3202
3203 SET(bp->nb_flags, NB_WRITEINPROG);
3204 npages = bp->nb_bufsize/PAGE_SIZE;
3205 for (pg=0; pg < npages; pg++) {
3206 if (!NBPGDIRTY(bp,pg))
3207 continue;
3208 count = 1;
3209 while (((pg+count) < npages) && NBPGDIRTY(bp,pg+count))
3210 count++;
3211 /* write count pages starting with page pg */
3212 off = pg * PAGE_SIZE;
3213 len = count * PAGE_SIZE;
3214
3215 /* clip writes to EOF */
3216 if (NBOFF(bp) + off + len > (off_t)np->n_size)
3217 len -= (NBOFF(bp) + off + len) - np->n_size;
3218 if (len > 0) {
3219 io.iov_len = len;
3220 uio_uio_resid_set(uiop, io.iov_len);
3221 uiop->uio_offset = NBOFF(bp) + off;
3222 io.iov_base = (uintptr_t) bp->nb_data + off;
3223 error = nfs_writerpc(vp, uiop, cr, p, &iomode, &must_commit);
3224 if (must_commit)
3225 nfs_clearcommit(vnode_mount(vp));
3226 if (error)
3227 break;
3228 }
3229 /* clear dirty bits */
3230 while (count--) {
3231 bp->nb_dirty &= ~(1 << pg);
3232 /* leave pg on last page */
3233 if (count) pg++;
3234 }
3235 }
3236 if (!error) {
3237 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
3238 np->n_needcommitcnt--;
3239 CHECK_NEEDCOMMITCNT(np);
3240 }
3241 CLR(bp->nb_flags, NB_NEEDCOMMIT);
3242 }
3243 CLR(bp->nb_flags, NB_WRITEINPROG);
3244 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
3245 np->n_size);
3246 }
3247
3248 if (error) {
3249 SET(bp->nb_flags, NB_ERROR);
3250 bp->nb_error = error;
3251 }
3252 }
3253
3254 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
3255
3256 nfs_buf_iodone(bp);
3257
3258 if (invalidate) {
3259 /*
3260 * There was a write error and we need to
3261 * invalidate attrs and flush buffers in
3262 * order to sync up with the server.
3263 * (if this write was extending the file,
3264 * we may no longer know the correct size)
3265 *
3266 * But we couldn't call vinvalbuf while holding
3267 * the buffer busy. So we call vinvalbuf() after
3268 * releasing the buffer.
3269 *
3270 * Note: we don't bother calling nfs_vinvalbuf() if
3271 * there's already a flush in progress.
3272 */
3273 if (!(np->n_flag & NFLUSHINPROG))
3274 nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, cr, p, 1);
3275 }
3276
3277 return (error);
3278 }