apple/xnu: bsd/nfs/nfs_bio.c (blob 8478a7da4125861346f7988dafc6020f211cae2d)
1 /*
2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
60 */
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/resourcevar.h>
64 #include <sys/signalvar.h>
65 #include <sys/proc.h>
66 #include <sys/malloc.h>
67 #include <sys/vnode.h>
68 #include <sys/dirent.h>
69 #include <sys/mount.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72 #include <sys/ubc.h>
73
74 #include <sys/vm.h>
75 #include <sys/vmparam.h>
76
77 #include <sys/time.h>
78 #include <kern/clock.h>
79
80 #include <nfs/rpcv2.h>
81 #include <nfs/nfsproto.h>
82 #include <nfs/nfs.h>
83 #include <nfs/nfsmount.h>
84 #include <nfs/nqnfs.h>
85 #include <nfs/nfsnode.h>
86
87 #include <sys/kdebug.h>
88
89 #define FSDBG(A, B, C, D, E) \
90 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
91 (int)(B), (int)(C), (int)(D), (int)(E), 0)
92 #define FSDBG_TOP(A, B, C, D, E) \
93 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
94 (int)(B), (int)(C), (int)(D), (int)(E), 0)
95 #define FSDBG_BOT(A, B, C, D, E) \
96 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
97 (int)(B), (int)(C), (int)(D), (int)(E), 0)
98
99 extern int nfs_numasync;
100 extern int nfs_ioddelwri;
101 extern struct nfsstats nfsstats;
102
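/*
 * nfsbuf cache bookkeeping: buffers are hashed by (vnode, logical block)
 * via NFSBUFHASH, and idle buffers live on either the free list or the
 * delayed-write (delwri) list until they are reused or freed.
 */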
103 #define NFSBUFHASH(dvp, lbn) \
104 (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash])
105 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
106 struct nfsbuffreehead nfsbuffree, nfsbufdelwri;
107 u_long nfsbufhash;
108 int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax;
109 int nfsbuffreecnt, nfsbufdelwricnt, nfsneedbuffer;
110 int nfs_nbdwrite;
111
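/*
 * Per-vnode write throttle: nfs_buf_write_delayed() blocks once a vnode
 * has this many writes in progress, and nfs_buf_iodone() wakes the
 * throttled writers when v_numoutput drops to a third of this value.
 */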
112 #define NFSBUFWRITE_THROTTLE 9
113
114 /*
115 * Initialize nfsbuf lists
116 */
117 void
118 nfs_nbinit(void)
119 {
120 nfsbufhashlock = 0;
121 nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash);
122 TAILQ_INIT(&nfsbuffree);
123 TAILQ_INIT(&nfsbufdelwri);
124 nfsbufcnt = nfsbuffreecnt = nfsbufdelwricnt = 0;
125 nfsbufmin = 128; // XXX tune me!
126 nfsbufmax = 8192; // XXX tune me!
127 nfsneedbuffer = 0;
128 nfs_nbdwrite = 0;
129 }
130
131 /*
132 * try to free up some excess, unused nfsbufs
133 */
134 static void
135 nfs_buf_freeup(void)
136 {
137 struct nfsbuf *fbp;
138 int cnt;
139
140 #define NFS_BUF_FREEUP() \
141 do { \
142 /* only call nfs_buf_freeup() if it has work to do */ \
143 if ((nfsbuffreecnt > nfsbufcnt/4) && \
144 (nfsbufcnt-nfsbuffreecnt/8 > nfsbufmin)) \
145 nfs_buf_freeup(); \
146 } while (0)
147
148 if (nfsbuffreecnt < nfsbufcnt/4)
149 return;
150 cnt = nfsbuffreecnt/8;
151 if (nfsbufcnt-cnt < nfsbufmin)
152 return;
153
154 FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt);
155 while (cnt-- > 0) {
156 fbp = TAILQ_FIRST(&nfsbuffree);
157 if (!fbp)
158 break;
159 nfs_buf_remfree(fbp);
160 /* disassociate buffer from any vnode */
161 if (fbp->nb_vp) {
162 struct vnode *oldvp;
163 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
164 LIST_REMOVE(fbp, nb_vnbufs);
165 fbp->nb_vnbufs.le_next = NFSNOLIST;
166 }
167 oldvp = fbp->nb_vp;
168 fbp->nb_vp = NULL;
169 HOLDRELE(oldvp);
170 }
171 LIST_REMOVE(fbp, nb_hash);
172 /* nuke any creds */
173 if (fbp->nb_rcred != NOCRED)
174 crfree(fbp->nb_rcred);
175 if (fbp->nb_wcred != NOCRED)
176 crfree(fbp->nb_wcred);
177 /* if buf was NB_META, dump buffer */
178 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
179 FREE(fbp->nb_data, M_TEMP);
180 }
181 FREE(fbp, M_TEMP);
182 nfsbufcnt--;
183 }
184 FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt);
185 }
186
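/*
 * Remove an nfsbuf from whichever free list it is currently on
 * (the clean free list or the delayed-write list).
 */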
187 void
188 nfs_buf_remfree(struct nfsbuf *bp)
189 {
190 if (bp->nb_free.tqe_next == NFSNOLIST)
191 panic("nfsbuf not on free list");
192 if (ISSET(bp->nb_flags, NB_DELWRI)) {
193 nfsbufdelwricnt--;
194 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
195 } else {
196 nfsbuffreecnt--;
197 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
198 }
199 bp->nb_free.tqe_next = NFSNOLIST;
200 NFSBUFCNTCHK();
201 }
202
203 /*
204 * check for existence of nfsbuf in cache
205 */
206 struct nfsbuf *
207 nfs_buf_incore(struct vnode *vp, daddr_t blkno)
208 {
209 /* Search hash chain */
210 struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first;
211 for (; bp != NULL; bp = bp->nb_hash.le_next)
212 if (bp->nb_lblkno == blkno && bp->nb_vp == vp &&
213 !ISSET(bp->nb_flags, NB_INVAL)) {
214 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
215 return (bp);
216 }
217 return (NULL);
218 }
219
220 /*
221 * Check if it's OK to drop a page.
222 *
223 * Called by vnode_pager() on pageout request of non-dirty page.
224 * We need to make sure that it's not part of a delayed write.
225 * If it is, we can't let the VM drop it because we may need it
226 * later when/if we need to write the data (again).
227 */
228 int
229 nfs_buf_page_inval(struct vnode *vp, off_t offset)
230 {
231 struct nfsbuf *bp;
232 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
233 if (!bp)
234 return (0);
235 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
236 if (ISSET(bp->nb_flags, NB_BUSY))
237 return (EBUSY);
238 /*
239 * If there's a dirty range in the buffer, check to
240 * see if this page intersects with the dirty range.
241 * If it does, we can't let the pager drop the page.
242 */
243 if (bp->nb_dirtyend > 0) {
244 int start = offset - NBOFF(bp);
245 if (bp->nb_dirtyend <= start ||
246 bp->nb_dirtyoff >= (start + PAGE_SIZE))
247 return (0);
248 return (EBUSY);
249 }
250 return (0);
251 }
252
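/*
 * Attach a UPL (page list) covering this buffer's range of the file,
 * so the buffer's pages can later be mapped, committed, or aborted.
 */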
253 int
254 nfs_buf_upl_setup(struct nfsbuf *bp)
255 {
256 kern_return_t kret;
257 upl_t upl;
258 int s;
259
260 if (ISSET(bp->nb_flags, NB_PAGELIST))
261 return (0);
262
263 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
264 &upl, NULL, UPL_PRECIOUS);
265 if (kret == KERN_INVALID_ARGUMENT) {
266 /* vm object probably doesn't exist any more */
267 bp->nb_pagelist = NULL;
268 return (EINVAL);
269 }
270 if (kret != KERN_SUCCESS) {
271 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
272 bp->nb_pagelist = NULL;
273 return (EIO);
274 }
275
276 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
277
278 s = splbio();
279 bp->nb_pagelist = upl;
280 SET(bp->nb_flags, NB_PAGELIST);
281 splx(s);
282 return (0);
283 }
284
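/*
 * Refresh the buffer's valid/dirty page bitmaps and valid range
 * from the page state reported by its UPL.
 */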
285 void
286 nfs_buf_upl_check(struct nfsbuf *bp)
287 {
288 upl_page_info_t *pl;
289 off_t filesize, fileoffset;
290 int i, npages;
291
292 if (!ISSET(bp->nb_flags, NB_PAGELIST))
293 return;
294
295 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
296 filesize = ubc_getsize(bp->nb_vp);
297 fileoffset = NBOFF(bp);
298 if (fileoffset < filesize)
299 SET(bp->nb_flags, NB_CACHE);
300 else
301 CLR(bp->nb_flags, NB_CACHE);
302
303 pl = ubc_upl_pageinfo(bp->nb_pagelist);
304 bp->nb_valid = bp->nb_dirty = 0;
305
306 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
307 /* anything beyond the end of the file is not valid or dirty */
308 if (fileoffset >= filesize)
309 break;
310 if (!upl_valid_page(pl, i)) {
311 CLR(bp->nb_flags, NB_CACHE);
312 continue;
313 }
314 NBPGVALID_SET(bp,i);
315 if (upl_dirty_page(pl, i)) {
316 NBPGDIRTY_SET(bp, i);
317 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
318 SET(bp->nb_flags, NB_WASDIRTY);
319 }
320 }
321 fileoffset = NBOFF(bp);
322 if (ISSET(bp->nb_flags, NB_CACHE)) {
323 bp->nb_validoff = 0;
324 bp->nb_validend = bp->nb_bufsize;
325 if (fileoffset + bp->nb_validend > filesize)
326 bp->nb_validend = filesize - fileoffset;
327 } else {
328 bp->nb_validoff = bp->nb_validend = -1;
329 }
330 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
331 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
332 }
333
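/*
 * Map the buffer's UPL into kernel address space so that nb_data
 * points at the buffer's pages.
 */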
334 static int
335 nfs_buf_map(struct nfsbuf *bp)
336 {
337 kern_return_t kret;
338
339 if (bp->nb_data)
340 return (0);
341 if (!ISSET(bp->nb_flags, NB_PAGELIST))
342 return (EINVAL);
343
344 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
345 if (kret != KERN_SUCCESS)
346 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
347 if (bp->nb_data == 0)
348 panic("ubc_upl_map mapped 0");
349 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
350 return (0);
351 }
352
353 /*
354 * check range of pages in nfsbuf's UPL for validity
355 */
356 static int
357 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
358 {
359 off_t fileoffset, filesize;
360 int pg, lastpg;
361 upl_page_info_t *pl;
362
363 if (!ISSET(bp->nb_flags, NB_PAGELIST))
364 return (0);
365 pl = ubc_upl_pageinfo(bp->nb_pagelist);
366
367 size += off & PAGE_MASK;
368 off &= ~PAGE_MASK;
369 fileoffset = NBOFF(bp);
370 filesize = VTONFS(bp->nb_vp)->n_size;
371 if ((fileoffset + off + size) > filesize)
372 size = filesize - (fileoffset + off);
373
374 pg = off/PAGE_SIZE;
375 lastpg = (off + size - 1)/PAGE_SIZE;
376 while (pg <= lastpg) {
377 if (!upl_valid_page(pl, pg))
378 return (0);
379 pg++;
380 }
381 return (1);
382 }
383
384 /*
385 * normalize an nfsbuf's valid range
386 *
387 * the read/write code guarantees that we'll always have a valid
388 * region that is an integral number of pages. If either end
389 * of the valid range isn't page-aligned, it gets corrected
390 * here as we extend the valid range through all of the
391 * contiguous valid pages.
392 */
393 static void
394 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
395 {
396 int pg, npg;
397 /* pull validoff back to start of contiguous valid page range */
398 pg = bp->nb_validoff/PAGE_SIZE;
399 while (pg >= 0 && NBPGVALID(bp,pg))
400 pg--;
401 bp->nb_validoff = (pg+1) * PAGE_SIZE;
402 /* push validend forward to end of contiguous valid page range */
403 npg = bp->nb_bufsize/PAGE_SIZE;
404 pg = bp->nb_validend/PAGE_SIZE;
405 while (pg < npg && NBPGVALID(bp,pg))
406 pg++;
407 bp->nb_validend = pg * PAGE_SIZE;
408 /* clip to EOF */
409 if (NBOFF(bp) + bp->nb_validend > np->n_size)
410 bp->nb_validend = np->n_size % bp->nb_bufsize;
411 }
412
413 /*
414 * try to push out some delayed/uncommitted writes
415 */
416 static void
417 nfs_buf_delwri_push(void)
418 {
419 struct nfsbuf *bp;
420 int i;
421
422 if (TAILQ_EMPTY(&nfsbufdelwri))
423 return;
424
425 /* first try to tell the nfsiods to do it */
426 if (nfs_asyncio(NULL, NULL) == 0)
427 return;
428
429 /* otherwise, try to do some of the work ourselves */
430 i = 0;
431 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
432 struct nfsnode *np = VTONFS(bp->nb_vp);
433 nfs_buf_remfree(bp);
434 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
435 /* put buffer at end of delwri list */
436 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
437 nfsbufdelwricnt++;
438 nfs_flushcommits(np->n_vnode, (struct proc *)0);
439 } else {
440 SET(bp->nb_flags, (NB_BUSY | NB_ASYNC));
441 nfs_buf_write(bp);
442 }
443 i++;
444 }
445 }
446
447 /*
448 * Get an nfs cache block.
449 * Allocate a new one if the block isn't currently in the cache
450 * and return the block marked busy. If the calling process is
451 * interrupted by a signal for an interruptible mount point, return
452 * NULL.
453 */
454 struct nfsbuf *
455 nfs_buf_get(
456 struct vnode *vp,
457 daddr_t blkno,
458 int size,
459 struct proc *p,
460 int operation)
461 {
462 struct nfsnode *np = VTONFS(vp);
463 struct nfsbuf *bp;
464 int i, biosize, bufsize, rv;
465 struct ucred *cred;
466 int slpflag = PCATCH;
467
468 FSDBG_TOP(541, vp, blkno, size, operation);
469
470 bufsize = size;
471 if (bufsize > MAXBSIZE)
472 panic("nfs_buf_get: buffer larger than MAXBSIZE requested");
473
474 biosize = vp->v_mount->mnt_stat.f_iosize;
475
476 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp))
477 operation = BLK_META;
478 else if (bufsize < biosize)
479 /* reg files should always have biosize blocks */
480 bufsize = biosize;
481
482 /* if BLK_WRITE, check for too many delayed/uncommitted writes */
483 if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
484 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
485
486 /* poke the delwri list */
487 nfs_buf_delwri_push();
488
489 /* sleep to let other threads run... */
490 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
491 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
492 }
493
494 loop:
495 /*
496 * Obtain a lock to prevent a race condition if the
497 * MALLOC() below happens to block.
498 */
499 if (nfsbufhashlock) {
500 while (nfsbufhashlock) {
501 nfsbufhashlock = -1;
502 tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0);
503 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))
504 return (NULL);
505 }
506 goto loop;
507 }
508 nfsbufhashlock = 1;
509
510 /* check for existence of nfsbuf in cache */
511 if (bp = nfs_buf_incore(vp, blkno)) {
512 /* if busy, set wanted and wait */
513 if (ISSET(bp->nb_flags, NB_BUSY)) {
514 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
515 SET(bp->nb_flags, NB_WANTED);
516 /* unlock hash */
517 if (nfsbufhashlock < 0) {
518 nfsbufhashlock = 0;
519 wakeup(&nfsbufhashlock);
520 } else
521 nfsbufhashlock = 0;
522 tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz);
523 slpflag = 0;
524 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
525 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
526 FSDBG_BOT(541, vp, blkno, 0, EINTR);
527 return (NULL);
528 }
529 goto loop;
530 }
531 if (bp->nb_bufsize != bufsize)
532 panic("nfsbuf size mismatch");
533 SET(bp->nb_flags, (NB_BUSY | NB_CACHE));
534 nfs_buf_remfree(bp);
535 /* additional paranoia: */
536 if (ISSET(bp->nb_flags, NB_PAGELIST))
537 panic("pagelist buffer was not busy");
538 goto buffer_setup;
539 }
540
541 /*
542 * where to get a free buffer:
543 * - alloc new if we haven't reached min bufs
544 * - free list
545 * - alloc new if we haven't reached max allowed
546 * - start clearing out delwri list and try again
547 */
548
549 if ((nfsbufcnt > nfsbufmin) && !TAILQ_EMPTY(&nfsbuffree)) {
550 /* pull an nfsbuf off the free list */
551 bp = TAILQ_FIRST(&nfsbuffree);
552 FSDBG(544, vp, blkno, bp, bp->nb_flags);
553 nfs_buf_remfree(bp);
554 if (ISSET(bp->nb_flags, NB_DELWRI))
555 panic("nfs_buf_get: delwri");
556 SET(bp->nb_flags, NB_BUSY);
557 /* disassociate buffer from previous vnode */
558 if (bp->nb_vp) {
559 struct vnode *oldvp;
560 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
561 LIST_REMOVE(bp, nb_vnbufs);
562 bp->nb_vnbufs.le_next = NFSNOLIST;
563 }
564 oldvp = bp->nb_vp;
565 bp->nb_vp = NULL;
566 HOLDRELE(oldvp);
567 }
568 LIST_REMOVE(bp, nb_hash);
569 /* nuke any creds we're holding */
570 cred = bp->nb_rcred;
571 if (cred != NOCRED) {
572 bp->nb_rcred = NOCRED;
573 crfree(cred);
574 }
575 cred = bp->nb_wcred;
576 if (cred != NOCRED) {
577 bp->nb_wcred = NOCRED;
578 crfree(cred);
579 }
580 /* if buf will no longer be NB_META, dump old buffer */
581 if ((operation != BLK_META) &&
582 ISSET(bp->nb_flags, NB_META) && bp->nb_data) {
583 FREE(bp->nb_data, M_TEMP);
584 bp->nb_data = NULL;
585 }
586 /* re-init buf fields */
587 bp->nb_error = 0;
588 bp->nb_validoff = bp->nb_validend = -1;
589 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
590 bp->nb_valid = 0;
591 bp->nb_dirty = 0;
592 } else if (nfsbufcnt < nfsbufmax) {
593 /* just alloc a new one */
594 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
595 nfsbufcnt++;
596 NFSBUFCNTCHK();
597 /* init nfsbuf */
598 bzero(bp, sizeof(*bp));
599 bp->nb_free.tqe_next = NFSNOLIST;
600 bp->nb_validoff = bp->nb_validend = -1;
601 FSDBG(545, vp, blkno, bp, 0);
602 } else {
603 /* too many bufs... wait for buffers to free up */
604 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
605 /* unlock hash */
606 if (nfsbufhashlock < 0) {
607 nfsbufhashlock = 0;
608 wakeup(&nfsbufhashlock);
609 } else
610 nfsbufhashlock = 0;
611
612 /* poke the delwri list */
613 nfs_buf_delwri_push();
614
615 nfsneedbuffer = 1;
616 tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0);
617 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
618 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
619 FSDBG_BOT(541, vp, blkno, 0, EINTR);
620 return (NULL);
621 }
622 goto loop;
623 }
624
625 setup_nfsbuf:
626
627 /* setup nfsbuf */
628 bp->nb_flags = NB_BUSY;
629 bp->nb_lblkno = blkno;
630 /* insert buf in hash */
631 LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash);
632 /* associate buffer with new vnode */
633 VHOLD(vp);
634 bp->nb_vp = vp;
635 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
636
637 buffer_setup:
638
639 switch (operation) {
640 case BLK_META:
641 SET(bp->nb_flags, NB_META);
642 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
643 FREE(bp->nb_data, M_TEMP);
644 bp->nb_data = NULL;
645 bp->nb_validoff = bp->nb_validend = -1;
646 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
647 bp->nb_valid = 0;
648 bp->nb_dirty = 0;
649 CLR(bp->nb_flags, NB_CACHE);
650 }
651 if (!bp->nb_data)
652 MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK);
653 if (!bp->nb_data)
654 panic("nfs_buf_get: null nb_data");
655 bp->nb_bufsize = bufsize;
656 break;
657
658 case BLK_READ:
659 case BLK_WRITE:
660 if (bufsize < PAGE_SIZE)
661 bufsize = PAGE_SIZE;
662 bp->nb_bufsize = bufsize;
663 bp->nb_validoff = bp->nb_validend = -1;
664
665 if (UBCISVALID(vp)) {
666 /* setup upl */
667 if (nfs_buf_upl_setup(bp)) {
668 /* unable to create upl */
669 /* vm object must no longer exist */
670 /* cleanup buffer and return NULL */
671 LIST_REMOVE(bp, nb_vnbufs);
672 bp->nb_vnbufs.le_next = NFSNOLIST;
673 bp->nb_vp = NULL;
674 HOLDRELE(vp);
675 if (bp->nb_free.tqe_next != NFSNOLIST)
676 panic("nfsbuf on freelist");
677 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
678 nfsbuffreecnt++;
679 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
680 return (NULL);
681 }
682 nfs_buf_upl_check(bp);
683 }
684 break;
685
686 default:
687 panic("nfs_buf_get: %d unknown operation", operation);
688 }
689
690 /* unlock hash */
691 if (nfsbufhashlock < 0) {
692 nfsbufhashlock = 0;
693 wakeup(&nfsbufhashlock);
694 } else
695 nfsbufhashlock = 0;
696
697 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
698
699 return (bp);
700 }
701
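/*
 * Release a busy buffer: commit or abort its UPL pages as appropriate,
 * put it back on the free or delayed-write list, and wake any waiters.
 */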
702 void
703 nfs_buf_release(struct nfsbuf *bp)
704 {
705 struct vnode *vp = bp->nb_vp;
706
707 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
708 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
709 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
710
711 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
712 int upl_flags;
713 upl_t upl;
714 int i, rv;
715
716 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
717 rv = nfs_buf_upl_setup(bp);
718 if (rv)
719 printf("nfs_buf_release: upl create failed %d\n", rv);
720 else
721 nfs_buf_upl_check(bp);
722 }
723 upl = bp->nb_pagelist;
724 if (!upl)
725 goto pagelist_cleanup_done;
726 if (bp->nb_data) {
727 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
728 panic("ubc_upl_unmap failed");
729 bp->nb_data = NULL;
730 }
731 if (bp->nb_flags & (NB_ERROR | NB_INVAL)) {
732 if (bp->nb_flags & (NB_READ | NB_INVAL))
733 upl_flags = UPL_ABORT_DUMP_PAGES;
734 else
735 upl_flags = 0;
736 ubc_upl_abort(upl, upl_flags);
737 goto pagelist_cleanup_done;
738 }
739 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
740 if (!NBPGVALID(bp,i))
741 ubc_upl_abort_range(upl,
742 i*PAGE_SIZE, PAGE_SIZE,
743 UPL_ABORT_DUMP_PAGES |
744 UPL_ABORT_FREE_ON_EMPTY);
745 else {
746 if (NBPGDIRTY(bp,i))
747 upl_flags = UPL_COMMIT_SET_DIRTY;
748 else
749 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
750 ubc_upl_commit_range(upl,
751 i*PAGE_SIZE, PAGE_SIZE,
752 upl_flags |
753 UPL_COMMIT_INACTIVATE |
754 UPL_COMMIT_FREE_ON_EMPTY);
755 }
756 }
757 pagelist_cleanup_done:
758 /* was this the last buffer in the file? */
759 if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) {
760 /* if so, invalidate all pages of last buffer past EOF */
761 int biosize = vp->v_mount->mnt_stat.f_iosize;
762 off_t off, size;
763 off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
764 size = trunc_page_64(NBOFF(bp) + biosize) - off;
765 if (size)
766 ubc_invalidate(vp, off, size);
767 }
768 CLR(bp->nb_flags, NB_PAGELIST);
769 bp->nb_pagelist = NULL;
770 }
771
772 /* Wake up any processes waiting for any buffer to become free. */
773 if (nfsneedbuffer) {
774 nfsneedbuffer = 0;
775 wakeup(&nfsneedbuffer);
776 }
777 /* Wake up any processes waiting for _this_ buffer to become free. */
778 if (ISSET(bp->nb_flags, NB_WANTED)) {
779 CLR(bp->nb_flags, NB_WANTED);
780 wakeup(bp);
781 }
782
783 /* If it's not cacheable, or an error, mark it invalid. */
784 if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
785 SET(bp->nb_flags, NB_INVAL);
786
787 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
788 /* If it's invalid or empty, dissociate it from its vnode */
789 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
790 LIST_REMOVE(bp, nb_vnbufs);
791 bp->nb_vnbufs.le_next = NFSNOLIST;
792 }
793 bp->nb_vp = NULL;
794 HOLDRELE(vp);
795 /* if this was a delayed write, wakeup anyone */
796 /* waiting for delayed writes to complete */
797 if (ISSET(bp->nb_flags, NB_DELWRI)) {
798 CLR(bp->nb_flags, NB_DELWRI);
799 nfs_nbdwrite--;
800 NFSBUFCNTCHK();
801 wakeup((caddr_t)&nfs_nbdwrite);
802 }
803 /* put buffer at head of free list */
804 if (bp->nb_free.tqe_next != NFSNOLIST)
805 panic("nfsbuf on freelist");
806 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
807 nfsbuffreecnt++;
808 NFS_BUF_FREEUP();
809 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
810 /* put buffer at end of delwri list */
811 if (bp->nb_free.tqe_next != NFSNOLIST)
812 panic("nfsbuf on freelist");
813 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
814 nfsbufdelwricnt++;
815 } else {
816 /* put buffer at end of free list */
817 if (bp->nb_free.tqe_next != NFSNOLIST)
818 panic("nfsbuf on freelist");
819 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
820 nfsbuffreecnt++;
821 NFS_BUF_FREEUP();
822 }
823
824 NFSBUFCNTCHK();
825
826 /* Unlock the buffer. */
827 CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD));
828
829 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
830 }
831
832 /*
833 * Wait for operations on the buffer to complete.
834 * When they do, extract and return the I/O's error value.
835 */
836 int
837 nfs_buf_iowait(struct nfsbuf *bp)
838 {
839 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
840
841 while (!ISSET(bp->nb_flags, NB_DONE))
842 tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0);
843
844 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
845
846 /* check for interruption of I/O, then errors. */
847 if (ISSET(bp->nb_flags, NB_EINTR)) {
848 CLR(bp->nb_flags, NB_EINTR);
849 return (EINTR);
850 } else if (ISSET(bp->nb_flags, NB_ERROR))
851 return (bp->nb_error ? bp->nb_error : EIO);
852 return (0);
853 }
854
855 /*
856 * Mark I/O complete on a buffer.
857 */
858 void
859 nfs_buf_iodone(struct nfsbuf *bp)
860 {
861 struct vnode *vp;
862
863 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
864
865 if (ISSET(bp->nb_flags, NB_DONE))
866 panic("nfs_buf_iodone already");
867 SET(bp->nb_flags, NB_DONE); /* note that it's done */
868 /*
869 * I/O was done, so don't believe
870 * the DIRTY state from VM anymore
871 */
872 CLR(bp->nb_flags, NB_WASDIRTY);
873
874 if (!ISSET(bp->nb_flags, NB_READ)) {
875 CLR(bp->nb_flags, NB_WRITEINPROG);
876 vpwakeup(bp->nb_vp);
877 }
878
879 /* Wakeup the throttled write operations as needed */
880 vp = bp->nb_vp;
881 if (vp && (vp->v_flag & VTHROTTLED)
882 && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) {
883 vp->v_flag &= ~VTHROTTLED;
884 wakeup((caddr_t)&vp->v_numoutput);
885 }
886
887 if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */
888 nfs_buf_release(bp);
889 else { /* or just wakeup the buffer */
890 CLR(bp->nb_flags, NB_WANTED);
891 wakeup(bp);
892 }
893
894 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
895 }
896
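/*
 * Queue a delayed write: the buffer is marked NB_DELWRI and kept around
 * to be written later, unless there are already too many delayed writes,
 * in which case the write is pushed out asynchronously right away.
 */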
897 void
898 nfs_buf_write_delayed(struct nfsbuf *bp)
899 {
900 struct proc *p = current_proc();
901 struct vnode *vp = bp->nb_vp;
902
903 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
904 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
905
906 /*
907 * If the block hasn't been seen before:
908 * (1) Mark it as having been seen,
909 * (2) Charge for the write, and
910 * (3) Make sure it's on its vnode's correct block list.
911 */
912 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
913 SET(bp->nb_flags, NB_DELWRI);
914 if (p && p->p_stats)
915 p->p_stats->p_ru.ru_oublock++; /* XXX */
916 nfs_nbdwrite++;
917 NFSBUFCNTCHK();
918 /* move to dirty list */
919 if (bp->nb_vnbufs.le_next != NFSNOLIST)
920 LIST_REMOVE(bp, nb_vnbufs);
921 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
922 }
923
924 /*
925 * If the vnode has "too many" write operations in progress,
926 * wait for them to finish their I/O.
927 */
928 while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) {
929 vp->v_flag |= VTHROTTLED;
930 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0);
931 }
932
933 /*
934 * If we have too many delayed write buffers,
935 * more than we can "safely" handle, just fall back to
936 * doing the async write.
937 */
938 if (nfs_nbdwrite < 0)
939 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
940
941 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
942 /* issue async write */
943 SET(bp->nb_flags, NB_ASYNC);
944 nfs_buf_write(bp);
945 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
946 return;
947 }
948
949 /* Otherwise, the "write" is done, so mark and release the buffer. */
950 SET(bp->nb_flags, NB_DONE);
951 nfs_buf_release(bp);
952 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
953 return;
954 }
955
956
957 /*
958 * Vnode op for read using bio
959 * Any similarity to readip() is purely coincidental
960 */
961 int
962 nfs_bioread(vp, uio, ioflag, cred, getpages)
963 register struct vnode *vp;
964 register struct uio *uio;
965 int ioflag;
966 struct ucred *cred;
967 int getpages; // XXX unused!
968 {
969 struct nfsnode *np = VTONFS(vp);
970 int biosize, i;
971 off_t diff;
972 struct nfsbuf *bp = 0, *rabp;
973 struct vattr vattr;
974 struct proc *p;
975 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
976 daddr_t lbn, rabn, lastrabn = -1;
977 int bufsize;
978 int nra, error = 0, n = 0, on = 0;
979 int operation = (getpages? BLK_PAGEIN : BLK_READ);
980 caddr_t dp;
981 struct dirent *direntp;
982
983 FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag);
984
985 #if DIAGNOSTIC
986 if (uio->uio_rw != UIO_READ)
987 panic("nfs_read mode");
988 #endif
989 if (uio->uio_resid == 0) {
990 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
991 return (0);
992 }
993 if (uio->uio_offset < 0) {
994 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
995 return (EINVAL);
996 }
997 p = uio->uio_procp;
998 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
999 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1000 (void)nfs_fsinfo(nmp, vp, cred, p);
1001 biosize = vp->v_mount->mnt_stat.f_iosize;
1002 /*
1003 * For nfs, cache consistency can only be maintained approximately.
1004 * Although RFC1094 does not specify the criteria, the following is
1005 * believed to be compatible with the reference port.
1006 * For nqnfs, full cache consistency is maintained within the loop.
1007 * For nfs:
1008 * If the file's modify time on the server has changed since the
1009 * last read rpc or you have written to the file,
1010 * you may have lost data cache consistency with the
1011 * server, so flush all of the file's data out of the cache.
1012 * Then force a getattr rpc to ensure that you have up to date
1013 * attributes.
1014 * NB: This implies that cache data can be read when up to
1015 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1016 * current attributes this could be forced by setting n_xid to 0
1017 * before the VOP_GETATTR() call.
1018 */
1019 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1020 if (np->n_flag & NMODIFIED) {
1021 if (vp->v_type != VREG) {
1022 if (vp->v_type != VDIR)
1023 panic("nfs: bioread, not dir");
1024 nfs_invaldir(vp);
1025 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1026 if (error) {
1027 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1028 return (error);
1029 }
1030 }
1031 np->n_xid = 0;
1032 error = VOP_GETATTR(vp, &vattr, cred, p);
1033 if (error) {
1034 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1035 return (error);
1036 }
1037 np->n_mtime = vattr.va_mtime.tv_sec;
1038 } else {
1039 error = VOP_GETATTR(vp, &vattr, cred, p);
1040 if (error) {
1041 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1042 return (error);
1043 }
1044 if (np->n_mtime != vattr.va_mtime.tv_sec) {
1045 if (vp->v_type == VDIR) {
1046 nfs_invaldir(vp);
1047 /* purge name cache entries */
1048 cache_purge(vp);
1049 }
1050 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1051 if (error) {
1052 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1053 return (error);
1054 }
1055 np->n_mtime = vattr.va_mtime.tv_sec;
1056 }
1057 }
1058 }
1059 do {
1060
1061 /*
1062 * Get a valid lease. If cached data is stale, flush it.
1063 */
1064 if (nmp->nm_flag & NFSMNT_NQNFS) {
1065 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
1066 do {
1067 error = nqnfs_getlease(vp, ND_READ, cred, p);
1068 } while (error == NQNFS_EXPIRED);
1069 if (error) {
1070 FSDBG_BOT(514, vp, 0xd1e0007, 0, error);
1071 return (error);
1072 }
1073 if (np->n_lrev != np->n_brev ||
1074 (np->n_flag & NQNFSNONCACHE) ||
1075 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
1076 if (vp->v_type == VDIR)
1077 nfs_invaldir(vp);
1078 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1079 if (error) {
1080 FSDBG_BOT(514, vp, 0xd1e0008, 0, error);
1081 return (error);
1082 }
1083 np->n_brev = np->n_lrev;
1084 }
1085 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
1086 nfs_invaldir(vp);
1087 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1088 if (error) {
1089 FSDBG_BOT(514, vp, 0xd1e0009, 0, error);
1090 return (error);
1091 }
1092 }
1093 }
1094 if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) {
1095 if ((vp->v_flag & VNOCACHE_DATA) &&
1096 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1097 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1098 if (error) {
1099 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1100 return (error);
1101 }
1102 }
1103 switch (vp->v_type) {
1104 case VREG:
1105 error = nfs_readrpc(vp, uio, cred);
1106 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1107 return (error);
1108 case VLNK:
1109 error = nfs_readlinkrpc(vp, uio, cred);
1110 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1111 return (error);
1112 case VDIR:
1113 break;
1114 default:
1115 printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type);
1116 };
1117 }
1118 switch (vp->v_type) {
1119 case VREG:
1120 lbn = uio->uio_offset / biosize;
1121
1122 /*
1123 * Copy directly from any cached pages without grabbing the bufs.
1124 */
1125 if (uio->uio_segflg == UIO_USERSPACE) {
1126 int io_resid = uio->uio_resid;
1127 diff = np->n_size - uio->uio_offset;
1128 if (diff < io_resid)
1129 io_resid = diff;
1130 if (io_resid > 0) {
1131 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1132 if (error) {
1133 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1134 return (error);
1135 }
1136 }
1137 /* count any biocache reads that we just copied directly */
1138 if (lbn != uio->uio_offset / biosize) {
1139 nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn;
1140 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1141 }
1142 }
1143
1144 lbn = uio->uio_offset / biosize;
1145 on = uio->uio_offset % biosize;
1146
1147 /*
1148 * Start the read ahead(s), as required.
1149 */
1150 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1151 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1152 rabn = lbn + 1 + nra;
1153 if (rabn <= lastrabn) {
1154 /* we've already (tried to) read this block */
1155 /* no need to try it again... */
1156 continue;
1157 }
1158 lastrabn = rabn;
1159 if ((off_t)rabn * biosize >= np->n_size)
1160 break;
1161 /* check if block exists and is valid. */
1162 rabp = nfs_buf_incore(vp, rabn);
1163 if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize))
1164 continue;
1165 rabp = nfs_buf_get(vp, rabn, biosize, p, operation);
1166 if (!rabp) {
1167 FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR);
1168 return (EINTR);
1169 }
1170 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1171 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1172 if (nfs_asyncio(rabp, cred)) {
1173 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1174 rabp->nb_error = EIO;
1175 nfs_buf_release(rabp);
1176 }
1177 } else
1178 nfs_buf_release(rabp);
1179 }
1180 }
1181
1182 if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) {
1183 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa);
1184 return (0);
1185 }
1186
1187 nfsstats.biocache_reads++;
1188
1189 /*
1190 * If the block is in the cache and has the required data
1191 * in a valid region, just copy it out.
1192 * Otherwise, get the block and write back/read in,
1193 * as required.
1194 */
1195 again:
1196 bufsize = biosize;
1197 n = min((unsigned)(bufsize - on), uio->uio_resid);
1198 diff = np->n_size - uio->uio_offset;
1199 if (diff < n)
1200 n = diff;
1201
1202 bp = nfs_buf_get(vp, lbn, bufsize, p, operation);
1203 if (!bp) {
1204 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1205 return (EINTR);
1206 }
1207
1208 /* if any pages are valid... */
1209 if (bp->nb_valid) {
1210 /* ...check for any invalid pages in the read range */
1211 int pg, firstpg, lastpg, dirtypg;
1212 dirtypg = firstpg = lastpg = -1;
1213 pg = on/PAGE_SIZE;
1214 while (pg <= (on + n - 1)/PAGE_SIZE) {
1215 if (!NBPGVALID(bp,pg)) {
1216 if (firstpg < 0)
1217 firstpg = pg;
1218 lastpg = pg;
1219 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1220 dirtypg = pg;
1221 pg++;
1222 }
1223
1224 /* if there are no invalid pages, we're all set */
1225 if (firstpg < 0) {
1226 if (bp->nb_validoff < 0) {
1227 /* valid range isn't set up, so */
1228 /* set it to what we know is valid */
1229 bp->nb_validoff = trunc_page_32(on);
1230 bp->nb_validend = round_page_32(on+n);
1231 nfs_buf_normalize_valid_range(np, bp);
1232 }
1233 goto buffer_ready;
1234 }
1235
1236 /* there are invalid pages in the read range */
1237 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1238 /* there are also dirty page(s) in the range, */
1239 /* so write the buffer out and try again */
1240 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1241 SET(bp->nb_flags, NB_ASYNC);
1242 /*
1243 * NFS has embedded ucred so crhold() risks zone corruption
1244 */
1245 if (bp->nb_wcred == NOCRED)
1246 bp->nb_wcred = crdup(cred);
1247 error = nfs_buf_write(bp);
1248 if (error) {
1249 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1250 return (error);
1251 }
1252 goto again;
1253 }
1254 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1255 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1256 /* we need to read in more than half the buffer and the */
1257 /* buffer's not dirty, so just fetch the whole buffer */
1258 bp->nb_valid = 0;
1259 } else {
1260 /* read the page range in */
1261 struct iovec iov;
1262 struct uio auio;
1263 auio.uio_iov = &iov;
1264 auio.uio_iovcnt = 1;
1265 auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64;
1266 auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE;
1267 auio.uio_segflg = UIO_SYSSPACE;
1268 auio.uio_rw = UIO_READ;
1269 auio.uio_procp = p;
1270 NFS_BUF_MAP(bp);
1271 iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE;
1272 iov.iov_len = auio.uio_resid;
1273 error = nfs_readrpc(vp, &auio, cred);
1274 if (error) {
1275 nfs_buf_release(bp);
1276 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1277 return (error);
1278 }
1279 /* Make sure that the valid range is set to cover this read. */
1280 bp->nb_validoff = trunc_page_32(on);
1281 bp->nb_validend = round_page_32(on+n);
1282 nfs_buf_normalize_valid_range(np, bp);
1283 if (auio.uio_resid > 0) {
1284 /* if short read, must have hit EOF, */
1285 /* so zero the rest of the range */
1286 bzero(iov.iov_base, auio.uio_resid);
1287 }
1288 /* mark the pages (successfully read) as valid */
1289 for (pg=firstpg; pg <= lastpg; pg++)
1290 NBPGVALID_SET(bp,pg);
1291 }
1292 }
1293 /* if no pages are valid, read the whole block */
1294 if (!bp->nb_valid) {
1295 SET(bp->nb_flags, NB_READ);
1296 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1297 error = nfs_doio(bp, cred, p);
1298 if (error) {
1299 nfs_buf_release(bp);
1300 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1301 return (error);
1302 }
1303 }
1304 buffer_ready:
1305 vp->v_lastr = lbn;
1306 /* validate read range against valid range and clip */
1307 if (bp->nb_validend > 0) {
1308 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1309 if (diff < n)
1310 n = diff;
1311 }
1312 if (n > 0)
1313 NFS_BUF_MAP(bp);
1314 break;
1315 case VLNK:
1316 nfsstats.biocache_readlinks++;
1317 bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
1318 if (!bp) {
1319 FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR);
1320 return (EINTR);
1321 }
1322 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1323 SET(bp->nb_flags, NB_READ);
1324 error = nfs_doio(bp, cred, p);
1325 if (error) {
1326 SET(bp->nb_flags, NB_ERROR);
1327 nfs_buf_release(bp);
1328 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1329 return (error);
1330 }
1331 }
1332 n = min(uio->uio_resid, bp->nb_validend);
1333 on = 0;
1334 break;
1335 case VDIR:
1336 nfsstats.biocache_readdirs++;
1337 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1338 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1339 return (0);
1340 }
1341 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1342 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1343 bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation);
1344 if (!bp) {
1345 FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR);
1346 return (EINTR);
1347 }
1348 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1349 SET(bp->nb_flags, NB_READ);
1350 error = nfs_doio(bp, cred, p);
1351 if (error) {
1352 nfs_buf_release(bp);
1353 }
1354 while (error == NFSERR_BAD_COOKIE) {
1355 nfs_invaldir(vp);
1356 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1357 /*
1358 * Yuck! The directory has been modified on the
1359 * server. The only way to get the block is by
1360 * reading from the beginning to get all the
1361 * offset cookies.
1362 */
1363 for (i = 0; i <= lbn && !error; i++) {
1364 if (np->n_direofoffset
1365 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1366 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1367 return (0);
1368 }
1369 bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation);
1370 if (!bp) {
1371 FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR);
1372 return (EINTR);
1373 }
1374 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1375 SET(bp->nb_flags, NB_READ);
1376 error = nfs_doio(bp, cred, p);
1377 /*
1378 * no error + NB_INVAL == directory EOF,
1379 * use the block.
1380 */
1381 if (error == 0 && (bp->nb_flags & NB_INVAL))
1382 break;
1383 }
1384 /*
1385 * An error will throw away the block and the
1386 * for loop will break out. If no error and this
1387 * is not the block we want, we throw away the
1388 * block and go for the next one via the for loop.
1389 */
1390 if (error || i < lbn)
1391 nfs_buf_release(bp);
1392 }
1393 }
1394 /*
1395 * The above while is repeated if we hit another cookie
1396 * error. If we hit an error and it wasn't a cookie error,
1397 * we give up.
1398 */
1399 if (error) {
1400 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1401 return (error);
1402 }
1403 }
1404
1405 /*
1406 * If not eof and read aheads are enabled, start one.
1407 * (You need the current block first, so that you have the
1408 * directory offset cookie of the next block.)
1409 */
1410 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1411 (np->n_direofoffset == 0 ||
1412 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1413 !(np->n_flag & NQNFSNONCACHE) &&
1414 !nfs_buf_incore(vp, lbn + 1)) {
1415 rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p,
1416 operation);
1417 if (rabp) {
1418 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1419 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1420 if (nfs_asyncio(rabp, cred)) {
1421 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1422 rabp->nb_error = EIO;
1423 nfs_buf_release(rabp);
1424 }
1425 } else {
1426 nfs_buf_release(rabp);
1427 }
1428 }
1429 }
1430 /*
1431 * Make sure we use a signed variant of min() since
1432 * the second term may be negative.
1433 */
1434 n = lmin(uio->uio_resid, bp->nb_validend - on);
1435 /*
1436 * We keep track of the directory eof in
1437 * np->n_direofoffset and chop it off as an
1438 * extra step right here.
1439 */
1440 if (np->n_direofoffset &&
1441 n > np->n_direofoffset - uio->uio_offset)
1442 n = np->n_direofoffset - uio->uio_offset;
1443 /*
1444 * Make sure that we return an integral number of entries so
1445 * that any subsequent calls will start copying from the start
1446 * of the next entry.
1447 *
1448 * If the current value of n has the last entry cut short,
1449 * set n to copy everything up to the last entry instead.
1450 */
1451 if (n > 0) {
1452 dp = bp->nb_data + on;
1453 while (dp < (bp->nb_data + on + n)) {
1454 direntp = (struct dirent *)dp;
1455 dp += direntp->d_reclen;
1456 }
1457 if (dp > (bp->nb_data + on + n))
1458 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1459 }
1460 break;
1461 default:
1462 printf("nfs_bioread: type %x unexpected\n",vp->v_type);
1463 FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL);
1464 return (EINVAL);
1465 };
1466
1467 if (n > 0) {
1468 error = uiomove(bp->nb_data + on, (int)n, uio);
1469 }
1470 switch (vp->v_type) {
1471 case VREG:
1472 break;
1473 case VLNK:
1474 n = 0;
1475 break;
1476 case VDIR:
1477 if (np->n_flag & NQNFSNONCACHE)
1478 SET(bp->nb_flags, NB_INVAL);
1479 break;
1480 }
1481 nfs_buf_release(bp);
1482 } while (error == 0 && uio->uio_resid > 0 && n > 0);
1483 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1484 return (error);
1485 }
1486
1487
1488 /*
1489 * Vnode op for write using bio
1490 */
1491 int
1492 nfs_write(ap)
1493 struct vop_write_args /* {
1494 struct vnode *a_vp;
1495 struct uio *a_uio;
1496 int a_ioflag;
1497 struct ucred *a_cred;
1498 } */ *ap;
1499 {
1500 struct uio *uio = ap->a_uio;
1501 struct proc *p = uio->uio_procp;
1502 struct vnode *vp = ap->a_vp;
1503 struct nfsnode *np = VTONFS(vp);
1504 struct ucred *cred = ap->a_cred;
1505 int ioflag = ap->a_ioflag;
1506 struct nfsbuf *bp;
1507 struct vattr vattr;
1508 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1509 daddr_t lbn;
1510 int biosize, bufsize, writeop;
1511 int n, on, error = 0, iomode, must_commit;
1512 off_t boff, start, end;
1513 struct iovec iov;
1514 struct uio auio;
1515
1516 FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag);
1517
1518 #if DIAGNOSTIC
1519 if (uio->uio_rw != UIO_WRITE)
1520 panic("nfs_write mode");
1521 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
1522 panic("nfs_write proc");
1523 #endif
1524 if (vp->v_type != VREG)
1525 return (EIO);
1526 if (np->n_flag & NWRITEERR) {
1527 np->n_flag &= ~NWRITEERR;
1528 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error);
1529 return (np->n_error);
1530 }
1531 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1532 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1533 (void)nfs_fsinfo(nmp, vp, cred, p);
1534 if (ioflag & (IO_APPEND | IO_SYNC)) {
1535 if (np->n_flag & NMODIFIED) {
1536 np->n_xid = 0;
1537 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1538 if (error) {
1539 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
1540 return (error);
1541 }
1542 }
1543 if (ioflag & IO_APPEND) {
1544 np->n_xid = 0;
1545 error = VOP_GETATTR(vp, &vattr, cred, p);
1546 if (error) {
1547 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
1548 return (error);
1549 }
1550 uio->uio_offset = np->n_size;
1551 }
1552 }
1553 if (uio->uio_offset < 0) {
1554 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
1555 return (EINVAL);
1556 }
1557 if (uio->uio_resid == 0) {
1558 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
1559 return (0);
1560 }
1561 /*
1562 * Maybe this should be above the vnode op call, but so long as
1563 * file servers have no limits, I don't think it matters
1564 */
1565 if (p && uio->uio_offset + uio->uio_resid >
1566 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
1567 psignal(p, SIGXFSZ);
1568 FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG);
1569 return (EFBIG);
1570 }
1571
1572 biosize = vp->v_mount->mnt_stat.f_iosize;
1573
1574 do {
1575 /*
1576 * Check for a valid write lease.
1577 */
1578 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1579 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1580 do {
1581 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1582 } while (error == NQNFS_EXPIRED);
1583 if (error) {
1584 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error);
1585 return (error);
1586 }
1587 if (np->n_lrev != np->n_brev ||
1588 (np->n_flag & NQNFSNONCACHE)) {
1589 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1590 if (error) {
1591 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error);
1592 return (error);
1593 }
1594 np->n_brev = np->n_lrev;
1595 }
1596 }
1597 if (ISSET(vp->v_flag, VNOCACHE_DATA) &&
1598 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1599 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1600 if (error) {
1601 FSDBG_BOT(515, vp, 0, 0, error);
1602 return (error);
1603 }
1604 }
1605 if (((np->n_flag & NQNFSNONCACHE) ||
1606 ISSET(vp->v_flag, VNOCACHE_DATA)) &&
1607 uio->uio_iovcnt == 1) {
1608 iomode = NFSV3WRITE_FILESYNC;
1609 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
1610 if (must_commit)
1611 nfs_clearcommit(vp->v_mount);
1612 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1613 return (error);
1614 }
1615 nfsstats.biocache_writes++;
1616 lbn = uio->uio_offset / biosize;
1617 on = uio->uio_offset % biosize;
1618 n = min((unsigned)(biosize - on), uio->uio_resid);
1619 again:
1620 bufsize = biosize;
1621 /*
1622 * Get a cache block for writing. The range to be written is
1623 * (off..off+n) within the block. We ensure that the block
1624 * either has no dirty region or that the given range is
1625 * contiguous with the existing dirty region.
1626 */
1627 bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE);
1628 if (!bp) {
1629 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR);
1630 return (EINTR);
1631 }
1632 /* map the block because we know we're going to write to it */
1633 NFS_BUF_MAP(bp);
1634
1635 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1636 SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL));
1637
1638 /*
1639 * NFS has embedded ucred so crhold() risks zone corruption
1640 */
1641 if (bp->nb_wcred == NOCRED)
1642 bp->nb_wcred = crdup(cred);
1643
1644 /*
1645 * If there's already a dirty range AND dirty pages in this block we
1646 * need to send a commit AND write the dirty pages before continuing.
1647 *
1648 * If there's already a dirty range OR dirty pages in this block
1649 * and the new write range is not contiguous with the existing range,
1650 * then force the buffer to be written out now.
1651 * (We used to just extend the dirty range to cover the valid,
1652 * but unwritten, data in between also. But writing ranges
1653 * of data that weren't actually written by an application
1654 * risks overwriting some other client's data with stale data
1655 * that's just masquerading as new written data.)
1656 */
1657 if (bp->nb_dirtyend > 0) {
1658 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
1659 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
1660 /* write/commit buffer "synchronously" */
1661 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1662 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1663 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1664 error = nfs_buf_write(bp);
1665 if (error) {
1666 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1667 return (error);
1668 }
1669 goto again;
1670 }
1671 } else if (bp->nb_dirty) {
1672 int firstpg, lastpg;
1673 u_int32_t pagemask;
1674 /* calculate write range pagemask */
1675 firstpg = on/PAGE_SIZE;
1676 lastpg = (on+n-1)/PAGE_SIZE;
1677 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
1678 /* check if there are dirty pages outside the write range */
1679 if (bp->nb_dirty & ~pagemask) {
1680 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
1681 /* write/commit buffer "synchronously" */
1682 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1683 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1684 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1685 error = nfs_buf_write(bp);
1686 if (error) {
1687 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1688 return (error);
1689 }
1690 goto again;
1691 }
1692 /* if the first or last pages are already dirty */
1693 /* make sure that the dirty range encompasses those pages */
1694 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
1695 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
1696 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
1697 if (NBPGDIRTY(bp,lastpg)) {
1698 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
1699 /* clip to EOF */
1700 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1701 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1702 } else
1703 bp->nb_dirtyend = on+n;
1704 }
1705 }
1706
1707 /*
1708 * Are we extending the size of the file with this write?
1709 * If so, update file size now that we have the block.
1710 * If there was a partial buf at the old eof, validate
1711 * and zero the new bytes.
1712 */
1713 if (uio->uio_offset + n > np->n_size) {
1714 struct nfsbuf *eofbp = NULL;
1715 daddr_t eofbn = np->n_size / biosize;
1716 int eofoff = np->n_size % biosize;
1717 int neweofoff = (uio->uio_offset + n) % biosize;
1718
1719 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
1720
1721 if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn))
1722 eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE);
1723
1724 /* if we're extending within the same last block */
1725 /* and the block is flagged as being cached... */
1726 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
1727 /* ...check that all pages in buffer are valid */
1728 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
1729 u_int32_t pagemask;
1730 /* pagemask only has to extend to last page being written to */
1731 pagemask = (1 << (endpg+1)) - 1;
1732 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
1733 if ((bp->nb_valid & pagemask) != pagemask) {
1734 /* zerofill any hole */
1735 if (on > bp->nb_validend) {
1736 int i;
1737 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
1738 NBPGVALID_SET(bp, i);
1739 NFS_BUF_MAP(bp);
1740 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
1741 bzero((char *)bp->nb_data + bp->nb_validend,
1742 on - bp->nb_validend);
1743 }
1744 /* zerofill any trailing data in the last page */
1745 if (neweofoff) {
1746 NFS_BUF_MAP(bp);
1747 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
1748 bzero((char *)bp->nb_data + neweofoff,
1749 PAGE_SIZE - (neweofoff & PAGE_MASK));
1750 }
1751 }
1752 }
1753 np->n_flag |= NMODIFIED;
1754 np->n_size = uio->uio_offset + n;
1755 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
1756 if (eofbp) {
1757 /*
1758 * We may need to zero any previously invalid data
1759 * after the old EOF in the previous EOF buffer.
1760 *
1761 * For the old last page, don't zero bytes if there
1762 * are invalid bytes in that page (i.e. the page isn't
1763 * currently valid).
1764 * For pages after the old last page, zero them and
1765 * mark them as valid.
1766 */
1767 char *d;
1768 int i;
1769 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1770 SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL));
1771 NFS_BUF_MAP(eofbp);
1772 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
1773 d = eofbp->nb_data;
1774 i = eofoff/PAGE_SIZE;
1775 while (eofoff < biosize) {
1776 int poff = eofoff & PAGE_MASK;
1777 if (!poff || NBPGVALID(eofbp,i)) {
1778 bzero(d + eofoff, PAGE_SIZE - poff);
1779 NBPGVALID_SET(eofbp, i);
1780 }
1781 if (bp->nb_validend == eofoff)
1782 bp->nb_validend += PAGE_SIZE - poff;
1783 eofoff += PAGE_SIZE - poff;
1784 i++;
1785 }
1786 nfs_buf_release(eofbp);
1787 }
1788 }
1789 /*
1790 * If dirtyend exceeds file size, chop it down. This should
1791 * not occur unless there is a race.
1792 */
1793 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1794 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1795 /*
1796 * UBC doesn't handle partial pages, so we need to make sure
1797 * that any pages left in the page cache are completely valid.
1798 *
1799 * Writes that are smaller than a block are delayed if they
1800 * don't extend to the end of the block.
1801 *
1802 * If the block isn't (completely) cached, we may need to read
1803 * in some parts of pages that aren't covered by the write.
1804 * If the write offset (on) isn't page aligned, we'll need to
1805 * read the start of the first page being written to. Likewise,
1806 * if the offset of the end of the write (on+n) isn't page aligned,
1807 * we'll need to read the end of the last page being written to.
1808 *
1809 * Notes:
1810 * We don't want to read anything we're just going to write over.
1811 * We don't want to issue multiple I/Os if we don't have to
1812 * (because they're synchronous rpcs).
1813 * We don't want to read anything we already have modified in the
1814 * page cache.
1815 */
1816 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
1817 int firstpg, lastpg, dirtypg;
1818 int firstpgoff, lastpgoff;
1819 start = end = -1;
1820 firstpg = on/PAGE_SIZE;
1821 firstpgoff = on & PAGE_MASK;
1822 lastpg = (on+n-1)/PAGE_SIZE;
1823 lastpgoff = (on+n) & PAGE_MASK;
1824 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
1825 /* need to read start of first page */
1826 start = firstpg * PAGE_SIZE;
1827 end = start + firstpgoff;
1828 }
1829 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
1830 /* need to read end of last page */
1831 if (start < 0)
1832 start = (lastpg * PAGE_SIZE) + lastpgoff;
1833 end = (lastpg + 1) * PAGE_SIZE;
1834 }
1835 if (end > start) {
1836 /* need to read the data in range: start...end-1 */
1837
1838 /*
1839 * XXX: If we know any of these reads are beyond the
1840 * current EOF (what np->n_size was before we possibly
1841 * just modified it above), we could short-circuit the
1842 * reads and just zero buffer. No need to make a trip
1843 * across the network to read nothing.
1844 */
1845
1846 /* first, check for dirty pages in between */
1847 /* if there are, we'll have to do two reads because */
1848 /* we don't want to overwrite the dirty pages. */
1849 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
1850 if (NBPGDIRTY(bp,dirtypg))
1851 break;
1852
1853 /* if start is at beginning of page, try */
1854 /* to get any preceding pages as well. */
1855 if (!(start & PAGE_MASK)) {
1856 /* stop at next dirty/valid page or start of block */
1857 for (; start > 0; start-=PAGE_SIZE)
1858 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
1859 break;
1860 }
1861
1862 NFS_BUF_MAP(bp);
1863 /* setup uio for read(s) */
1864 boff = NBOFF(bp);
1865 auio.uio_iov = &iov;
1866 auio.uio_iovcnt = 1;
1867 auio.uio_segflg = UIO_SYSSPACE;
1868 auio.uio_rw = UIO_READ;
1869 auio.uio_procp = p;
1870
1871 if (dirtypg <= (end-1)/PAGE_SIZE) {
1872 /* there's a dirty page in the way, so just do two reads */
1873 /* we'll read the preceding data here */
1874 auio.uio_offset = boff + start;
1875 auio.uio_resid = iov.iov_len = on - start;
1876 iov.iov_base = bp->nb_data + start;
1877 error = nfs_readrpc(vp, &auio, cred);
1878 if (error) {
1879 bp->nb_error = error;
1880 SET(bp->nb_flags, NB_ERROR);
1881 printf("nfs_write: readrpc %d\n", error);
1882 }
1883 if (auio.uio_resid > 0) {
1884 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01);
1885 bzero(iov.iov_base, auio.uio_resid);
1886 }
1887 /* update validoff/validend if necessary */
1888 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
1889 bp->nb_validoff = start;
1890 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
1891 bp->nb_validend = on;
1892 if (np->n_size > boff + bp->nb_validend)
1893 bp->nb_validend = min(np->n_size - (boff + start), biosize);
1894 /* validate any pages before the write offset */
1895 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
1896 NBPGVALID_SET(bp, start/PAGE_SIZE);
1897 /* adjust start to read any trailing data */
1898 start = on+n;
1899 }
1900
1901 /* if end is at end of page, try to */
1902 /* get any following pages as well. */
1903 if (!(end & PAGE_MASK)) {
1904 /* stop at next valid page or end of block */
1905 for (; end < bufsize; end+=PAGE_SIZE)
1906 if (NBPGVALID(bp,end/PAGE_SIZE))
1907 break;
1908 }
1909
1910 /* now we'll read the (rest of the) data */
1911 auio.uio_offset = boff + start;
1912 auio.uio_resid = iov.iov_len = end - start;
1913 iov.iov_base = bp->nb_data + start;
1914 error = nfs_readrpc(vp, &auio, cred);
1915 if (error) {
1916 bp->nb_error = error;
1917 SET(bp->nb_flags, NB_ERROR);
1918 printf("nfs_write: readrpc %d\n", error);
1919 }
1920 if (auio.uio_resid > 0) {
1921 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02);
1922 bzero(iov.iov_base, auio.uio_resid);
1923 }
1924 /* update validoff/validend if necessary */
1925 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
1926 bp->nb_validoff = start;
1927 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
1928 bp->nb_validend = end;
1929 if (np->n_size > boff + bp->nb_validend)
1930 bp->nb_validend = min(np->n_size - (boff + start), biosize);
1931 /* validate any pages before the write offset's page */
1932 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
1933 NBPGVALID_SET(bp, start/PAGE_SIZE);
1934 /* validate any pages after the range of pages being written to */
1935 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
1936 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
1937 /* Note: pages being written to will be validated when written */
1938 }
1939 }
1940
1941 if (ISSET(bp->nb_flags, NB_ERROR)) {
1942 error = bp->nb_error;
1943 nfs_buf_release(bp);
1944 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1945 return (error);
1946 }
1947
1948 np->n_flag |= NMODIFIED;
1949
1950 /*
1951 * Check for valid write lease and get one as required.
1952 * In case nfs_buf_get() and/or nfs_buf_write() delayed us.
1953 */
1954 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1955 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1956 do {
1957 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1958 } while (error == NQNFS_EXPIRED);
1959 if (error) {
1960 nfs_buf_release(bp);
1961 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error);
1962 return (error);
1963 }
1964 if (np->n_lrev != np->n_brev ||
1965 (np->n_flag & NQNFSNONCACHE)) {
1966 nfs_buf_release(bp);
1967 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1968 if (error) {
1969 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error);
1970 return (error);
1971 }
1972 np->n_brev = np->n_lrev;
1973 goto again;
1974 }
1975 }
1976 NFS_BUF_MAP(bp);
1977 error = uiomove((char *)bp->nb_data + on, n, uio);
1978 if (error) {
1979 SET(bp->nb_flags, NB_ERROR);
1980 nfs_buf_release(bp);
1981 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1982 return (error);
1983 }
1984
1985 /* validate any pages written to */
1986 start = on & ~PAGE_MASK;
1987 for (; start < on+n; start += PAGE_SIZE) {
1988 NBPGVALID_SET(bp, start/PAGE_SIZE);
1989 /*
1990 * This may seem a little weird, but we don't actually set the
1991 * dirty bits for writes. This is because we keep the dirty range
1992 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
1993 * delayed writes, when we give the pages back to the VM we don't
1994 * want to keep them marked dirty, because when we later write the
1995 * buffer we won't be able to tell which pages were written dirty
1996 * and which pages were mmapped and dirtied.
1997 */
1998 }
1999 if (bp->nb_dirtyend > 0) {
2000 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2001 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2002 } else {
2003 bp->nb_dirtyoff = on;
2004 bp->nb_dirtyend = on + n;
2005 }
2006 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2007 bp->nb_validoff > bp->nb_dirtyend) {
2008 bp->nb_validoff = bp->nb_dirtyoff;
2009 bp->nb_validend = bp->nb_dirtyend;
2010 } else {
2011 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2012 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2013 }
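/*
 * Worked example (assumed values): with an existing dirty range of
 * [512, 1024) and a new write of n = 2000 bytes at on = 900, the code
 * above leaves nb_dirtyoff = 512 and nb_dirtyend = 2900; the valid
 * range is then reset or widened so it at least spans the dirty range.
 */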
2014 if (!ISSET(bp->nb_flags, NB_CACHE))
2015 nfs_buf_normalize_valid_range(np, bp);
2016
2017 /*
2018 * Since this block is being modified, it must be written
2019 * again and not just committed.
2020 */
2021 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2022 np->n_needcommitcnt--;
2023 CHECK_NEEDCOMMITCNT(np);
2024 }
2025 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2026
2027 if ((np->n_flag & NQNFSNONCACHE) ||
2028 (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) {
2029 bp->nb_proc = p;
2030 error = nfs_buf_write(bp);
2031 if (error) {
2032 FSDBG_BOT(515, vp, uio->uio_offset,
2033 uio->uio_resid, error);
2034 return (error);
2035 }
2036 if (np->n_flag & NQNFSNONCACHE) {
2037 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2038 if (error) {
2039 FSDBG_BOT(515, vp, uio->uio_offset,
2040 uio->uio_resid, error);
2041 return (error);
2042 }
2043 }
2044 } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
2045 bp->nb_proc = (struct proc *)0;
2046 SET(bp->nb_flags, NB_ASYNC);
2047 nfs_buf_write(bp);
2048 } else
2049 nfs_buf_write_delayed(bp);
2050
2051 if (np->n_needcommitcnt > (nbuf/16))
2052 nfs_flushcommits(vp, p);
2053
2054 } while (uio->uio_resid > 0 && n > 0);
2055
2056 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
2057 return (0);
2058 }
2059
2060 /*
2061 * Flush out and invalidate all buffers associated with a vnode.
2062 * Called with the underlying object locked.
2063 */
2064 static int
2065 nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo)
2066 register struct vnode *vp;
2067 int flags;
2068 struct ucred *cred;
2069 struct proc *p;
2070 int slpflag, slptimeo;
2071 {
2072 struct nfsbuf *bp;
2073 struct nfsbuf *nbp, *blist;
2074 int s, error = 0;
2075 struct nfsnode *np = VTONFS(vp);
2076
2077 if (flags & V_SAVE) {
2078 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
2079 return (error);
2080 if (np->n_dirtyblkhd.lh_first)
2081 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2082 vp, np->n_dirtyblkhd.lh_first);
2083 }
2084
2085 for (;;) {
2086 blist = np->n_cleanblkhd.lh_first;
2087 if (!blist)
2088 blist = np->n_dirtyblkhd.lh_first;
2089 if (!blist)
2090 break;
2091
2092 for (bp = blist; bp; bp = nbp) {
2093 nbp = bp->nb_vnbufs.le_next;
2094 s = splbio();
2095 if (ISSET(bp->nb_flags, NB_BUSY)) {
2096 SET(bp->nb_flags, NB_WANTED);
2097 FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags);
2098 error = tsleep((caddr_t)bp,
2099 slpflag | (PRIBIO + 1), "nfs_vinvalbuf",
2100 slptimeo);
2101 FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags);
2102 splx(s);
2103 if (error) {
2104 FSDBG(554, vp, bp, -1, error);
2105 return (error);
2106 }
2107 break;
2108 }
2109 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2110 nfs_buf_remfree(bp);
2111 SET(bp->nb_flags, NB_BUSY);
2112 splx(s);
2113 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) {
2114 /* XXX extra paranoia: make sure we're not */
2115 /* somehow leaving any dirty data around */
2116 int mustwrite = 0;
2117 int end = (NBOFF(bp) + bp->nb_bufsize > np->n_size) ?
2118 (np->n_size - NBOFF(bp)) : bp->nb_bufsize;
2119 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2120 error = nfs_buf_upl_setup(bp);
2121 if (error == EINVAL) {
2122 /* vm object must no longer exist */
2123 /* hopefully we don't need to do */
2124 /* anything for this buffer */
2125 } else if (error)
2126 printf("nfs_vinvalbuf: upl setup failed %d\n",
2127 error);
2128 bp->nb_valid = bp->nb_dirty = 0;
2129 }
2130 nfs_buf_upl_check(bp);
2131 /* check for any dirty data before the EOF */
2132 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2133 /* clip dirty range to EOF */
2134 if (bp->nb_dirtyend > end)
2135 bp->nb_dirtyend = end;
2136 mustwrite++;
2137 }
2138 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2139 if (bp->nb_dirty)
2140 mustwrite++;
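/*
 * Worked example (assumed values: PAGE_SIZE 4096, a 16K buffer with
 * only 5000 bytes before EOF): round_page_32(5000)/PAGE_SIZE is 2, so
 * the mask above is 0x3 and any dirty bits for pages 2 and 3 (entirely
 * past EOF) are dropped before deciding whether a write is needed.
 */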
2141 if (mustwrite) {
2142 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2143 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2144 panic("nfs_vinvalbuf: dirty buffer without upl");
2145 /* gotta write out dirty data before invalidating */
2146 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2147 /* (NB_NOCACHE indicates buffer should be discarded) */
2148 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2149 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2150 /*
2151 * NFS has embedded ucred so crhold() risks zone corruption
2152 */
2153 if (bp->nb_wcred == NOCRED)
2154 bp->nb_wcred = crdup(cred);
2155 error = nfs_buf_write(bp);
2156 /* Note: bp has been released */
2157 if (error) {
2158 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2159 np->n_error = error;
2160 np->n_flag |= NWRITEERR;
2161 error = 0;
2162 }
2163 break;
2164 }
2165 }
2166 SET(bp->nb_flags, NB_INVAL);
2167 nfs_buf_release(bp);
2168 }
2169 }
2170 if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)
2171 panic("nfs_vinvalbuf: flush failed");
2172 return (0);
2173 }
2174
2175
2176 /*
2177 * Flush and invalidate all dirty buffers. If another process is already
2178 * doing the flush, just wait for completion.
2179 */
2180 int
2181 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
2182 struct vnode *vp;
2183 int flags;
2184 struct ucred *cred;
2185 struct proc *p;
2186 int intrflg;
2187 {
2188 register struct nfsnode *np = VTONFS(vp);
2189 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2190 int error = 0, slpflag, slptimeo;
2191 int didhold = 0;
2192
2193 FSDBG_TOP(554, vp, flags, intrflg, 0);
2194
2195 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2196 intrflg = 0;
2197 if (intrflg) {
2198 slpflag = PCATCH;
2199 slptimeo = 2 * hz;
2200 } else {
2201 slpflag = 0;
2202 slptimeo = 0;
2203 }
2204 /*
2205 * First wait for any other process doing a flush to complete.
2206 */
2207 while (np->n_flag & NFLUSHINPROG) {
2208 np->n_flag |= NFLUSHWANT;
2209 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2210 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2211 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2212 if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) {
2213 FSDBG_BOT(554, vp, flags, intrflg, error);
2214 return (error);
2215 }
2216 }
2217
2218 /*
2219 * Now, flush as required.
2220 */
2221 np->n_flag |= NFLUSHINPROG;
2222 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2223 while (error) {
2224 FSDBG(554, vp, 0, 0, error);
2225 error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p);
2226 if (error) {
2227 np->n_flag &= ~NFLUSHINPROG;
2228 if (np->n_flag & NFLUSHWANT) {
2229 np->n_flag &= ~NFLUSHWANT;
2230 wakeup((caddr_t)&np->n_flag);
2231 }
2232 FSDBG_BOT(554, vp, flags, intrflg, error);
2233 return (error);
2234 }
2235 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2236 }
2237 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2238 if (np->n_flag & NFLUSHWANT) {
2239 np->n_flag &= ~NFLUSHWANT;
2240 wakeup((caddr_t)&np->n_flag);
2241 }
2242 didhold = ubc_hold(vp);
2243 if (didhold) {
2244 int rv = ubc_clean(vp, 1); /* get the pages out of vm also */
2245 if (!rv)
2246 panic("nfs_vinvalbuf(): ubc_clean failed!");
2247 ubc_rele(vp);
2248 }
2249 FSDBG_BOT(554, vp, flags, intrflg, 0);
2250 return (0);
2251 }
2252
2253 /*
2254 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2255 * This is mainly to avoid queueing async I/O requests when the nfsiods
2256 * are all hung on a dead server.
2257 */
2258 int
2259 nfs_asyncio(bp, cred)
2260 struct nfsbuf *bp;
2261 struct ucred *cred;
2262 {
2263 struct nfsmount *nmp;
2264 int i;
2265 int gotiod;
2266 int slpflag = 0;
2267 int slptimeo = 0;
2268 int error, error2;
2269
2270 if (nfs_numasync == 0)
2271 return (EIO);
2272
2273 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2274
2275 nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL);
2276 again:
2277 if (nmp && nmp->nm_flag & NFSMNT_INT)
2278 slpflag = PCATCH;
2279 gotiod = FALSE;
2280
2281 /* no nfsbuf means tell nfsiod to process delwri list */
2282 if (!bp)
2283 nfs_ioddelwri = 1;
2284
2285 /*
2286 * Find a free iod to process this request.
2287 */
2288 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2289 if (nfs_iodwant[i]) {
2290 /*
2291 * Found one, so wake it up and tell it which
2292 * mount to process.
2293 */
2294 NFS_DPF(ASYNCIO,
2295 ("nfs_asyncio: waking iod %d for mount %p\n",
2296 i, nmp));
2297 nfs_iodwant[i] = (struct proc *)0;
2298 nfs_iodmount[i] = nmp;
2299 if (nmp)
2300 nmp->nm_bufqiods++;
2301 wakeup((caddr_t)&nfs_iodwant[i]);
2302 gotiod = TRUE;
2303 break;
2304 }
2305
2306 /* if we're just poking the delwri list, we're done */
2307 if (!bp)
2308 return (0);
2309
2310 /*
2311 * If none are free, we may already have an iod working on this mount
2312 * point. If so, it will process our request.
2313 */
2314 if (!gotiod) {
2315 if (nmp->nm_bufqiods > 0) {
2316 NFS_DPF(ASYNCIO,
2317 ("nfs_asyncio: %d iods are already processing mount %p\n",
2318 nmp->nm_bufqiods, nmp));
2319 gotiod = TRUE;
2320 }
2321 }
2322
2323 /*
2324 * If we have an iod which can process the request, then queue
2325 * the buffer.
2326 */
2327 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2328 if (gotiod) {
2329 /*
2330 * Ensure that the queue never grows too large.
2331 */
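/*
 * The bound is twice the total number of nfsiod threads; e.g. with
 * nfs_numasync == 4 (an assumed value), a caller sleeps on nm_bufq
 * once 8 buffers are queued for this mount.
 */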
2332 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2333 if (ISSET(bp->nb_flags, NB_IOD)) {
2334 /* An nfsiod is attempting this async operation so */
2335 /* we must not fall asleep on the bufq because we */
2336 /* could be waiting on ourself. Just return error */
2337 /* and we'll do this operation synchronously. */
2338 goto out;
2339 }
2340 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2341 NFS_DPF(ASYNCIO,
2342 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
2343 nmp->nm_bufqwant = TRUE;
2344 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
2345 "nfsaio", slptimeo);
2346 if (error) {
2347 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2348 if (error2) {
2349 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2350 return (error2);
2351 }
2352 if (slpflag == PCATCH) {
2353 slpflag = 0;
2354 slptimeo = 2 * hz;
2355 }
2356 }
2357 /*
2358 * We might have lost our iod while sleeping,
2359 * so check and loop if necessary.
2360 */
2361 if (nmp->nm_bufqiods == 0) {
2362 NFS_DPF(ASYNCIO,
2363 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
2364 goto again;
2365 }
2366 }
2367
2368 if (ISSET(bp->nb_flags, NB_READ)) {
2369 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2370 /*
2371 * NFS has embedded ucred.
2372 * Can not crhold() here as that causes zone corruption
2373 */
2374 bp->nb_rcred = crdup(cred);
2375 }
2376 } else {
2377 SET(bp->nb_flags, NB_WRITEINPROG);
2378 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2379 /*
2380 * NFS has embedded ucred.
2381 * Can not crhold() here as that causes zone corruption
2382 */
2383 bp->nb_wcred = crdup(cred);
2384 }
2385 }
2386
2387 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2388 nmp->nm_bufqlen++;
2389 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2390 return (0);
2391 }
2392
2393 out:
2394 /*
2395 * All the iods are busy on other mounts, so return EIO to
2396 * force the caller to process the i/o synchronously.
2397 */
2398 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
2399 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2400 return (EIO);
2401 }
2402
2403 /*
2404 * Do an I/O operation to/from a cache block. This may be called
2405 * synchronously or from an nfsiod.
2406 */
2407 int
2408 nfs_doio(bp, cr, p)
2409 struct nfsbuf *bp;
2410 struct ucred *cr;
2411 struct proc *p;
2412 {
2413 register struct uio *uiop;
2414 register struct vnode *vp;
2415 struct nfsnode *np;
2416 struct nfsmount *nmp;
2417 int error = 0, diff, len, iomode, must_commit = 0;
2418 struct uio uio;
2419 struct iovec io;
2420
2421 vp = bp->nb_vp;
2422 np = VTONFS(vp);
2423 nmp = VFSTONFS(vp->v_mount);
2424 uiop = &uio;
2425 uiop->uio_iov = &io;
2426 uiop->uio_iovcnt = 1;
2427 uiop->uio_segflg = UIO_SYSSPACE;
2428 uiop->uio_procp = p;
2429
2430 /*
2431 * we've decided to perform I/O for this block,
2432 * so it can't possibly be NB_DONE. So, clear it.
2433 */
2434 if (ISSET(bp->nb_flags, NB_DONE)) {
2435 if (!ISSET(bp->nb_flags, NB_ASYNC))
2436 panic("nfs_doio: done and not async");
2437 CLR(bp->nb_flags, NB_DONE);
2438 }
2439 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2440 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2441 bp->nb_dirtyend);
2442
2443 if (ISSET(bp->nb_flags, NB_READ)) {
2444 if (vp->v_type == VREG)
2445 NFS_BUF_MAP(bp);
2446 io.iov_len = uiop->uio_resid = bp->nb_bufsize;
2447 io.iov_base = bp->nb_data;
2448 uiop->uio_rw = UIO_READ;
2449 switch (vp->v_type) {
2450 case VREG:
2451 uiop->uio_offset = NBOFF(bp);
2452 nfsstats.read_bios++;
2453 error = nfs_readrpc(vp, uiop, cr);
2454 FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error);
2455 if (!error) {
2456 /* update valid range */
2457 bp->nb_validoff = 0;
2458 if (uiop->uio_resid) {
2459 /*
2460 * If len > 0, there is a hole in the file and
2461 * no writes after the hole have been pushed to
2462 * the server yet.
2463 * Just zero fill the rest of the valid area.
2464 */
2465 diff = bp->nb_bufsize - uiop->uio_resid;
2466 len = np->n_size - (NBOFF(bp) + diff);
2467 if (len > 0) {
2468 len = min(len, uiop->uio_resid);
2469 bzero((char *)bp->nb_data + diff, len);
2470 bp->nb_validend = diff + len;
2471 FSDBG(258, diff, len, 0, 1);
2472 } else
2473 bp->nb_validend = diff;
2474 } else
2475 bp->nb_validend = bp->nb_bufsize;
2476 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2477 if (bp->nb_validend & PAGE_MASK) {
2478 /* valid range ends in the middle of a page so we */
2479 /* need to zero-fill any invalid data at the end */
2480 /* of the last page */
2481 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2482 bp->nb_bufsize - bp->nb_validend);
2483 FSDBG(258, bp->nb_validend,
2484 bp->nb_bufsize - bp->nb_validend, 0, 2);
2485 }
2486 }
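/*
 * Worked example for the short-read handling above (assumed values:
 * PAGE_SIZE 4096, an 8K buffer, a read rpc returning 6000 bytes because
 * EOF falls mid-block): diff = 6000 and len <= 0, so nb_validend = 6000,
 * nb_valid covers pages 0-1 (mask 0x3), and bytes 6000-8191 are zeroed
 * so the partially valid last page holds no stale data.
 */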
2487 if (p && (vp->v_flag & VTEXT) &&
2488 (((nmp->nm_flag & NFSMNT_NQNFS) &&
2489 NQNFS_CKINVALID(vp, np, ND_READ) &&
2490 np->n_lrev != np->n_brev) ||
2491 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
2492 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
2493 uprintf("Process killed due to text file modification\n");
2494 psignal(p, SIGKILL);
2495 p->p_flag |= P_NOSWAP;
2496 }
2497 break;
2498 case VLNK:
2499 uiop->uio_offset = (off_t)0;
2500 nfsstats.readlink_bios++;
2501 error = nfs_readlinkrpc(vp, uiop, cr);
2502 if (!error) {
2503 bp->nb_validoff = 0;
2504 bp->nb_validend = uiop->uio_offset;
2505 }
2506 break;
2507 case VDIR:
2508 nfsstats.readdir_bios++;
2509 uiop->uio_offset = NBOFF(bp);
2510 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2511 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2512 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2513 error = nfs_readdirplusrpc(vp, uiop, cr);
2514 if (error == NFSERR_NOTSUPP)
2515 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2516 }
2517 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2518 error = nfs_readdirrpc(vp, uiop, cr);
2519 if (!error) {
2520 bp->nb_validoff = 0;
2521 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2522 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2523 }
2524 break;
2525 default:
2526 printf("nfs_doio: type %x unexpected\n", vp->v_type);
2527 break;
2528 }
2529 if (error) {
2530 SET(bp->nb_flags, NB_ERROR);
2531 bp->nb_error = error;
2532 }
2533
2534 } else {
2535 /* we're doing a write */
2536 int doff, dend = 0;
2537
2538 /* We need to make sure the pages are locked before doing I/O. */
2539 if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) {
2540 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2541 error = nfs_buf_upl_setup(bp);
2542 if (error) {
2543 printf("nfs_doio: upl create failed %d\n", error);
2544 SET(bp->nb_flags, NB_ERROR);
2545 bp->nb_error = EIO;
2546 return (EIO);
2547 }
2548 nfs_buf_upl_check(bp);
2549 }
2550 }
2551
2552 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2553 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2554 /*
2555 * There are pages marked dirty that need to be written out.
2556 *
2557 * We don't want to just combine the write range with the
2558 * range of pages that are dirty because that could cause us
2559 * to write out ranges that were never actually written to.
2560 * We also don't want to write data more than once.
2561 *
2562 * If the dirty range just needs to be committed, we do that.
2563 * Otherwise, we write the dirty range and clear the dirty bits
2564 * for any COMPLETE pages covered by that range.
2565 * If there are dirty pages left after that, we write out the
2566 * parts that we haven't written yet.
2567 */
2568 }
2569
2570 /*
2571 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not,
2572 * an actual write will have to be done.
2573 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
2574 */
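/*
 * Sketch of that case (NFS v3 semantics): a block written earlier with
 * NFSV3WRITE_UNSTABLE carries NB_NEEDCOMMIT, so a single commit rpc
 * covering [nb_dirtyoff, nb_dirtyend) asks the server to flush those
 * bytes to stable storage without resending the data.  A stale write
 * verifier (NFSERR_STALEWRITEVERF) means the server lost the unstable
 * data and it must be written again.
 */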
2575 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
2576 doff = NBOFF(bp) + bp->nb_dirtyoff;
2577 SET(bp->nb_flags, NB_WRITEINPROG);
2578 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
2579 bp->nb_wcred, bp->nb_proc);
2580 CLR(bp->nb_flags, NB_WRITEINPROG);
2581 if (!error) {
2582 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2583 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2584 np->n_needcommitcnt--;
2585 CHECK_NEEDCOMMITCNT(np);
2586 } else if (error == NFSERR_STALEWRITEVERF)
2587 nfs_clearcommit(vp->v_mount);
2588 }
2589
2590 if (!error && bp->nb_dirtyend > 0) {
2591 /* there's a dirty range that needs to be written out */
2592 u_int32_t pagemask;
2593 int firstpg, lastpg;
2594
2595 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
2596 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2597
2598 NFS_BUF_MAP(bp);
2599
2600 doff = bp->nb_dirtyoff;
2601 dend = bp->nb_dirtyend;
2602
2603 /* if doff page is dirty, move doff to start of page */
2604 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
2605 doff -= doff & PAGE_MASK;
2606 /* try to expand write range to include preceding dirty pages */
2607 if (!(doff & PAGE_MASK))
2608 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
2609 doff -= PAGE_SIZE;
2610 /* if dend page is dirty, move dend to start of next page */
2611 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
2612 dend = round_page_32(dend);
2613 /* try to expand write range to include trailing dirty pages */
2614 if (!(dend & PAGE_MASK))
2615 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
2616 dend += PAGE_SIZE;
2617 /* make sure to keep dend clipped to EOF */
2618 if (NBOFF(bp) + dend > np->n_size)
2619 dend = np->n_size - NBOFF(bp);
2620 /* calculate range of complete pages being written */
2621 firstpg = round_page_32(doff) / PAGE_SIZE;
2622 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
2623 /* calculate mask for that page range */
2624 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
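/*
 * Worked example (assumed values: PAGE_SIZE 4096, doff = 1000,
 * dend = 13000): firstpg = 1, lastpg = 2, so pagemask = 0x6, i.e.
 * exactly the pages completely covered by the dirty range.  Any
 * nb_dirty bit outside that mask means more writes will follow,
 * which forces FILESYNC below.
 */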
2625
2626 /* compare page mask to nb_dirty; if there are other dirty pages */
2627 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
2628 /* not needcommit/nocache/stable; otherwise write FILESYNC */
2629 if (bp->nb_dirty & ~pagemask)
2630 iomode = NFSV3WRITE_FILESYNC;
2631 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
2632 iomode = NFSV3WRITE_UNSTABLE;
2633 else
2634 iomode = NFSV3WRITE_FILESYNC;
2635
2636 /* write the dirty range */
2637 io.iov_len = uiop->uio_resid = dend - doff;
2638 uiop->uio_offset = NBOFF(bp) + doff;
2639 io.iov_base = (char *)bp->nb_data + doff;
2640 uiop->uio_rw = UIO_WRITE;
2641
2642 nfsstats.write_bios++;
2643
2644 SET(bp->nb_flags, NB_WRITEINPROG);
2645 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2646 if (must_commit)
2647 nfs_clearcommit(vp->v_mount);
2648 /* clear dirty bits for pages we've written */
2649 if (!error)
2650 bp->nb_dirty &= ~pagemask;
2651 /* set/clear needcommit flag */
2652 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
2653 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2654 np->n_needcommitcnt++;
2655 SET(bp->nb_flags, NB_NEEDCOMMIT);
2656 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2657 bp->nb_dirtyoff = doff;
2658 bp->nb_dirtyend = dend;
2659 } else {
2660 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2661 np->n_needcommitcnt--;
2662 CHECK_NEEDCOMMITCNT(np);
2663 }
2664 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2665 }
2666 CLR(bp->nb_flags, NB_WRITEINPROG);
2667 /*
2668 * For an interrupted write, the buffer is still valid and the write
2669 * hasn't been pushed to the server yet, so we can't set NB_ERROR and
2670 * report the interruption by setting NB_EINTR. For the NB_ASYNC case,
2671 * NB_EINTR is not relevant.
2672 *
2673 * For the case of a V3 write rpc not being committed to stable
2674 * storage, the block is still dirty and requires either a commit rpc
2675 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
2676 * block is reused. This is indicated by setting the NB_DELWRI and
2677 * NB_NEEDCOMMIT flags.
2678 */
2679 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
2680 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
2681 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2682 SET(bp->nb_flags, NB_DELWRI);
2683 nfs_nbdwrite++;
2684 NFSBUFCNTCHK();
2685 }
2686 FSDBG(261, bp->nb_validoff, bp->nb_validend,
2687 bp->nb_bufsize, 0);
2688 /*
2689 * Since for the NB_ASYNC case, nfs_bwrite() has
2690 * reassigned the buffer to the clean list, we have to
2691 * reassign it back to the dirty one. Ugh.
2692 */
2693 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2694 /* move to dirty list */
2695 int s = splbio();
2696 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2697 LIST_REMOVE(bp, nb_vnbufs);
2698 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2699 splx(s);
2700 } else {
2701 SET(bp->nb_flags, NB_EINTR);
2702 }
2703 } else {
2704 /* either there's an error or we don't need to commit */
2705 if (error) {
2706 SET(bp->nb_flags, NB_ERROR);
2707 bp->nb_error = np->n_error = error;
2708 np->n_flag |= NWRITEERR;
2709 }
2710 /* clear the dirty range */
2711 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2712 }
2713 }
2714
2715 if (!error && bp->nb_dirty) {
2716 /* there are pages marked dirty that need to be written out */
2717 int pg, cnt, npages, off, len;
2718
2719 nfsstats.write_bios++;
2720
2721 NFS_BUF_MAP(bp);
2722
2723 /*
2724 * we do these writes synchronously because we can't really
2725 * support the unstable/needcommit method. We could write
2726 * them unstable, clear the dirty bits, and then commit the
2727 * whole block later, but if we need to rewrite the data, we
2728 * won't have any idea which pages were written because that
2729 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
2730 * also can't leave the dirty bits set because then we wouldn't
2731 * be able to tell if the pages were re-dirtied between the end
2732 * of the write and the commit.
2733 */
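/*
 * Worked example (assumed value: nb_dirty = 0x66, i.e. pages 1-2 and
 * 5-6 dirty): the loop below issues one FILESYNC write rpc for pages
 * 1-2 and another for pages 5-6, clipping the last run to EOF and
 * clearing each run's dirty bits as it goes.
 */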
2734 iomode = NFSV3WRITE_FILESYNC;
2735 uiop->uio_rw = UIO_WRITE;
2736
2737 SET(bp->nb_flags, NB_WRITEINPROG);
2738 npages = bp->nb_bufsize/PAGE_SIZE;
2739 for (pg=0; pg < npages; pg++) {
2740 if (!NBPGDIRTY(bp,pg))
2741 continue;
2742 cnt = 1;
2743 while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt))
2744 cnt++;
2745 /* write cnt pages starting with page pg */
2746 off = pg * PAGE_SIZE;
2747 len = cnt * PAGE_SIZE;
2748
2749 /* clip writes to EOF */
2750 if (NBOFF(bp) + off + len > np->n_size)
2751 len -= (NBOFF(bp) + off + len) - np->n_size;
2752 if (len > 0) {
2753 io.iov_len = uiop->uio_resid = len;
2754 uiop->uio_offset = NBOFF(bp) + off;
2755 io.iov_base = (char *)bp->nb_data + off;
2756 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2757 if (must_commit)
2758 nfs_clearcommit(vp->v_mount);
2759 if (error)
2760 break;
2761 }
2762 /* clear dirty bits */
2763 while (cnt--) {
2764 bp->nb_dirty &= ~(1 << pg);
2765 /* leave pg on last page */
2766 if (cnt) pg++;
2767 }
2768 }
2769 if (!error) {
2770 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2771 np->n_needcommitcnt--;
2772 CHECK_NEEDCOMMITCNT(np);
2773 }
2774 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2775 }
2776 CLR(bp->nb_flags, NB_WRITEINPROG);
2777 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
2778 np->n_size);
2779 }
2780
2781 if (error) {
2782 SET(bp->nb_flags, NB_ERROR);
2783 bp->nb_error = error;
2784 }
2785 }
2786
2787 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
2788
2789 nfs_buf_iodone(bp);
2790 return (error);
2791 }