/* apple/xnu (xnu-517.3.7): bsd/nfs/nfs_bio.c */
1c79356b 1/*
55e303ae 2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
43866e37 6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
1c79356b 7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26/*
27 * Copyright (c) 1989, 1993
28 * The Regents of the University of California. All rights reserved.
29 *
30 * This code is derived from software contributed to Berkeley by
31 * Rick Macklem at The University of Guelph.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
62 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
63 */
64#include <sys/param.h>
65#include <sys/systm.h>
66#include <sys/resourcevar.h>
67#include <sys/signalvar.h>
68#include <sys/proc.h>
55e303ae 69#include <sys/malloc.h>
1c79356b 70#include <sys/vnode.h>
55e303ae 71#include <sys/dirent.h>
72#include <sys/mount.h>
73#include <sys/kernel.h>
74#include <sys/sysctl.h>
75#include <sys/ubc.h>
76
77#include <sys/vm.h>
78#include <sys/vmparam.h>
79
80#include <sys/time.h>
81#include <kern/clock.h>
82
83#include <nfs/rpcv2.h>
84#include <nfs/nfsproto.h>
85#include <nfs/nfs.h>
86#include <nfs/nfsmount.h>
87#include <nfs/nqnfs.h>
88#include <nfs/nfsnode.h>
89
90#include <sys/kdebug.h>
91
92#define FSDBG(A, B, C, D, E) \
93 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
94 (int)(B), (int)(C), (int)(D), (int)(E), 0)
95#define FSDBG_TOP(A, B, C, D, E) \
96 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
97 (int)(B), (int)(C), (int)(D), (int)(E), 0)
98#define FSDBG_BOT(A, B, C, D, E) \
99 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
100 (int)(B), (int)(C), (int)(D), (int)(E), 0)
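/*
 * Usage sketch (illustration only, mirroring the calls in nfs_buf_get()
 * below): FSDBG_TOP and FSDBG_BOT bracket an operation in the kdebug
 * trace stream and FSDBG records intermediate points, e.g.
 *
 *	FSDBG_TOP(541, vp, blkno, size, operation);
 *	...
 *	FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
 *
 * The first argument selects the DBG_FSRW subcode; the other four are
 * free-form values logged with the event.
 */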
101
1c79356b 102extern int nfs_numasync;
55e303ae 103extern int nfs_ioddelwri;
1c79356b 104extern struct nfsstats nfsstats;
105
106#define NFSBUFHASH(dvp, lbn) \
107 (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash])
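/*
 * Illustrative note (not in the original): the hash key mixes the vnode
 * pointer (scaled by its size) with the logical block number and masks it
 * with nfsbufhash, the power-of-two-minus-one mask that hashinit() fills
 * in from nfs_nbinit(); e.g. with nfsbufhash == 0xff a lookup expands to
 *
 *	&nfsbufhashtbl[((long)(vp) / sizeof(*(vp)) + (int)(lbn)) & 0xff]
 */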
108LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
109struct nfsbuffreehead nfsbuffree, nfsbufdelwri;
110u_long nfsbufhash;
111int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax;
112int nfsbuffreecnt, nfsbufdelwricnt, nfsneedbuffer;
113int nfs_nbdwrite;
114
115#define NFSBUFWRITE_THROTTLE 9
116
117/*
118 * Initialize nfsbuf lists
119 */
120void
121nfs_nbinit(void)
122{
123 nfsbufhashlock = 0;
124 nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash);
125 TAILQ_INIT(&nfsbuffree);
126 TAILQ_INIT(&nfsbufdelwri);
127 nfsbufcnt = nfsbuffreecnt = nfsbufdelwricnt = 0;
128 nfsbufmin = 128; // XXX tune me!
129 nfsbufmax = 8192; // XXX tune me!
130 nfsneedbuffer = 0;
131 nfs_nbdwrite = 0;
132}
133
134/*
135 * try to free up some excess, unused nfsbufs
136 */
137static void
138nfs_buf_freeup(void)
139{
140 struct nfsbuf *fbp;
141 int cnt;
142
143#define NFS_BUF_FREEUP() \
144 do { \
145 /* only call nfs_buf_freeup() if it has work to do */ \
146 if ((nfsbuffreecnt > nfsbufcnt/4) && \
147 (nfsbufcnt-nfsbuffreecnt/8 > nfsbufmin)) \
148 nfs_buf_freeup(); \
149 } while (0)
150
151 if (nfsbuffreecnt < nfsbufcnt/4)
152 return;
153 cnt = nfsbuffreecnt/8;
154 if (nfsbufcnt-cnt < nfsbufmin)
155 return;
156
157 FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt);
158 while (cnt-- > 0) {
159 fbp = TAILQ_FIRST(&nfsbuffree);
160 if (!fbp)
161 break;
162 nfs_buf_remfree(fbp);
163 /* disassociate buffer from any vnode */
164 if (fbp->nb_vp) {
165 struct vnode *oldvp;
166 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
167 LIST_REMOVE(fbp, nb_vnbufs);
168 fbp->nb_vnbufs.le_next = NFSNOLIST;
169 }
170 oldvp = fbp->nb_vp;
171 fbp->nb_vp = NULL;
172 HOLDRELE(oldvp);
173 }
174 LIST_REMOVE(fbp, nb_hash);
175 /* nuke any creds */
176 if (fbp->nb_rcred != NOCRED)
177 crfree(fbp->nb_rcred);
178 if (fbp->nb_wcred != NOCRED)
179 crfree(fbp->nb_wcred);
180 /* if buf was NB_META, dump buffer */
181 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
182 FREE(fbp->nb_data, M_TEMP);
183 }
184 FREE(fbp, M_TEMP);
185 nfsbufcnt--;
186 }
187 FSDBG(320, -1, nfsbufcnt, nfsbuffreecnt, cnt);
188}
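/*
 * Worked example (illustrative figures): with nfsbufcnt == 1000,
 * nfsbuffreecnt == 400 and nfsbufmin == 128, NFS_BUF_FREEUP() fires
 * because 400 > 1000/4 and 1000 - 400/8 > 128; nfs_buf_freeup() then
 * tears down cnt = 400/8 = 50 nfsbufs from the head of the free list.
 */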
189
190void
191nfs_buf_remfree(struct nfsbuf *bp)
192{
193 if (bp->nb_free.tqe_next == NFSNOLIST)
194 panic("nfsbuf not on free list");
195 if (ISSET(bp->nb_flags, NB_DELWRI)) {
196 nfsbufdelwricnt--;
197 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
198 } else {
199 nfsbuffreecnt--;
200 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
201 }
202 bp->nb_free.tqe_next = NFSNOLIST;
203 NFSBUFCNTCHK();
204}
205
206/*
207 * check for existence of nfsbuf in cache
208 */
209struct nfsbuf *
210nfs_buf_incore(struct vnode *vp, daddr_t blkno)
211{
212 /* Search hash chain */
213 struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first;
214 for (; bp != NULL; bp = bp->nb_hash.le_next)
215 if (bp->nb_lblkno == blkno && bp->nb_vp == vp &&
216 !ISSET(bp->nb_flags, NB_INVAL)) {
217 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
218 return (bp);
219 }
220 return (NULL);
221}
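/*
 * Usage sketch (mirrors the read-ahead path in nfs_bioread() below): the
 * cache is probed before a read-ahead is issued so a block that is
 * already resident and fully valid is not requested again:
 *
 *	rabp = nfs_buf_incore(vp, rabn);
 *	if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize))
 *		continue;	-- already have it
 *	rabp = nfs_buf_get(vp, rabn, biosize, p, operation);
 */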
222
223/*
224 * Check if it's OK to drop a page.
225 *
226 * Called by vnode_pager() on pageout request of non-dirty page.
227 * We need to make sure that it's not part of a delayed write.
228 * If it is, we can't let the VM drop it because we may need it
229 * later when/if we need to write the data (again).
230 */
231int
232nfs_buf_page_inval(struct vnode *vp, off_t offset)
233{
234 struct nfsbuf *bp;
235 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
236 if (!bp)
237 return (0);
238 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
239 if (ISSET(bp->nb_flags, NB_BUSY))
240 return (EBUSY);
241 /*
242 * If there's a dirty range in the buffer, check to
243 * see if this page intersects with the dirty range.
244 * If it does, we can't let the pager drop the page.
245 */
246 if (bp->nb_dirtyend > 0) {
247 int start = offset - NBOFF(bp);
248 if (bp->nb_dirtyend <= start ||
249 bp->nb_dirtyoff >= (start + PAGE_SIZE))
250 return (0);
251 return (EBUSY);
252 }
253 return (0);
254}
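/*
 * Worked example (illustrative numbers, PAGE_SIZE == 4096): with a dirty
 * region of [512, 2048) in the buffer, the page at start == 0 intersects
 * it, so the pageout is refused with EBUSY; the page at start == 4096
 * lies entirely past nb_dirtyend and may be dropped (returns 0).
 */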
255
256int
257nfs_buf_upl_setup(struct nfsbuf *bp)
258{
259 kern_return_t kret;
260 upl_t upl;
261 int s;
262
263 if (ISSET(bp->nb_flags, NB_PAGELIST))
264 return (0);
265
266 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
267 &upl, NULL, UPL_PRECIOUS);
268 if (kret == KERN_INVALID_ARGUMENT) {
269 /* vm object probably doesn't exist any more */
270 bp->nb_pagelist = NULL;
271 return (EINVAL);
272 }
273 if (kret != KERN_SUCCESS) {
274 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
275 bp->nb_pagelist = NULL;
276 return (EIO);
277 }
278
279 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
280
281 s = splbio();
282 bp->nb_pagelist = upl;
283 SET(bp->nb_flags, NB_PAGELIST);
284 splx(s);
285 return (0);
286}
287
288void
289nfs_buf_upl_check(struct nfsbuf *bp)
290{
291 upl_page_info_t *pl;
292 off_t filesize, fileoffset;
293 int i, npages;
294
295 if (!ISSET(bp->nb_flags, NB_PAGELIST))
296 return;
297
298 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
299 filesize = ubc_getsize(bp->nb_vp);
300 fileoffset = NBOFF(bp);
301 if (fileoffset < filesize)
302 SET(bp->nb_flags, NB_CACHE);
303 else
304 CLR(bp->nb_flags, NB_CACHE);
305
306 pl = ubc_upl_pageinfo(bp->nb_pagelist);
307 bp->nb_valid = bp->nb_dirty = 0;
308
309 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
310 /* anything beyond the end of the file is not valid or dirty */
311 if (fileoffset >= filesize)
312 break;
313 if (!upl_valid_page(pl, i)) {
314 CLR(bp->nb_flags, NB_CACHE);
315 continue;
316 }
317 NBPGVALID_SET(bp,i);
318 if (upl_dirty_page(pl, i)) {
319 NBPGDIRTY_SET(bp, i);
320 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
321 SET(bp->nb_flags, NB_WASDIRTY);
322 }
323 }
324 fileoffset = NBOFF(bp);
325 if (ISSET(bp->nb_flags, NB_CACHE)) {
326 bp->nb_validoff = 0;
327 bp->nb_validend = bp->nb_bufsize;
328 if (fileoffset + bp->nb_validend > filesize)
329 bp->nb_validend = filesize - fileoffset;
330 } else {
331 bp->nb_validoff = bp->nb_validend = -1;
332 }
333 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
334 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
335}
336
337static int
338nfs_buf_map(struct nfsbuf *bp)
339{
340 kern_return_t kret;
341
342 if (bp->nb_data)
343 return (0);
344 if (!ISSET(bp->nb_flags, NB_PAGELIST))
345 return (EINVAL);
346
347 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
348 if (kret != KERN_SUCCESS)
349 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
350 if (bp->nb_data == 0)
351 panic("ubc_upl_map mapped 0");
352 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
353 return (0);
354}
355
356/*
357 * check range of pages in nfsbuf's UPL for validity
358 */
359static int
360nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
361{
362 off_t fileoffset, filesize;
363 int pg, lastpg;
364 upl_page_info_t *pl;
365
366 if (!ISSET(bp->nb_flags, NB_PAGELIST))
367 return (0);
368 pl = ubc_upl_pageinfo(bp->nb_pagelist);
369
370 size += off & PAGE_MASK;
371 off &= ~PAGE_MASK;
372 fileoffset = NBOFF(bp);
373 filesize = VTONFS(bp->nb_vp)->n_size;
374 if ((fileoffset + off + size) > filesize)
375 size = filesize - (fileoffset + off);
376
377 pg = off/PAGE_SIZE;
378 lastpg = (off + size - 1)/PAGE_SIZE;
379 while (pg <= lastpg) {
380 if (!upl_valid_page(pl, pg))
381 return (0);
382 pg++;
383 }
384 return (1);
385}
386
387/*
388 * normalize an nfsbuf's valid range
389 *
390 * the read/write code guarantees that we'll always have a valid
391 * region that is an integral number of pages. If either end
392 * of the valid range isn't page-aligned, it gets corrected
393 * here as we extend the valid range through all of the
394 * contiguous valid pages.
395 */
396static void
397nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
398{
399 int pg, npg;
400 /* pull validoff back to start of contiguous valid page range */
401 pg = bp->nb_validoff/PAGE_SIZE;
402 while (pg >= 0 && NBPGVALID(bp,pg))
403 pg--;
404 bp->nb_validoff = (pg+1) * PAGE_SIZE;
405 /* push validend forward to end of contiguous valid page range */
406 npg = bp->nb_bufsize/PAGE_SIZE;
407 pg = bp->nb_validend/PAGE_SIZE;
408 while (pg < npg && NBPGVALID(bp,pg))
409 pg++;
410 bp->nb_validend = pg * PAGE_SIZE;
411 /* clip to EOF */
412 if (NBOFF(bp) + bp->nb_validend > np->n_size)
413 bp->nb_validend = np->n_size % bp->nb_bufsize;
414}
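/*
 * Worked example (illustrative numbers, PAGE_SIZE == 4096, an 8K buffer):
 * a valid range of [512, 7000) whose two underlying pages are both marked
 * valid is widened to [0, 8192), then clipped back to EOF if this is the
 * last buffer of the file.
 */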
415
416/*
417 * try to push out some delayed/uncommitted writes
418 */
419static void
420nfs_buf_delwri_push(void)
421{
422 struct nfsbuf *bp;
423 int i;
424
425 if (TAILQ_EMPTY(&nfsbufdelwri))
426 return;
427
428 /* first try to tell the nfsiods to do it */
429 if (nfs_asyncio(NULL, NULL) == 0)
430 return;
431
432 /* otherwise, try to do some of the work ourselves */
433 i = 0;
434 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
435 struct nfsnode *np = VTONFS(bp->nb_vp);
436 nfs_buf_remfree(bp);
437 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
438 /* put buffer at end of delwri list */
439 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
440 nfsbufdelwricnt++;
441 nfs_flushcommits(np->n_vnode, (struct proc *)0);
442 } else {
443 SET(bp->nb_flags, (NB_BUSY | NB_ASYNC));
444 nfs_buf_write(bp);
445 }
446 i++;
447 }
448}
449
450/*
451 * Get an nfs cache block.
452 * Allocate a new one if the block isn't currently in the cache
453 * and return the block marked busy. If the calling process is
454 * interrupted by a signal for an interruptible mount point, return
455 * NULL.
456 */
457struct nfsbuf *
458nfs_buf_get(
459 struct vnode *vp,
460 daddr_t blkno,
461 int size,
462 struct proc *p,
463 int operation)
464{
465 struct nfsnode *np = VTONFS(vp);
466 struct nfsbuf *bp;
467 int i, biosize, bufsize, rv;
468 struct ucred *cred;
469 int slpflag = PCATCH;
470
471 FSDBG_TOP(541, vp, blkno, size, operation);
472
473 bufsize = size;
474 if (bufsize > MAXBSIZE)
475 panic("nfs_buf_get: buffer larger than MAXBSIZE requested");
476
477 biosize = vp->v_mount->mnt_stat.f_iosize;
478
479 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp))
480 operation = BLK_META;
481 else if (bufsize < biosize)
482 /* reg files should always have biosize blocks */
483 bufsize = biosize;
484
485 /* if BLK_WRITE, check for too many delayed/uncommitted writes */
486 if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
487 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
488
489 /* poke the delwri list */
490 nfs_buf_delwri_push();
491
492 /* sleep to let other threads run... */
493 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
494 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
495 }
496
497loop:
498 /*
499 * Obtain a lock to prevent a race condition if the
500 * MALLOC() below happens to block.
501 */
502 if (nfsbufhashlock) {
503 while (nfsbufhashlock) {
504 nfsbufhashlock = -1;
505 tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0);
506 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))
507 return (NULL);
508 }
509 goto loop;
510 }
511 nfsbufhashlock = 1;
512
513 /* check for existence of nfsbuf in cache */
514 if (bp = nfs_buf_incore(vp, blkno)) {
515 /* if busy, set wanted and wait */
516 if (ISSET(bp->nb_flags, NB_BUSY)) {
517 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
518 SET(bp->nb_flags, NB_WANTED);
519 /* unlock hash */
520 if (nfsbufhashlock < 0) {
521 nfsbufhashlock = 0;
522 wakeup(&nfsbufhashlock);
523 } else
524 nfsbufhashlock = 0;
525 tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz);
526 slpflag = 0;
527 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
528 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
529 FSDBG_BOT(541, vp, blkno, 0, EINTR);
530 return (NULL);
531 }
532 goto loop;
533 }
534 if (bp->nb_bufsize != bufsize)
535 panic("nfsbuf size mismatch");
536 SET(bp->nb_flags, (NB_BUSY | NB_CACHE));
537 nfs_buf_remfree(bp);
538 /* additional paranoia: */
539 if (ISSET(bp->nb_flags, NB_PAGELIST))
540 panic("pagelist buffer was not busy");
541 goto buffer_setup;
542 }
543
544 /*
545 * where to get a free buffer:
546 * - alloc new if we haven't reached min bufs
547 * - free list
548 * - alloc new if we haven't reached max allowed
549 * - start clearing out delwri list and try again
550 */
551
552 if ((nfsbufcnt > nfsbufmin) && !TAILQ_EMPTY(&nfsbuffree)) {
553 /* pull an nfsbuf off the free list */
554 bp = TAILQ_FIRST(&nfsbuffree);
555 FSDBG(544, vp, blkno, bp, bp->nb_flags);
556 nfs_buf_remfree(bp);
557 if (ISSET(bp->nb_flags, NB_DELWRI))
558 panic("nfs_buf_get: delwri");
559 SET(bp->nb_flags, NB_BUSY);
560 /* disassociate buffer from previous vnode */
561 if (bp->nb_vp) {
562 struct vnode *oldvp;
563 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
564 LIST_REMOVE(bp, nb_vnbufs);
565 bp->nb_vnbufs.le_next = NFSNOLIST;
566 }
567 oldvp = bp->nb_vp;
568 bp->nb_vp = NULL;
569 HOLDRELE(oldvp);
570 }
571 LIST_REMOVE(bp, nb_hash);
572 /* nuke any creds we're holding */
573 cred = bp->nb_rcred;
574 if (cred != NOCRED) {
575 bp->nb_rcred = NOCRED;
576 crfree(cred);
577 }
578 cred = bp->nb_wcred;
579 if (cred != NOCRED) {
580 bp->nb_wcred = NOCRED;
581 crfree(cred);
582 }
583 /* if buf will no longer be NB_META, dump old buffer */
584 if ((operation != BLK_META) &&
585 ISSET(bp->nb_flags, NB_META) && bp->nb_data) {
586 FREE(bp->nb_data, M_TEMP);
587 bp->nb_data = NULL;
588 }
589 /* re-init buf fields */
590 bp->nb_error = 0;
591 bp->nb_validoff = bp->nb_validend = -1;
592 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
593 bp->nb_valid = 0;
594 bp->nb_dirty = 0;
595 } else if (nfsbufcnt < nfsbufmax) {
596 /* just alloc a new one */
597 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
598 nfsbufcnt++;
599 NFSBUFCNTCHK();
600 /* init nfsbuf */
601 bzero(bp, sizeof(*bp));
602 bp->nb_free.tqe_next = NFSNOLIST;
603 bp->nb_validoff = bp->nb_validend = -1;
604 FSDBG(545, vp, blkno, bp, 0);
605 } else {
606 /* too many bufs... wait for buffers to free up */
607 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
608 /* unlock hash */
609 if (nfsbufhashlock < 0) {
610 nfsbufhashlock = 0;
611 wakeup(&nfsbufhashlock);
612 } else
613 nfsbufhashlock = 0;
614
615 /* poke the delwri list */
616 nfs_buf_delwri_push();
617
618 nfsneedbuffer = 1;
619 tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0);
620 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
621 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
622 FSDBG_BOT(541, vp, blkno, 0, EINTR);
623 return (NULL);
624 }
625 goto loop;
626 }
627
628setup_nfsbuf:
629
630 /* setup nfsbuf */
631 bp->nb_flags = NB_BUSY;
632 bp->nb_lblkno = blkno;
633 /* insert buf in hash */
634 LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash);
635 /* associate buffer with new vnode */
636 VHOLD(vp);
637 bp->nb_vp = vp;
638 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
639
640buffer_setup:
641
642 switch (operation) {
643 case BLK_META:
644 SET(bp->nb_flags, NB_META);
645 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
646 FREE(bp->nb_data, M_TEMP);
647 bp->nb_data = NULL;
648 bp->nb_validoff = bp->nb_validend = -1;
649 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
650 bp->nb_valid = 0;
651 bp->nb_dirty = 0;
652 CLR(bp->nb_flags, NB_CACHE);
653 }
654 if (!bp->nb_data)
655 MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK);
656 if (!bp->nb_data)
657 panic("nfs_buf_get: null nb_data");
658 bp->nb_bufsize = bufsize;
659 break;
660
661 case BLK_READ:
662 case BLK_WRITE:
663 if (bufsize < PAGE_SIZE)
664 bufsize = PAGE_SIZE;
665 bp->nb_bufsize = bufsize;
666 bp->nb_validoff = bp->nb_validend = -1;
667
668 if (UBCISVALID(vp)) {
669 /* setup upl */
670 if (nfs_buf_upl_setup(bp)) {
671 /* unable to create upl */
672 /* vm object must no longer exist */
673 /* cleanup buffer and return NULL */
674 LIST_REMOVE(bp, nb_vnbufs);
675 bp->nb_vnbufs.le_next = NFSNOLIST;
676 bp->nb_vp = NULL;
677 HOLDRELE(vp);
678 if (bp->nb_free.tqe_next != NFSNOLIST)
679 panic("nfsbuf on freelist");
680 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
681 nfsbuffreecnt++;
682 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
683 return (NULL);
684 }
685 nfs_buf_upl_check(bp);
686 }
687 break;
688
689 default:
690 panic("nfs_buf_get: %d unknown operation", operation);
691 }
692
693 /* unlock hash */
694 if (nfsbufhashlock < 0) {
695 nfsbufhashlock = 0;
696 wakeup(&nfsbufhashlock);
697 } else
698 nfsbufhashlock = 0;
699
700 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
701
702 return (bp);
703}
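/*
 * Usage sketch (mirrors the VLNK/VDIR paths in nfs_bioread() below): the
 * block comes back busy; if it wasn't already cached it is filled with
 * nfs_doio() and finally dropped with nfs_buf_release():
 *
 *	bp = nfs_buf_get(vp, lbn, biosize, p, BLK_READ);
 *	if (!bp)
 *		return (EINTR);		-- interrupted mount point
 *	if (!ISSET(bp->nb_flags, NB_CACHE)) {
 *		SET(bp->nb_flags, NB_READ);
 *		error = nfs_doio(bp, cred, p);
 *	}
 *	...
 *	nfs_buf_release(bp);
 */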
704
705void
706nfs_buf_release(struct nfsbuf *bp)
707{
708 struct vnode *vp = bp->nb_vp;
709
710 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
711 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
712 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
713
714 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
715 int upl_flags;
716 upl_t upl;
717 int i, rv;
718
719 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
720 rv = nfs_buf_upl_setup(bp);
721 if (rv)
722 printf("nfs_buf_release: upl create failed %d\n", rv);
723 else
724 nfs_buf_upl_check(bp);
725 }
726 upl = bp->nb_pagelist;
727 if (!upl)
728 goto pagelist_cleanup_done;
729 if (bp->nb_data) {
730 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
731 panic("ubc_upl_unmap failed");
732 bp->nb_data = NULL;
733 }
734 if (bp->nb_flags & (NB_ERROR | NB_INVAL)) {
735 if (bp->nb_flags & (NB_READ | NB_INVAL))
736 upl_flags = UPL_ABORT_DUMP_PAGES;
737 else
738 upl_flags = 0;
739 ubc_upl_abort(upl, upl_flags);
740 goto pagelist_cleanup_done;
741 }
742 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
743 if (!NBPGVALID(bp,i))
744 ubc_upl_abort_range(upl,
745 i*PAGE_SIZE, PAGE_SIZE,
746 UPL_ABORT_DUMP_PAGES |
747 UPL_ABORT_FREE_ON_EMPTY);
748 else {
749 if (NBPGDIRTY(bp,i))
750 upl_flags = UPL_COMMIT_SET_DIRTY;
751 else
752 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
753 ubc_upl_commit_range(upl,
754 i*PAGE_SIZE, PAGE_SIZE,
755 upl_flags |
756 UPL_COMMIT_INACTIVATE |
757 UPL_COMMIT_FREE_ON_EMPTY);
758 }
759 }
760pagelist_cleanup_done:
761 /* was this the last buffer in the file? */
762 if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) {
763 /* if so, invalidate all pages of last buffer past EOF */
764 int biosize = vp->v_mount->mnt_stat.f_iosize;
765 off_t off, size;
766 off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
767 size = trunc_page_64(NBOFF(bp) + biosize) - off;
768 if (size)
769 ubc_invalidate(vp, off, size);
770 }
771 CLR(bp->nb_flags, NB_PAGELIST);
772 bp->nb_pagelist = NULL;
773 }
774
775 /* Wake up any processes waiting for any buffer to become free. */
776 if (nfsneedbuffer) {
777 nfsneedbuffer = 0;
778 wakeup(&nfsneedbuffer);
779 }
780 /* Wake up any processes waiting for _this_ buffer to become free. */
781 if (ISSET(bp->nb_flags, NB_WANTED)) {
782 CLR(bp->nb_flags, NB_WANTED);
783 wakeup(bp);
784 }
785
786 /* If it's not cacheable, or an error, mark it invalid. */
787 if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
788 SET(bp->nb_flags, NB_INVAL);
789
790 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
791 /* If it's invalid or empty, dissociate it from its vnode */
792 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
793 LIST_REMOVE(bp, nb_vnbufs);
794 bp->nb_vnbufs.le_next = NFSNOLIST;
795 }
796 bp->nb_vp = NULL;
797 HOLDRELE(vp);
798 /* if this was a delayed write, wakeup anyone */
799 /* waiting for delayed writes to complete */
800 if (ISSET(bp->nb_flags, NB_DELWRI)) {
801 CLR(bp->nb_flags, NB_DELWRI);
802 nfs_nbdwrite--;
803 NFSBUFCNTCHK();
804 wakeup((caddr_t)&nfs_nbdwrite);
805 }
806 /* put buffer at head of free list */
807 if (bp->nb_free.tqe_next != NFSNOLIST)
808 panic("nfsbuf on freelist");
809 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
810 nfsbuffreecnt++;
811 NFS_BUF_FREEUP();
812 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
813 /* put buffer at end of delwri list */
814 if (bp->nb_free.tqe_next != NFSNOLIST)
815 panic("nfsbuf on freelist");
816 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
817 nfsbufdelwricnt++;
818 } else {
819 /* put buffer at end of free list */
820 if (bp->nb_free.tqe_next != NFSNOLIST)
821 panic("nfsbuf on freelist");
822 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
823 nfsbuffreecnt++;
824 NFS_BUF_FREEUP();
825 }
826
827 NFSBUFCNTCHK();
828
829 /* Unlock the buffer. */
830 CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD));
831
832 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
833}
834
835/*
836 * Wait for operations on the buffer to complete.
837 * When they do, extract and return the I/O's error value.
838 */
839int
840nfs_buf_iowait(struct nfsbuf *bp)
841{
842 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
843
844 while (!ISSET(bp->nb_flags, NB_DONE))
845 tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0);
846
847 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
848
849 /* check for interruption of I/O, then errors. */
850 if (ISSET(bp->nb_flags, NB_EINTR)) {
851 CLR(bp->nb_flags, NB_EINTR);
852 return (EINTR);
853 } else if (ISSET(bp->nb_flags, NB_ERROR))
854 return (bp->nb_error ? bp->nb_error : EIO);
855 return (0);
856}
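/*
 * Note (summary of the checks above, not in the original): NB_EINTR takes
 * precedence over NB_ERROR, and an NB_ERROR buffer whose nb_error is 0 is
 * still reported as EIO, so a failed I/O never returns success here.
 */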
857
858/*
859 * Mark I/O complete on a buffer.
860 */
861void
862nfs_buf_iodone(struct nfsbuf *bp)
863{
864 struct vnode *vp;
865
866 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
867
868 if (ISSET(bp->nb_flags, NB_DONE))
869 panic("nfs_buf_iodone already");
870 SET(bp->nb_flags, NB_DONE); /* note that it's done */
871 /*
872 * I/O was done, so don't believe
873 * the DIRTY state from VM anymore
874 */
875 CLR(bp->nb_flags, NB_WASDIRTY);
876
877 if (!ISSET(bp->nb_flags, NB_READ)) {
878 CLR(bp->nb_flags, NB_WRITEINPROG);
879 vpwakeup(bp->nb_vp);
880 }
881
882 /* Wakeup the throttled write operations as needed */
883 vp = bp->nb_vp;
884 if (vp && (vp->v_flag & VTHROTTLED)
885 && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) {
886 vp->v_flag &= ~VTHROTTLED;
887 wakeup((caddr_t)&vp->v_numoutput);
888 }
889
890 if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */
891 nfs_buf_release(bp);
892 else { /* or just wakeup the buffer */
893 CLR(bp->nb_flags, NB_WANTED);
894 wakeup(bp);
895 }
896
897 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
898}
899
900void
901nfs_buf_write_delayed(struct nfsbuf *bp)
902{
903 struct proc *p = current_proc();
904 struct vnode *vp = bp->nb_vp;
905
906 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
907 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
908
909 /*
910 * If the block hasn't been seen before:
911 * (1) Mark it as having been seen,
 912 * (2) Charge for the write,
 913 * (3) Make sure it's on its vnode's correct block list.
914 */
915 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
916 SET(bp->nb_flags, NB_DELWRI);
917 if (p && p->p_stats)
918 p->p_stats->p_ru.ru_oublock++; /* XXX */
919 nfs_nbdwrite++;
920 NFSBUFCNTCHK();
921 /* move to dirty list */
922 if (bp->nb_vnbufs.le_next != NFSNOLIST)
923 LIST_REMOVE(bp, nb_vnbufs);
924 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
925 }
926
927 /*
928 * If the vnode has "too many" write operations in progress
929 * wait for them to finish the IO
930 */
931 while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) {
932 vp->v_flag |= VTHROTTLED;
933 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0);
934 }
935
936 /*
937 * If we have too many delayed write buffers,
938 * more than we can "safely" handle, just fall back to
939 * doing the async write
940 */
941 if (nfs_nbdwrite < 0)
942 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
943
944 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
945 /* issue async write */
946 SET(bp->nb_flags, NB_ASYNC);
947 nfs_buf_write(bp);
948 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
949 return;
950 }
951
952 /* Otherwise, the "write" is done, so mark and release the buffer. */
953 SET(bp->nb_flags, NB_DONE);
954 nfs_buf_release(bp);
955 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
956 return;
957}
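/*
 * Worked example (illustrative figures): with nfsbufcnt == 1000 the
 * delayed-write ceiling here is (1000/4)*3 == 750; once nfs_nbdwrite
 * passes it the buffer is pushed out as an async write instead of staying
 * delayed. nfs_buf_get() applies a similar (nfsbufcnt*3)/4 throttle before
 * handing out BLK_WRITE buffers.
 */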
958
959
960/*
961 * Vnode op for read using bio
962 * Any similarity to readip() is purely coincidental
963 */
964int
965nfs_bioread(vp, uio, ioflag, cred, getpages)
966 register struct vnode *vp;
967 register struct uio *uio;
968 int ioflag;
969 struct ucred *cred;
55e303ae 970 int getpages; // XXX unused!
1c79356b 971{
972 struct nfsnode *np = VTONFS(vp);
973 int biosize, i;
b4c24cb9 974 off_t diff;
55e303ae 975 struct nfsbuf *bp = 0, *rabp;
976 struct vattr vattr;
977 struct proc *p;
978 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
55e303ae 979 daddr_t lbn, rabn, lastrabn = -1;
1c79356b 980 int bufsize;
55e303ae 981 int nra, error = 0, n = 0, on = 0;
1c79356b 982 int operation = (getpages? BLK_PAGEIN : BLK_READ);
983 caddr_t dp;
984 struct dirent *direntp;
985
986 FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag);
987
988#if DIAGNOSTIC
989 if (uio->uio_rw != UIO_READ)
990 panic("nfs_read mode");
991#endif
992 if (uio->uio_resid == 0) {
993 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1c79356b 994 return (0);
995 }
996 if (uio->uio_offset < 0) {
997 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1c79356b 998 return (EINVAL);
55e303ae 999 }
1c79356b 1000 p = uio->uio_procp;
1001 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1002 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1c79356b 1003 (void)nfs_fsinfo(nmp, vp, cred, p);
55e303ae 1004 biosize = vp->v_mount->mnt_stat.f_iosize;
1005 /*
1006 * For nfs, cache consistency can only be maintained approximately.
1007 * Although RFC1094 does not specify the criteria, the following is
1008 * believed to be compatible with the reference port.
1009 * For nqnfs, full cache consistency is maintained within the loop.
1010 * For nfs:
1011 * If the file's modify time on the server has changed since the
1012 * last read rpc or you have written to the file,
1013 * you may have lost data cache consistency with the
1014 * server, so flush all of the file's data out of the cache.
1015 * Then force a getattr rpc to ensure that you have up to date
1016 * attributes.
1017 * NB: This implies that cache data can be read when up to
1018 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
1019 * current attributes this could be forced by setting n_xid to 0
1020 * before the VOP_GETATTR() call.
1021 */
1022 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1023 if (np->n_flag & NMODIFIED) {
1024 if (vp->v_type != VREG) {
1025 if (vp->v_type != VDIR)
1026 panic("nfs: bioread, not dir");
1027 nfs_invaldir(vp);
1028 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1029 if (error) {
1030 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1c79356b 1031 return (error);
55e303ae 1032 }
1c79356b 1033 }
ab86ba33 1034 np->n_xid = 0;
1c79356b 1035 error = VOP_GETATTR(vp, &vattr, cred, p);
1036 if (error) {
1037 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1c79356b 1038 return (error);
55e303ae 1039 }
1040 np->n_mtime = vattr.va_mtime.tv_sec;
1041 } else {
1042 error = VOP_GETATTR(vp, &vattr, cred, p);
1043 if (error) {
1044 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1c79356b 1045 return (error);
55e303ae 1046 }
1c79356b 1047 if (np->n_mtime != vattr.va_mtime.tv_sec) {
55e303ae 1048 if (vp->v_type == VDIR) {
1c79356b 1049 nfs_invaldir(vp);
1050 /* purge name cache entries */
1051 cache_purge(vp);
1052 }
1c79356b 1053 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1054 if (error) {
1055 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1c79356b 1056 return (error);
55e303ae 1057 }
1058 np->n_mtime = vattr.va_mtime.tv_sec;
1059 }
1060 }
1061 }
1062 do {
1063
1064 /*
1065 * Get a valid lease. If cached data is stale, flush it.
1066 */
1067 if (nmp->nm_flag & NFSMNT_NQNFS) {
1068 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
1069 do {
1070 error = nqnfs_getlease(vp, ND_READ, cred, p);
1071 } while (error == NQNFS_EXPIRED);
1072 if (error) {
1073 FSDBG_BOT(514, vp, 0xd1e0007, 0, error);
1c79356b 1074 return (error);
55e303ae 1075 }
1076 if (np->n_lrev != np->n_brev ||
1077 (np->n_flag & NQNFSNONCACHE) ||
1078 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
1079 if (vp->v_type == VDIR)
1080 nfs_invaldir(vp);
1081 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1082 if (error) {
1083 FSDBG_BOT(514, vp, 0xd1e0008, 0, error);
1c79356b 1084 return (error);
55e303ae 1085 }
1086 np->n_brev = np->n_lrev;
1087 }
1088 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
1089 nfs_invaldir(vp);
1090 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1091 if (error) {
1092 FSDBG_BOT(514, vp, 0xd1e0009, 0, error);
1c79356b 1093 return (error);
55e303ae 1094 }
1095 }
1096 }
1097 if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) {
1098 if ((vp->v_flag & VNOCACHE_DATA) &&
1099 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1100 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1101 if (error) {
1102 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1103 return (error);
1104 }
1105 }
1106 switch (vp->v_type) {
1107 case VREG:
1108 error = nfs_readrpc(vp, uio, cred);
1109 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1110 return (error);
1c79356b 1111 case VLNK:
1112 error = nfs_readlinkrpc(vp, uio, cred);
1113 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1114 return (error);
1115 case VDIR:
1116 break;
1117 default:
55e303ae 1118 printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type);
1119 };
1120 }
1121 switch (vp->v_type) {
1122 case VREG:
1c79356b 1123 lbn = uio->uio_offset / biosize;
1124
1125 /*
1126 * Copy directly from any cached pages without grabbing the bufs.
1127 */
1128 if (uio->uio_segflg == UIO_USERSPACE) {
1129 int io_resid = uio->uio_resid;
1130 diff = np->n_size - uio->uio_offset;
1131 if (diff < io_resid)
1132 io_resid = diff;
1133 if (io_resid > 0) {
1134 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1135 if (error) {
1136 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1137 return (error);
1138 }
1139 }
1140 /* count any biocache reads that we just copied directly */
1141 if (lbn != uio->uio_offset / biosize) {
1142 nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn;
1143 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1144 }
1145 }
1146
1147 lbn = uio->uio_offset / biosize;
1148 on = uio->uio_offset % biosize;
1149
1150 /*
1151 * Start the read ahead(s), as required.
1152 */
1153 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
55e303ae 1154 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1c79356b 1155 rabn = lbn + 1 + nra;
1156 if (rabn <= lastrabn) {
1157 /* we've already (tried to) read this block */
1158 /* no need to try it again... */
1159 continue;
1c79356b 1160 }
1161 lastrabn = rabn;
1162 if ((off_t)rabn * biosize >= np->n_size)
1163 break;
1164 /* check if block exists and is valid. */
1165 rabp = nfs_buf_incore(vp, rabn);
1166 if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize))
1167 continue;
1168 rabp = nfs_buf_get(vp, rabn, biosize, p, operation);
1169 if (!rabp) {
1170 FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR);
1171 return (EINTR);
1172 }
1173 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1174 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1175 if (nfs_asyncio(rabp, cred)) {
1176 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1177 rabp->nb_error = EIO;
1178 nfs_buf_release(rabp);
1179 }
1180 } else
1181 nfs_buf_release(rabp);
1182 }
1183 }
1184
1185 if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) {
1186 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa);
1187 return (0);
1188 }
1189
1190 nfsstats.biocache_reads++;
1191
1192 /*
1193 * If the block is in the cache and has the required data
1194 * in a valid region, just copy it out.
1195 * Otherwise, get the block and write back/read in,
1196 * as required.
1197 */
1198again:
1199 bufsize = biosize;
55e303ae 1200 n = min((unsigned)(bufsize - on), uio->uio_resid);
1201 diff = np->n_size - uio->uio_offset;
1202 if (diff < n)
1203 n = diff;
1204
1205 bp = nfs_buf_get(vp, lbn, bufsize, p, operation);
1206 if (!bp) {
1207 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1208 return (EINTR);
1209 }
1210
1211 /* if any pages are valid... */
1212 if (bp->nb_valid) {
1213 /* ...check for any invalid pages in the read range */
1214 int pg, firstpg, lastpg, dirtypg;
1215 dirtypg = firstpg = lastpg = -1;
1216 pg = on/PAGE_SIZE;
1217 while (pg <= (on + n - 1)/PAGE_SIZE) {
1218 if (!NBPGVALID(bp,pg)) {
1219 if (firstpg < 0)
1220 firstpg = pg;
1221 lastpg = pg;
1222 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1223 dirtypg = pg;
1224 pg++;
1225 }
1226
1227 /* if there are no invalid pages, we're all set */
1228 if (firstpg < 0) {
1229 if (bp->nb_validoff < 0) {
1230 /* valid range isn't set up, so */
1231 /* set it to what we know is valid */
1232 bp->nb_validoff = trunc_page_32(on);
1233 bp->nb_validend = round_page_32(on+n);
1234 nfs_buf_normalize_valid_range(np, bp);
1235 }
1236 goto buffer_ready;
1237 }
1238
1239 /* there are invalid pages in the read range */
1240 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1241 /* there are also dirty page(s) in the range, */
1242 /* so write the buffer out and try again */
1243 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1244 SET(bp->nb_flags, NB_ASYNC);
1245 /*
1246 * NFS has embedded ucred so crhold() risks zone corruption
1247 */
1248 if (bp->nb_wcred == NOCRED)
1249 bp->nb_wcred = crdup(cred);
1250 error = nfs_buf_write(bp);
1251 if (error) {
1252 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1253 return (error);
1254 }
1255 goto again;
1256 }
1257 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1258 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1259 /* we need to read in more than half the buffer and the */
1260 /* buffer's not dirty, so just fetch the whole buffer */
1261 bp->nb_valid = 0;
1262 } else {
1263 /* read the page range in */
1264 struct iovec iov;
1265 struct uio auio;
1266 auio.uio_iov = &iov;
1267 auio.uio_iovcnt = 1;
1268 auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64;
1269 auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE;
1270 auio.uio_segflg = UIO_SYSSPACE;
1271 auio.uio_rw = UIO_READ;
1272 auio.uio_procp = p;
1273 NFS_BUF_MAP(bp);
1274 iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE;
1275 iov.iov_len = auio.uio_resid;
1276 error = nfs_readrpc(vp, &auio, cred);
1277 if (error) {
1278 nfs_buf_release(bp);
1279 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1280 return (error);
1281 }
1282 /* Make sure that the valid range is set to cover this read. */
1283 bp->nb_validoff = trunc_page_32(on);
1284 bp->nb_validend = round_page_32(on+n);
1285 nfs_buf_normalize_valid_range(np, bp);
1286 if (auio.uio_resid > 0) {
1287 /* if short read, must have hit EOF, */
1288 /* so zero the rest of the range */
1289 bzero(iov.iov_base, auio.uio_resid);
1290 }
1291 /* mark the pages (successfully read) as valid */
1292 for (pg=firstpg; pg <= lastpg; pg++)
1293 NBPGVALID_SET(bp,pg);
1294 }
1c79356b 1295 }
1296 /* if no pages are valid, read the whole block */
1297 if (!bp->nb_valid) {
1298 SET(bp->nb_flags, NB_READ);
1299 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1300 error = nfs_doio(bp, cred, p);
1301 if (error) {
1302 nfs_buf_release(bp);
1303 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1304 return (error);
1305 }
1306 }
1307buffer_ready:
1c79356b 1308 vp->v_lastr = lbn;
1309 /* validate read range against valid range and clip */
1310 if (bp->nb_validend > 0) {
1311 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1312 if (diff < n)
1313 n = diff;
1314 }
1315 if (n > 0)
1316 NFS_BUF_MAP(bp);
1317 break;
1318 case VLNK:
1319 nfsstats.biocache_readlinks++;
1320 bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
1321 if (!bp) {
1322 FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR);
1c79356b 1323 return (EINTR);
1324 }
1325 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1326 SET(bp->nb_flags, NB_READ);
1327 error = nfs_doio(bp, cred, p);
1328 if (error) {
1329 SET(bp->nb_flags, NB_ERROR);
1330 nfs_buf_release(bp);
1331 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1332 return (error);
1333 }
1334 }
55e303ae 1335 n = min(uio->uio_resid, bp->nb_validend);
1336 on = 0;
1337 break;
1338 case VDIR:
1339 nfsstats.biocache_readdirs++;
1340 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1341 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1342 return (0);
1343 }
1344 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1345 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1346 bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation);
1347 if (!bp) {
1348 FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR);
1349 return (EINTR);
1350 }
1351 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1352 SET(bp->nb_flags, NB_READ);
1353 error = nfs_doio(bp, cred, p);
1354 if (error) {
55e303ae 1355 nfs_buf_release(bp);
1c79356b 1356 }
1357 while (error == NFSERR_BAD_COOKIE) {
1358 nfs_invaldir(vp);
1359 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1360 /*
1361 * Yuck! The directory has been modified on the
1362 * server. The only way to get the block is by
1363 * reading from the beginning to get all the
1364 * offset cookies.
1365 */
1366 for (i = 0; i <= lbn && !error; i++) {
1367 if (np->n_direofoffset
1368 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1369 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
fa4905b1 1370 return (0);
1371 }
1372 bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation);
1373 if (!bp) {
1374 FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR);
fa4905b1 1375 return (EINTR);
1376 }
1377 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1378 SET(bp->nb_flags, NB_READ);
1379 error = nfs_doio(bp, cred, p);
1380 /*
55e303ae 1381 * no error + NB_INVAL == directory EOF,
1382 * use the block.
1383 */
55e303ae 1384 if (error == 0 && (bp->nb_flags & NB_INVAL))
1385 break;
1386 }
1387 /*
1388 * An error will throw away the block and the
1389 * for loop will break out. If no error and this
1390 * is not the block we want, we throw away the
1391 * block and go for the next one via the for loop.
1392 */
1393 if (error || i < lbn)
55e303ae 1394 nfs_buf_release(bp);
1395 }
1396 }
1397 /*
1398 * The above while is repeated if we hit another cookie
1399 * error. If we hit an error and it wasn't a cookie error,
1400 * we give up.
1401 */
1402 if (error) {
1403 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
fa4905b1 1404 return (error);
55e303ae 1405 }
1406 }
1407
1408 /*
1409 * If not eof and read aheads are enabled, start one.
1410 * (You need the current block first, so that you have the
1411 * directory offset cookie of the next block.)
1412 */
1413 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1414 (np->n_direofoffset == 0 ||
1415 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1416 !(np->n_flag & NQNFSNONCACHE) &&
1417 !nfs_buf_incore(vp, lbn + 1)) {
1418 rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p,
fa4905b1 1419 operation);
1c79356b 1420 if (rabp) {
1421 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1422 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
fa4905b1 1423 if (nfs_asyncio(rabp, cred)) {
1424 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1425 rabp->nb_error = EIO;
1426 nfs_buf_release(rabp);
fa4905b1 1427 }
1c79356b 1428 } else {
55e303ae 1429 nfs_buf_release(rabp);
1430 }
1431 }
1432 }
1433 /*
1434 * Make sure we use a signed variant of min() since
1435 * the second term may be negative.
1436 */
55e303ae 1437 n = lmin(uio->uio_resid, bp->nb_validend - on);
fa4905b1 1438 /*
1439 * We keep track of the directory eof in
1440 * np->n_direofoffset and chop it off as an
1441 * extra step right here.
1442 */
1443 if (np->n_direofoffset &&
1444 n > np->n_direofoffset - uio->uio_offset)
1445 n = np->n_direofoffset - uio->uio_offset;
1446 /*
1447 * Make sure that we return an integral number of entries so
1448 * that any subsequent calls will start copying from the start
1449 * of the next entry.
1450 *
1451 * If the current value of n has the last entry cut short,
1452 * set n to copy everything up to the last entry instead.
1453 */
1454 if (n > 0) {
1455 dp = bp->nb_data + on;
1456 while (dp < (bp->nb_data + on + n)) {
1457 direntp = (struct dirent *)dp;
1458 dp += direntp->d_reclen;
1459 }
1460 if (dp > (bp->nb_data + on + n))
1461 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1462 }
1463 break;
1464 default:
1465 printf("nfs_bioread: type %x unexpected\n",vp->v_type);
1466 FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL);
1467 return (EINVAL);
1468 };
1469
1470 if (n > 0) {
55e303ae 1471 error = uiomove(bp->nb_data + on, (int)n, uio);
1472 }
1473 switch (vp->v_type) {
1474 case VREG:
1475 break;
1476 case VLNK:
1477 n = 0;
1478 break;
1479 case VDIR:
1480 if (np->n_flag & NQNFSNONCACHE)
55e303ae 1481 SET(bp->nb_flags, NB_INVAL);
1c79356b 1482 break;
1c79356b 1483 }
55e303ae 1484 nfs_buf_release(bp);
1c79356b 1485 } while (error == 0 && uio->uio_resid > 0 && n > 0);
55e303ae 1486 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1487 return (error);
1488}
1489
fa4905b1 1490
1491/*
1492 * Vnode op for write using bio
1493 */
1494int
1495nfs_write(ap)
1496 struct vop_write_args /* {
1497 struct vnode *a_vp;
1498 struct uio *a_uio;
1499 int a_ioflag;
1500 struct ucred *a_cred;
1501 } */ *ap;
1502{
55e303ae 1503 struct uio *uio = ap->a_uio;
1c79356b 1504 struct proc *p = uio->uio_procp;
55e303ae 1505 struct vnode *vp = ap->a_vp;
1c79356b 1506 struct nfsnode *np = VTONFS(vp);
55e303ae 1507 struct ucred *cred = ap->a_cred;
1c79356b 1508 int ioflag = ap->a_ioflag;
55e303ae 1509 struct nfsbuf *bp;
1510 struct vattr vattr;
1511 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1512 daddr_t lbn;
55e303ae 1513 int biosize, bufsize, writeop;
1c79356b 1514 int n, on, error = 0, iomode, must_commit;
55e303ae 1515 off_t boff, start, end;
1516 struct iovec iov;
1517 struct uio auio;
1c79356b 1518
1519 FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag);
1520
1521#if DIAGNOSTIC
1522 if (uio->uio_rw != UIO_WRITE)
1523 panic("nfs_write mode");
1524 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
1525 panic("nfs_write proc");
1526#endif
1527 if (vp->v_type != VREG)
1528 return (EIO);
1529 if (np->n_flag & NWRITEERR) {
1530 np->n_flag &= ~NWRITEERR;
55e303ae 1531 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error);
1532 return (np->n_error);
1533 }
1534 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1535 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1536 (void)nfs_fsinfo(nmp, vp, cred, p);
1537 if (ioflag & (IO_APPEND | IO_SYNC)) {
1538 if (np->n_flag & NMODIFIED) {
ab86ba33 1539 np->n_xid = 0;
1c79356b 1540 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1541 if (error) {
1542 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
1c79356b 1543 return (error);
55e303ae 1544 }
1545 }
1546 if (ioflag & IO_APPEND) {
ab86ba33 1547 np->n_xid = 0;
1c79356b 1548 error = VOP_GETATTR(vp, &vattr, cred, p);
1549 if (error) {
1550 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
1c79356b 1551 return (error);
55e303ae 1552 }
1553 uio->uio_offset = np->n_size;
1554 }
1555 }
1556 if (uio->uio_offset < 0) {
1557 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
1c79356b 1558 return (EINVAL);
1559 }
1560 if (uio->uio_resid == 0) {
1561 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
1c79356b 1562 return (0);
55e303ae 1563 }
1564 /*
1565 * Maybe this should be above the vnode op call, but so long as
1566 * file servers have no limits, I don't think it matters
1567 */
1568 if (p && uio->uio_offset + uio->uio_resid >
1569 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
1570 psignal(p, SIGXFSZ);
55e303ae 1571 FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG);
1572 return (EFBIG);
1573 }
1574
1575 biosize = vp->v_mount->mnt_stat.f_iosize;
1576
1577 do {
1578 /*
1579 * Check for a valid write lease.
1580 */
1581 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1582 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1583 do {
1584 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1585 } while (error == NQNFS_EXPIRED);
1586 if (error) {
1587 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error);
1c79356b 1588 return (error);
55e303ae 1589 }
1590 if (np->n_lrev != np->n_brev ||
1591 (np->n_flag & NQNFSNONCACHE)) {
1592 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1593 if (error) {
1594 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error);
1c79356b 1595 return (error);
55e303ae 1596 }
1597 np->n_brev = np->n_lrev;
1598 }
1599 }
1600 if (ISSET(vp->v_flag, VNOCACHE_DATA) &&
1601 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1602 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1603 if (error) {
1604 FSDBG_BOT(515, vp, 0, 0, error);
1605 return (error);
1606 }
1607 }
1608 if (((np->n_flag & NQNFSNONCACHE) ||
1609 ISSET(vp->v_flag, VNOCACHE_DATA)) &&
1610 uio->uio_iovcnt == 1) {
1611 iomode = NFSV3WRITE_FILESYNC;
1612 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
1613 if (must_commit)
1614 nfs_clearcommit(vp->v_mount);
55e303ae 1615 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1616 return (error);
1617 }
1618 nfsstats.biocache_writes++;
1619 lbn = uio->uio_offset / biosize;
55e303ae 1620 on = uio->uio_offset % biosize;
1621 n = min((unsigned)(biosize - on), uio->uio_resid);
1622again:
1c79356b 1623 bufsize = biosize;
1624 /*
1625 * Get a cache block for writing. The range to be written is
55e303ae 1626 * (off..off+n) within the block. We ensure that the block
1627 * either has no dirty region or that the given range is
1628 * contiguous with the existing dirty region.
1629 */
1630 bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE);
1631 if (!bp) {
1632 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR);
1c79356b 1633 return (EINTR);
1634 }
1635 /* map the block because we know we're going to write to it */
1636 NFS_BUF_MAP(bp);
1637
1638 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1639 SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL));
1640
1641 /*
1642 * NFS has embedded ucred so crhold() risks zone corruption
1643 */
1644 if (bp->nb_wcred == NOCRED)
1645 bp->nb_wcred = crdup(cred);
1646
1647 /*
1648 * If there's already a dirty range AND dirty pages in this block we
1649 * need to send a commit AND write the dirty pages before continuing.
1650 *
1651 * If there's already a dirty range OR dirty pages in this block
1652 * and the new write range is not contiguous with the existing range,
1653 * then force the buffer to be written out now.
1654 * (We used to just extend the dirty range to cover the valid,
1655 * but unwritten, data in between also. But writing ranges
1656 * of data that weren't actually written by an application
1657 * risks overwriting some other client's data with stale data
1658 * that's just masquerading as new written data.)
1659 */
1660 if (bp->nb_dirtyend > 0) {
1661 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
1662 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
1663 /* write/commit buffer "synchronously" */
1664 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1665 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1666 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1667 error = nfs_buf_write(bp);
1668 if (error) {
1669 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1670 return (error);
1671 }
1672 goto again;
1673 }
1674 } else if (bp->nb_dirty) {
1675 int firstpg, lastpg;
1676 u_int32_t pagemask;
1677 /* calculate write range pagemask */
1678 firstpg = on/PAGE_SIZE;
1679 lastpg = (on+n-1)/PAGE_SIZE;
1680 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
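	/*
	 * e.g. (illustration): on == 5000, n == 3000 and PAGE_SIZE == 4096
	 * give firstpg == 1, lastpg == 1 and pagemask == 0x2 (only bit 1 set).
	 */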
1681 /* check if there are dirty pages outside the write range */
1682 if (bp->nb_dirty & ~pagemask) {
1683 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
1684 /* write/commit buffer "synchronously" */
1685 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1686 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1687 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1688 error = nfs_buf_write(bp);
1689 if (error) {
1690 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1691 return (error);
1692 }
1693 goto again;
1694 }
1695 /* if the first or last pages are already dirty */
1696 /* make sure that the dirty range encompasses those pages */
1697 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
1698 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
1699 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
1700 if (NBPGDIRTY(bp,lastpg)) {
1701 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
1702 /* clip to EOF */
1703 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1704 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1705 } else
1706 bp->nb_dirtyend = on+n;
1707 }
1708 }
1709
fa4905b1 1710 /*
1711 * Are we extending the size of the file with this write?
1712 * If so, update file size now that we have the block.
1713 * If there was a partial buf at the old eof, validate
1714 * and zero the new bytes.
1715 */
1716 if (uio->uio_offset + n > np->n_size) {
1717 struct nfsbuf *eofbp = NULL;
1718 daddr_t eofbn = np->n_size / biosize;
1719 int eofoff = np->n_size % biosize;
1720 int neweofoff = (uio->uio_offset + n) % biosize;
1721
1722 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
fa4905b1 1723
1724 if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn))
1725 eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE);
1726
1727 /* if we're extending within the same last block */
1728 /* and the block is flagged as being cached... */
1729 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
1730 /* ...check that all pages in buffer are valid */
1731 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
1732 u_int32_t pagemask;
1733 /* pagemask only has to extend to last page being written to */
1734 pagemask = (1 << (endpg+1)) - 1;
1735 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
1736 if ((bp->nb_valid & pagemask) != pagemask) {
1737 /* zerofill any hole */
1738 if (on > bp->nb_validend) {
1739 int i;
1740 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
1741 NBPGVALID_SET(bp, i);
1742 NFS_BUF_MAP(bp);
1743 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
1744 bzero((char *)bp->nb_data + bp->nb_validend,
1745 on - bp->nb_validend);
1746 }
1747 /* zerofill any trailing data in the last page */
1748 if (neweofoff) {
1749 NFS_BUF_MAP(bp);
1750 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
1751 bzero((char *)bp->nb_data + neweofoff,
1752 PAGE_SIZE - (neweofoff & PAGE_MASK));
1753 }
1754 }
1755 }
1756 np->n_flag |= NMODIFIED;
1757 np->n_size = uio->uio_offset + n;
1758 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
1759 if (eofbp) {
1760 /*
1761 * We may need to zero any previously invalid data
1762 * after the old EOF in the previous EOF buffer.
1763 *
1764 * For the old last page, don't zero bytes if there
1765 * are invalid bytes in that page (i.e. the page isn't
1766 * currently valid).
1767 * For pages after the old last page, zero them and
1768 * mark them as valid.
1769 */
1770 char *d;
1771 int i;
1772 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1773 SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL));
1774 NFS_BUF_MAP(eofbp);
1775 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
1776 d = eofbp->nb_data;
1777 i = eofoff/PAGE_SIZE;
1778 while (eofoff < biosize) {
1779 int poff = eofoff & PAGE_MASK;
1780 if (!poff || NBPGVALID(eofbp,i)) {
1781 bzero(d + eofoff, PAGE_SIZE - poff);
1782 NBPGVALID_SET(eofbp, i);
1783 }
1784 if (bp->nb_validend == eofoff)
1785 bp->nb_validend += PAGE_SIZE - poff;
1786 eofoff += PAGE_SIZE - poff;
1787 i++;
1788 }
1789 nfs_buf_release(eofbp);
fa4905b1
A
1790 }
1791 }
fa4905b1
A
1792 /*
1793 * If dirtyend exceeds file size, chop it down. This should
1794 * not occur unless there is a race.
1795 */
55e303ae
A
1796 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1797 bp->nb_dirtyend = np->n_size - NBOFF(bp);
fa4905b1 1798 /*
55e303ae
A
1799 * UBC doesn't handle partial pages, so we need to make sure
1800 * that any pages left in the page cache are completely valid.
1801 *
1802 * Writes that are smaller than a block are delayed if they
1803 * don't extend to the end of the block.
fa4905b1 1804 *
55e303ae
A
1805 * If the block isn't (completely) cached, we may need to read
1806 * in some parts of pages that aren't covered by the write.
1807 * If the write offset (on) isn't page aligned, we'll need to
1808 * read the start of the first page being written to. Likewise,
1809 * if the offset of the end of the write (on+n) isn't page aligned,
1810 * we'll need to read the end of the last page being written to.
1811 *
1812 * Notes:
1813 * We don't want to read anything we're just going to write over.
1814 * We don't want to issue multiple I/Os if we don't have to
1815 * (because they're synchronous rpcs).
1816 * We don't want to read anything we already have modified in the
1817 * page cache.
fa4905b1 1818 */
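	/*
	 * For example (illustrative values only, PAGE_SIZE == 4096): a write
	 * with on == 1000 and n == 6000 touches pages 0 and 1 (firstpg == 0,
	 * lastpg == 1).  If page 0 isn't valid we must read bytes 0..999
	 * (start == 0, end == 1000); if page 1 isn't valid either, end is
	 * pushed out to 8192, so a single read of bytes 0..8191 covers both
	 * partial pages, unless a dirty page in between forces two separate
	 * reads.
	 */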
55e303ae
A
1819 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
1820 int firstpg, lastpg, dirtypg;
1821 int firstpgoff, lastpgoff;
1822 start = end = -1;
1823 firstpg = on/PAGE_SIZE;
1824 firstpgoff = on & PAGE_MASK;
1825 lastpg = (on+n-1)/PAGE_SIZE;
1826 lastpgoff = (on+n) & PAGE_MASK;
1827 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
1828 /* need to read start of first page */
1829 start = firstpg * PAGE_SIZE;
1830 end = start + firstpgoff;
fa4905b1 1831 }
55e303ae
A
1832 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
1833 /* need to read end of last page */
1834 if (start < 0)
1835 start = (lastpg * PAGE_SIZE) + lastpgoff;
1836 end = (lastpg + 1) * PAGE_SIZE;
fa4905b1 1837 }
fa4905b1 1838 if (end > start) {
55e303ae
A
1839 /* need to read the data in range: start...end-1 */
1840
1841 /*
1842 * XXX: If we know any of these reads are beyond the
1843 * current EOF (what np->n_size was before we possibly
1844 * just modified it above), we could short-circuit the
 1845 * reads and just zero the buffer. No need to make a trip
1846 * across the network to read nothing.
1847 */
1848
1849 /* first, check for dirty pages in between */
1850 /* if there are, we'll have to do two reads because */
1851 /* we don't want to overwrite the dirty pages. */
1852 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
1853 if (NBPGDIRTY(bp,dirtypg))
1854 break;
1855
1856 /* if start is at beginning of page, try */
 1857 /* to get any preceding pages as well. */
1858 if (!(start & PAGE_MASK)) {
1859 /* stop at next dirty/valid page or start of block */
1860 for (; start > 0; start-=PAGE_SIZE)
1861 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
1862 break;
1863 }
1864
1865 NFS_BUF_MAP(bp);
1866 /* setup uio for read(s) */
1867 boff = NBOFF(bp);
fa4905b1
A
1868 auio.uio_iov = &iov;
1869 auio.uio_iovcnt = 1;
fa4905b1
A
1870 auio.uio_segflg = UIO_SYSSPACE;
1871 auio.uio_rw = UIO_READ;
1872 auio.uio_procp = p;
55e303ae
A
1873
1874 if (dirtypg <= (end-1)/PAGE_SIZE) {
1875 /* there's a dirty page in the way, so just do two reads */
1876 /* we'll read the preceding data here */
1877 auio.uio_offset = boff + start;
1878 auio.uio_resid = iov.iov_len = on - start;
1879 iov.iov_base = bp->nb_data + start;
1880 error = nfs_readrpc(vp, &auio, cred);
1881 if (error) {
1882 bp->nb_error = error;
1883 SET(bp->nb_flags, NB_ERROR);
 1884 printf("nfs_write: readrpc %d\n", error);
1885 }
1886 if (auio.uio_resid > 0) {
1887 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01);
1888 bzero(iov.iov_base, auio.uio_resid);
1889 }
1890 /* update validoff/validend if necessary */
1891 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
1892 bp->nb_validoff = start;
1893 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
1894 bp->nb_validend = on;
1895 if (np->n_size > boff + bp->nb_validend)
1896 bp->nb_validend = min(np->n_size - (boff + start), biosize);
1897 /* validate any pages before the write offset */
1898 for (; start < on/PAGE_SIZE; start+=PAGE_SIZE)
1899 NBPGVALID_SET(bp, start/PAGE_SIZE);
1900 /* adjust start to read any trailing data */
1901 start = on+n;
1902 }
1903
1904 /* if end is at end of page, try to */
1905 /* get any following pages as well. */
1906 if (!(end & PAGE_MASK)) {
1907 /* stop at next valid page or end of block */
1908 for (; end < bufsize; end+=PAGE_SIZE)
1909 if (NBPGVALID(bp,end/PAGE_SIZE))
1910 break;
1911 }
1912
1913 /* now we'll read the (rest of the) data */
1914 auio.uio_offset = boff + start;
1915 auio.uio_resid = iov.iov_len = end - start;
1916 iov.iov_base = bp->nb_data + start;
fa4905b1 1917 error = nfs_readrpc(vp, &auio, cred);
fa4905b1 1918 if (error) {
55e303ae
A
1919 bp->nb_error = error;
1920 SET(bp->nb_flags, NB_ERROR);
 1921 printf("nfs_write: readrpc %d\n", error);
fa4905b1 1922 }
55e303ae
A
1923 if (auio.uio_resid > 0) {
1924 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02);
fa4905b1 1925 bzero(iov.iov_base, auio.uio_resid);
55e303ae
A
1926 }
1927 /* update validoff/validend if necessary */
1928 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
1929 bp->nb_validoff = start;
1930 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
1931 bp->nb_validend = end;
1932 if (np->n_size > boff + bp->nb_validend)
1933 bp->nb_validend = min(np->n_size - (boff + start), biosize);
1934 /* validate any pages before the write offset's page */
1935 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
1936 NBPGVALID_SET(bp, start/PAGE_SIZE);
1937 /* validate any pages after the range of pages being written to */
1938 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
1939 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
1940 /* Note: pages being written to will be validated when written */
fa4905b1 1941 }
fa4905b1 1942 }
55e303ae
A
1943
1944 if (ISSET(bp->nb_flags, NB_ERROR)) {
1945 error = bp->nb_error;
1946 nfs_buf_release(bp);
1947 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1c79356b
A
1948 return (error);
1949 }
55e303ae 1950
1c79356b
A
1951 np->n_flag |= NMODIFIED;
1952
1953 /*
1954 * Check for valid write lease and get one as required.
55e303ae 1955 * In case nfs_buf_get() and/or nfs_buf_write() delayed us.
1c79356b
A
1956 */
1957 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1958 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1959 do {
1960 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1961 } while (error == NQNFS_EXPIRED);
1962 if (error) {
55e303ae
A
1963 nfs_buf_release(bp);
1964 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error);
1c79356b
A
1965 return (error);
1966 }
1967 if (np->n_lrev != np->n_brev ||
1968 (np->n_flag & NQNFSNONCACHE)) {
55e303ae 1969 nfs_buf_release(bp);
1c79356b 1970 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae
A
1971 if (error) {
1972 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error);
1c79356b 1973 return (error);
55e303ae 1974 }
1c79356b
A
1975 np->n_brev = np->n_lrev;
1976 goto again;
1977 }
1978 }
55e303ae
A
1979 NFS_BUF_MAP(bp);
1980 error = uiomove((char *)bp->nb_data + on, n, uio);
1c79356b 1981 if (error) {
55e303ae
A
1982 SET(bp->nb_flags, NB_ERROR);
1983 nfs_buf_release(bp);
1984 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1c79356b
A
1985 return (error);
1986 }
55e303ae
A
1987
1988 /* validate any pages written to */
1989 start = on & ~PAGE_MASK;
1990 for (; start < on+n; start += PAGE_SIZE) {
1991 NBPGVALID_SET(bp, start/PAGE_SIZE);
1992 /*
1993 * This may seem a little weird, but we don't actually set the
1994 * dirty bits for writes. This is because we keep the dirty range
1995 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
1996 * delayed writes, when we give the pages back to the VM we don't
1997 * want to keep them marked dirty, because when we later write the
1998 * buffer we won't be able to tell which pages were written dirty
1999 * and which pages were mmapped and dirtied.
2000 */
2001 }
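	/*
	 * Fold this write into the buffer's dirty range below.  For example
	 * (illustrative values only): if nb_dirtyoff..nb_dirtyend was 0..600
	 * and this write covered on == 600, n == 200, the merged range
	 * becomes 0..800; if the buffer had no dirty range yet, it simply
	 * becomes 600..800.
	 */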
2002 if (bp->nb_dirtyend > 0) {
2003 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2004 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
1c79356b 2005 } else {
55e303ae
A
2006 bp->nb_dirtyoff = on;
2007 bp->nb_dirtyend = on + n;
1c79356b 2008 }
55e303ae
A
2009 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2010 bp->nb_validoff > bp->nb_dirtyend) {
2011 bp->nb_validoff = bp->nb_dirtyoff;
2012 bp->nb_validend = bp->nb_dirtyend;
1c79356b 2013 } else {
55e303ae
A
2014 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2015 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
1c79356b 2016 }
55e303ae
A
2017 if (!ISSET(bp->nb_flags, NB_CACHE))
2018 nfs_buf_normalize_valid_range(np, bp);
1c79356b
A
2019
2020 /*
2021 * Since this block is being modified, it must be written
2022 * again and not just committed.
2023 */
55e303ae
A
2024 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2025 np->n_needcommitcnt--;
2026 CHECK_NEEDCOMMITCNT(np);
2027 }
2028 CLR(bp->nb_flags, NB_NEEDCOMMIT);
1c79356b 2029
55e303ae
A
2030 if ((np->n_flag & NQNFSNONCACHE) ||
2031 (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) {
2032 bp->nb_proc = p;
2033 error = nfs_buf_write(bp);
2034 if (error) {
2035 FSDBG_BOT(515, vp, uio->uio_offset,
2036 uio->uio_resid, error);
1c79356b 2037 return (error);
55e303ae 2038 }
1c79356b
A
2039 if (np->n_flag & NQNFSNONCACHE) {
2040 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
55e303ae
A
2041 if (error) {
2042 FSDBG_BOT(515, vp, uio->uio_offset,
2043 uio->uio_resid, error);
1c79356b 2044 return (error);
55e303ae 2045 }
1c79356b 2046 }
55e303ae
A
2047 } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
2048 bp->nb_proc = (struct proc *)0;
2049 SET(bp->nb_flags, NB_ASYNC);
2050 nfs_buf_write(bp);
1c79356b 2051 } else
55e303ae
A
2052 nfs_buf_write_delayed(bp);
2053
2054 if (np->n_needcommitcnt > (nbuf/16))
2055 nfs_flushcommits(vp, p);
2056
1c79356b 2057 } while (uio->uio_resid > 0 && n > 0);
55e303ae
A
2058
2059 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
1c79356b
A
2060 return (0);
2061}
2062
1c79356b 2063/*
55e303ae
A
2064 * Flush out and invalidate all buffers associated with a vnode.
2065 * Called with the underlying object locked.
1c79356b 2066 */
55e303ae
A
2067static int
2068nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo)
2069 register struct vnode *vp;
2070 int flags;
2071 struct ucred *cred;
1c79356b 2072 struct proc *p;
55e303ae 2073 int slpflag, slptimeo;
1c79356b 2074{
55e303ae
A
2075 struct nfsbuf *bp;
2076 struct nfsbuf *nbp, *blist;
2077 int s, error = 0;
2078 struct nfsnode *np = VTONFS(vp);
9bccf70c 2079
55e303ae
A
2080 if (flags & V_SAVE) {
2081 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
2082 return (error);
2083 if (np->n_dirtyblkhd.lh_first)
2084 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2085 vp, np->n_dirtyblkhd.lh_first);
9bccf70c
A
2086 }
2087
55e303ae
A
2088 for (;;) {
2089 blist = np->n_cleanblkhd.lh_first;
2090 if (!blist)
2091 blist = np->n_dirtyblkhd.lh_first;
2092 if (!blist)
2093 break;
1c79356b 2094
55e303ae
A
2095 for (bp = blist; bp; bp = nbp) {
2096 nbp = bp->nb_vnbufs.le_next;
2097 s = splbio();
2098 if (ISSET(bp->nb_flags, NB_BUSY)) {
2099 SET(bp->nb_flags, NB_WANTED);
2100 FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags);
2101 error = tsleep((caddr_t)bp,
2102 slpflag | (PRIBIO + 1), "nfs_vinvalbuf",
2103 slptimeo);
2104 FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags);
2105 splx(s);
2106 if (error) {
2107 FSDBG(554, vp, bp, -1, error);
2108 return (error);
2109 }
2110 break;
2111 }
2112 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2113 nfs_buf_remfree(bp);
2114 SET(bp->nb_flags, NB_BUSY);
2115 splx(s);
2116 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) {
2117 /* XXX extra paranoia: make sure we're not */
2118 /* somehow leaving any dirty data around */
2119 int mustwrite = 0;
2120 int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ?
2121 bp->nb_bufsize : (np->n_size - NBOFF(bp));
2122 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2123 error = nfs_buf_upl_setup(bp);
2124 if (error == EINVAL) {
2125 /* vm object must no longer exist */
2126 /* hopefully we don't need to do */
2127 /* anything for this buffer */
2128 } else if (error)
2129 printf("nfs_vinvalbuf: upl setup failed %d\n",
2130 error);
2131 bp->nb_valid = bp->nb_dirty = 0;
2132 }
2133 nfs_buf_upl_check(bp);
2134 /* check for any dirty data before the EOF */
2135 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2136 /* clip dirty range to EOF */
2137 if (bp->nb_dirtyend > end)
2138 bp->nb_dirtyend = end;
2139 mustwrite++;
2140 }
2141 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
2142 if (bp->nb_dirty)
2143 mustwrite++;
2144 if (mustwrite) {
2145 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2146 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2147 panic("nfs_vinvalbuf: dirty buffer without upl");
2148 /* gotta write out dirty data before invalidating */
2149 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2150 /* (NB_NOCACHE indicates buffer should be discarded) */
2151 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2152 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2153 /*
2154 * NFS has embedded ucred so crhold() risks zone corruption
2155 */
2156 if (bp->nb_wcred == NOCRED)
2157 bp->nb_wcred = crdup(cred);
2158 error = nfs_buf_write(bp);
2159 // Note: bp has been released
2160 if (error) {
2161 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2162 np->n_error = error;
2163 np->n_flag |= NWRITEERR;
2164 error = 0;
2165 }
2166 break;
2167 }
2168 }
2169 SET(bp->nb_flags, NB_INVAL);
2170 nfs_buf_release(bp);
2171 }
2172 }
2173 if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)
2174 panic("nfs_vinvalbuf: flush failed");
2175 return (0);
1c79356b
A
2176}
2177
55e303ae 2178
1c79356b
A
2179/*
2180 * Flush and invalidate all dirty buffers. If another process is already
2181 * doing the flush, just wait for completion.
2182 */
2183int
2184nfs_vinvalbuf(vp, flags, cred, p, intrflg)
2185 struct vnode *vp;
2186 int flags;
2187 struct ucred *cred;
2188 struct proc *p;
2189 int intrflg;
2190{
2191 register struct nfsnode *np = VTONFS(vp);
2192 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2193 int error = 0, slpflag, slptimeo;
0b4e3aa0 2194 int didhold = 0;
1c79356b 2195
55e303ae
A
2196 FSDBG_TOP(554, vp, flags, intrflg, 0);
2197
2198 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
1c79356b
A
2199 intrflg = 0;
2200 if (intrflg) {
2201 slpflag = PCATCH;
2202 slptimeo = 2 * hz;
2203 } else {
2204 slpflag = 0;
2205 slptimeo = 0;
2206 }
2207 /*
2208 * First wait for any other process doing a flush to complete.
2209 */
2210 while (np->n_flag & NFLUSHINPROG) {
2211 np->n_flag |= NFLUSHWANT;
55e303ae
A
2212 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2213 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2214 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2215 if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) {
2216 FSDBG_BOT(554, vp, flags, intrflg, error);
2217 return (error);
2218 }
1c79356b
A
2219 }
2220
2221 /*
2222 * Now, flush as required.
2223 */
2224 np->n_flag |= NFLUSHINPROG;
55e303ae 2225 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
1c79356b 2226 while (error) {
55e303ae
A
2227 FSDBG(554, vp, 0, 0, error);
2228 error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p);
2229 if (error) {
1c79356b
A
2230 np->n_flag &= ~NFLUSHINPROG;
2231 if (np->n_flag & NFLUSHWANT) {
2232 np->n_flag &= ~NFLUSHWANT;
2233 wakeup((caddr_t)&np->n_flag);
2234 }
55e303ae
A
2235 FSDBG_BOT(554, vp, flags, intrflg, error);
2236 return (error);
1c79356b 2237 }
55e303ae 2238 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
1c79356b
A
2239 }
2240 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2241 if (np->n_flag & NFLUSHWANT) {
2242 np->n_flag &= ~NFLUSHWANT;
2243 wakeup((caddr_t)&np->n_flag);
2244 }
0b4e3aa0
A
2245 didhold = ubc_hold(vp);
2246 if (didhold) {
55e303ae
A
2247 int rv = ubc_clean(vp, 1); /* get the pages out of vm also */
2248 if (!rv)
2249 panic("nfs_vinvalbuf(): ubc_clean failed!");
0b4e3aa0
A
2250 ubc_rele(vp);
2251 }
55e303ae 2252 FSDBG_BOT(554, vp, flags, intrflg, 0);
1c79356b
A
2253 return (0);
2254}
2255
2256/*
2257 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2258 * This is mainly to avoid queueing async I/O requests when the nfsiods
2259 * are all hung on a dead server.
2260 */
2261int
2262nfs_asyncio(bp, cred)
55e303ae 2263 struct nfsbuf *bp;
1c79356b
A
2264 struct ucred *cred;
2265{
2266 struct nfsmount *nmp;
2267 int i;
2268 int gotiod;
2269 int slpflag = 0;
2270 int slptimeo = 0;
55e303ae 2271 int error, error2;
1c79356b
A
2272
2273 if (nfs_numasync == 0)
2274 return (EIO);
55e303ae
A
2275
2276 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2277
2278 nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL);
1c79356b 2279again:
55e303ae 2280 if (nmp && nmp->nm_flag & NFSMNT_INT)
1c79356b
A
2281 slpflag = PCATCH;
2282 gotiod = FALSE;
2283
55e303ae
A
2284 /* no nfsbuf means tell nfsiod to process delwri list */
2285 if (!bp)
2286 nfs_ioddelwri = 1;
2287
1c79356b
A
2288 /*
2289 * Find a free iod to process this request.
2290 */
2291 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2292 if (nfs_iodwant[i]) {
2293 /*
2294 * Found one, so wake it up and tell it which
2295 * mount to process.
2296 */
2297 NFS_DPF(ASYNCIO,
2298 ("nfs_asyncio: waking iod %d for mount %p\n",
2299 i, nmp));
2300 nfs_iodwant[i] = (struct proc *)0;
2301 nfs_iodmount[i] = nmp;
55e303ae
A
2302 if (nmp)
2303 nmp->nm_bufqiods++;
1c79356b
A
2304 wakeup((caddr_t)&nfs_iodwant[i]);
2305 gotiod = TRUE;
2306 break;
2307 }
2308
55e303ae
A
2309 /* if we're just poking the delwri list, we're done */
2310 if (!bp)
2311 return (0);
2312
1c79356b
A
2313 /*
2314 * If none are free, we may already have an iod working on this mount
2315 * point. If so, it will process our request.
2316 */
2317 if (!gotiod) {
2318 if (nmp->nm_bufqiods > 0) {
2319 NFS_DPF(ASYNCIO,
2320 ("nfs_asyncio: %d iods are already processing mount %p\n",
2321 nmp->nm_bufqiods, nmp));
2322 gotiod = TRUE;
2323 }
2324 }
2325
2326 /*
2327 * If we have an iod which can process the request, then queue
2328 * the buffer.
2329 */
55e303ae 2330 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
1c79356b
A
2331 if (gotiod) {
2332 /*
2333 * Ensure that the queue never grows too large.
2334 */
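		/*
		 * For example (illustrative values only): with 4 nfsiods running
		 * (nfs_numasync == 4), the per-mount queue is capped at 8 buffers.
		 * A caller that hits the cap sleeps on nm_bufq until an nfsiod
		 * drains it, unless the caller is itself an nfsiod (NB_IOD), in
		 * which case it bails out and performs the I/O synchronously.
		 */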
2335 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
55e303ae
A
2336 if (ISSET(bp->nb_flags, NB_IOD)) {
2337 /* An nfsiod is attempting this async operation so */
2338 /* we must not fall asleep on the bufq because we */
2339 /* could be waiting on ourself. Just return error */
 2340 /* and we'll do this operation synchronously. */
2341 goto out;
2342 }
2343 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
1c79356b
A
2344 NFS_DPF(ASYNCIO,
2345 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
2346 nmp->nm_bufqwant = TRUE;
2347 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
2348 "nfsaio", slptimeo);
2349 if (error) {
55e303ae
A
2350 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2351 if (error2) {
2352 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2353 return (error2);
2354 }
1c79356b
A
2355 if (slpflag == PCATCH) {
2356 slpflag = 0;
2357 slptimeo = 2 * hz;
2358 }
2359 }
2360 /*
2361 * We might have lost our iod while sleeping,
 2362 * so check and loop if necessary.
2363 */
2364 if (nmp->nm_bufqiods == 0) {
2365 NFS_DPF(ASYNCIO,
2366 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
2367 goto again;
2368 }
2369 }
2370
55e303ae
A
2371 if (ISSET(bp->nb_flags, NB_READ)) {
2372 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
0b4e3aa0
A
2373 /*
2374 * NFS has embedded ucred.
2375 * Can not crhold() here as that causes zone corruption
2376 */
55e303ae 2377 bp->nb_rcred = crdup(cred);
1c79356b
A
2378 }
2379 } else {
55e303ae
A
2380 SET(bp->nb_flags, NB_WRITEINPROG);
2381 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
0b4e3aa0
A
2382 /*
2383 * NFS has embedded ucred.
2384 * Can not crhold() here as that causes zone corruption
2385 */
55e303ae 2386 bp->nb_wcred = crdup(cred);
1c79356b
A
2387 }
2388 }
2389
55e303ae 2390 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
1c79356b 2391 nmp->nm_bufqlen++;
55e303ae 2392 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
1c79356b
A
2393 return (0);
2394 }
2395
55e303ae 2396out:
1c79356b
A
2397 /*
2398 * All the iods are busy on other mounts, so return EIO to
2399 * force the caller to process the i/o synchronously.
2400 */
2401 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
55e303ae 2402 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
1c79356b
A
2403 return (EIO);
2404}
2405
2406/*
2407 * Do an I/O operation to/from a cache block. This may be called
2408 * synchronously or from an nfsiod.
2409 */
2410int
2411nfs_doio(bp, cr, p)
55e303ae 2412 struct nfsbuf *bp;
1c79356b
A
2413 struct ucred *cr;
2414 struct proc *p;
2415{
2416 register struct uio *uiop;
2417 register struct vnode *vp;
2418 struct nfsnode *np;
2419 struct nfsmount *nmp;
2420 int error = 0, diff, len, iomode, must_commit = 0;
2421 struct uio uio;
2422 struct iovec io;
2423
55e303ae 2424 vp = bp->nb_vp;
1c79356b
A
2425 np = VTONFS(vp);
2426 nmp = VFSTONFS(vp->v_mount);
2427 uiop = &uio;
2428 uiop->uio_iov = &io;
2429 uiop->uio_iovcnt = 1;
2430 uiop->uio_segflg = UIO_SYSSPACE;
2431 uiop->uio_procp = p;
2432
55e303ae
A
2433 /*
2434 * we've decided to perform I/O for this block,
 2435 * so it can't possibly be NB_DONE. So, clear it.
1c79356b 2436 */
55e303ae
A
2437 if (ISSET(bp->nb_flags, NB_DONE)) {
2438 if (!ISSET(bp->nb_flags, NB_ASYNC))
1c79356b 2439 panic("nfs_doio: done and not async");
55e303ae 2440 CLR(bp->nb_flags, NB_DONE);
1c79356b 2441 }
55e303ae
A
2442 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2443 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2444 bp->nb_dirtyend);
2445
2446 if (ISSET(bp->nb_flags, NB_READ)) {
2447 if (vp->v_type == VREG)
2448 NFS_BUF_MAP(bp);
2449 io.iov_len = uiop->uio_resid = bp->nb_bufsize;
2450 io.iov_base = bp->nb_data;
1c79356b
A
2451 uiop->uio_rw = UIO_READ;
2452 switch (vp->v_type) {
2453 case VREG:
55e303ae 2454 uiop->uio_offset = NBOFF(bp);
1c79356b
A
2455 nfsstats.read_bios++;
2456 error = nfs_readrpc(vp, uiop, cr);
55e303ae 2457 FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error);
1c79356b 2458 if (!error) {
55e303ae
A
2459 /* update valid range */
2460 bp->nb_validoff = 0;
1c79356b
A
2461 if (uiop->uio_resid) {
2462 /*
2463 * If len > 0, there is a hole in the file and
2464 * no writes after the hole have been pushed to
2465 * the server yet.
2466 * Just zero fill the rest of the valid area.
2467 */
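				/*
				 * For example (illustrative values only): for an 8192-byte
				 * buffer at NBOFF(bp) == 8192, a short read that returns 4096
				 * bytes (uio_resid == 4096) against np->n_size == 14000 gives
				 * diff == 4096 and len == 1712, so bytes 4096..5807 are zeroed
				 * and nb_validend becomes 5808.
				 */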
55e303ae
A
2468 diff = bp->nb_bufsize - uiop->uio_resid;
2469 len = np->n_size - (NBOFF(bp) + diff);
fa4905b1
A
2470 if (len > 0) {
2471 len = min(len, uiop->uio_resid);
55e303ae
A
2472 bzero((char *)bp->nb_data + diff, len);
2473 bp->nb_validend = diff + len;
fa4905b1
A
2474 FSDBG(258, diff, len, 0, 1);
2475 } else
55e303ae 2476 bp->nb_validend = diff;
1c79356b 2477 } else
55e303ae
A
2478 bp->nb_validend = bp->nb_bufsize;
2479 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2480 if (bp->nb_validend & PAGE_MASK) {
2481 /* valid range ends in the middle of a page so we */
2482 /* need to zero-fill any invalid data at the end */
2483 /* of the last page */
2484 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2485 bp->nb_bufsize - bp->nb_validend);
2486 FSDBG(258, bp->nb_validend,
2487 bp->nb_bufsize - bp->nb_validend, 0, 2);
1c79356b 2488 }
1c79356b
A
2489 }
2490 if (p && (vp->v_flag & VTEXT) &&
2491 (((nmp->nm_flag & NFSMNT_NQNFS) &&
2492 NQNFS_CKINVALID(vp, np, ND_READ) &&
2493 np->n_lrev != np->n_brev) ||
2494 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
2495 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
2496 uprintf("Process killed due to text file modification\n");
2497 psignal(p, SIGKILL);
2498 p->p_flag |= P_NOSWAP;
2499 }
2500 break;
2501 case VLNK:
2502 uiop->uio_offset = (off_t)0;
2503 nfsstats.readlink_bios++;
2504 error = nfs_readlinkrpc(vp, uiop, cr);
55e303ae
A
2505 if (!error) {
2506 bp->nb_validoff = 0;
2507 bp->nb_validend = uiop->uio_offset;
2508 }
1c79356b
A
2509 break;
2510 case VDIR:
2511 nfsstats.readdir_bios++;
55e303ae 2512 uiop->uio_offset = NBOFF(bp);
1c79356b
A
2513 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2514 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2515 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2516 error = nfs_readdirplusrpc(vp, uiop, cr);
2517 if (error == NFSERR_NOTSUPP)
2518 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2519 }
2520 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2521 error = nfs_readdirrpc(vp, uiop, cr);
55e303ae
A
2522 if (!error) {
2523 bp->nb_validoff = 0;
2524 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2525 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2526 }
1c79356b
A
2527 break;
2528 default:
fa4905b1 2529 printf("nfs_doio: type %x unexpected\n", vp->v_type);
1c79356b
A
2530 break;
2531 };
2532 if (error) {
55e303ae
A
2533 SET(bp->nb_flags, NB_ERROR);
2534 bp->nb_error = error;
1c79356b 2535 }
55e303ae 2536
1c79356b 2537 } else {
55e303ae
A
2538 /* we're doing a write */
2539 int doff, dend = 0;
2540
2541 /* We need to make sure the pages are locked before doing I/O. */
2542 if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) {
2543 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2544 error = nfs_buf_upl_setup(bp);
2545 if (error) {
2546 printf("nfs_doio: upl create failed %d\n", error);
2547 SET(bp->nb_flags, NB_ERROR);
2548 bp->nb_error = EIO;
2549 return (EIO);
2550 }
2551 nfs_buf_upl_check(bp);
2552 }
2553 }
2554
2555 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2556 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2557 /*
2558 * There are pages marked dirty that need to be written out.
2559 *
2560 * We don't want to just combine the write range with the
2561 * range of pages that are dirty because that could cause us
 2562 * to write out data that wasn't actually modified.
2563 * We also don't want to write data more than once.
2564 *
2565 * If the dirty range just needs to be committed, we do that.
2566 * Otherwise, we write the dirty range and clear the dirty bits
2567 * for any COMPLETE pages covered by that range.
2568 * If there are dirty pages left after that, we write out the
2569 * parts that we haven't written yet.
2570 */
2571 }
2572
fa4905b1 2573 /*
55e303ae
A
2574 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
2575 * an actual write will have to be done.
2576 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
fa4905b1 2577 */
55e303ae
A
2578 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
2579 doff = NBOFF(bp) + bp->nb_dirtyoff;
2580 SET(bp->nb_flags, NB_WRITEINPROG);
2581 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
2582 bp->nb_wcred, bp->nb_proc);
2583 CLR(bp->nb_flags, NB_WRITEINPROG);
2584 if (!error) {
2585 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2586 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2587 np->n_needcommitcnt--;
2588 CHECK_NEEDCOMMITCNT(np);
2589 } else if (error == NFSERR_STALEWRITEVERF)
2590 nfs_clearcommit(vp->v_mount);
fa4905b1 2591 }
1c79356b 2592
55e303ae
A
2593 if (!error && bp->nb_dirtyend > 0) {
2594 /* there's a dirty range that needs to be written out */
2595 u_int32_t pagemask;
2596 int firstpg, lastpg;
2597
2598 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
2599 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2600
2601 NFS_BUF_MAP(bp);
2602
2603 doff = bp->nb_dirtyoff;
2604 dend = bp->nb_dirtyend;
2605
2606 /* if doff page is dirty, move doff to start of page */
2607 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
2608 doff -= doff & PAGE_MASK;
2609 /* try to expand write range to include preceding dirty pages */
2610 if (!(doff & PAGE_MASK))
2611 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
2612 doff -= PAGE_SIZE;
2613 /* if dend page is dirty, move dend to start of next page */
2614 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
2615 dend = round_page_32(dend);
2616 /* try to expand write range to include trailing dirty pages */
2617 if (!(dend & PAGE_MASK))
2618 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
2619 dend += PAGE_SIZE;
2620 /* make sure to keep dend clipped to EOF */
2621 if (NBOFF(bp) + dend > np->n_size)
2622 dend = np->n_size - NBOFF(bp);
2623 /* calculate range of complete pages being written */
2624 firstpg = round_page_32(doff) / PAGE_SIZE;
2625 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
2626 /* calculate mask for that page range */
2627 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
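			/*
			 * For example (illustrative values only, PAGE_SIZE == 4096):
			 * doff == 4096 and dend == 16384 give firstpg == 1 and
			 * lastpg == 3, so pagemask == 0x0e, i.e. pages 1-3 are the
			 * complete pages covered by this write.
			 */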
2628
2629 /* compare page mask to nb_dirty; if there are other dirty pages */
2630 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
 2631 /* not needcommit/nocache/stable; otherwise write FILESYNC */
2632 if (bp->nb_dirty & ~pagemask)
2633 iomode = NFSV3WRITE_FILESYNC;
2634 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
1c79356b
A
2635 iomode = NFSV3WRITE_UNSTABLE;
2636 else
2637 iomode = NFSV3WRITE_FILESYNC;
55e303ae
A
2638
2639 /* write the dirty range */
2640 io.iov_len = uiop->uio_resid = dend - doff;
2641 uiop->uio_offset = NBOFF(bp) + doff;
2642 io.iov_base = (char *)bp->nb_data + doff;
2643 uiop->uio_rw = UIO_WRITE;
2644
2645 nfsstats.write_bios++;
2646
2647 SET(bp->nb_flags, NB_WRITEINPROG);
1c79356b 2648 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
55e303ae
A
2649 if (must_commit)
2650 nfs_clearcommit(vp->v_mount);
2651 /* clear dirty bits for pages we've written */
2652 if (!error)
2653 bp->nb_dirty &= ~pagemask;
2654 /* set/clear needcommit flag */
2655 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
2656 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2657 np->n_needcommitcnt++;
2658 SET(bp->nb_flags, NB_NEEDCOMMIT);
2659 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2660 bp->nb_dirtyoff = doff;
2661 bp->nb_dirtyend = dend;
2662 } else {
2663 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2664 np->n_needcommitcnt--;
2665 CHECK_NEEDCOMMITCNT(np);
2666 }
2667 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2668 }
2669 CLR(bp->nb_flags, NB_WRITEINPROG);
1c79356b 2670 /*
55e303ae
A
2671 * For an interrupted write, the buffer is still valid and the write
2672 * hasn't been pushed to the server yet, so we can't set NB_ERROR and
2673 * report the interruption by setting NB_EINTR. For the NB_ASYNC case,
2674 * NB_EINTR is not relevant.
2675 *
2676 * For the case of a V3 write rpc not being committed to stable
2677 * storage, the block is still dirty and requires either a commit rpc
2678 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
2679 * block is reused. This is indicated by setting the NB_DELWRI and
2680 * NB_NEEDCOMMIT flags.
1c79356b 2681 */
55e303ae
A
2682 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
2683 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
2684 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2685 SET(bp->nb_flags, NB_DELWRI);
2686 nfs_nbdwrite++;
2687 NFSBUFCNTCHK();
2688 }
2689 FSDBG(261, bp->nb_validoff, bp->nb_validend,
2690 bp->nb_bufsize, 0);
2691 /*
2692 * Since for the NB_ASYNC case, nfs_bwrite() has
2693 * reassigned the buffer to the clean list, we have to
2694 * reassign it back to the dirty one. Ugh.
2695 */
2696 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2697 /* move to dirty list */
2698 int s = splbio();
2699 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2700 LIST_REMOVE(bp, nb_vnbufs);
2701 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2702 splx(s);
2703 } else {
2704 SET(bp->nb_flags, NB_EINTR);
2705 }
1c79356b 2706 } else {
55e303ae 2707 /* either there's an error or we don't need to commit */
1c79356b 2708 if (error) {
55e303ae
A
2709 SET(bp->nb_flags, NB_ERROR);
2710 bp->nb_error = np->n_error = error;
2711 np->n_flag |= NWRITEERR;
1c79356b 2712 }
55e303ae
A
2713 /* clear the dirty range */
2714 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
1c79356b 2715 }
55e303ae
A
2716 }
2717
2718 if (!error && bp->nb_dirty) {
2719 /* there are pages marked dirty that need to be written out */
2720 int pg, cnt, npages, off, len;
2721
2722 nfsstats.write_bios++;
1c79356b 2723
55e303ae
A
2724 NFS_BUF_MAP(bp);
2725
2726 /*
2727 * we do these writes synchronously because we can't really
 2728 * support the unstable/needcommit method. We could write
2729 * them unstable, clear the dirty bits, and then commit the
2730 * whole block later, but if we need to rewrite the data, we
2731 * won't have any idea which pages were written because that
2732 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
2733 * also can't leave the dirty bits set because then we wouldn't
2734 * be able to tell if the pages were re-dirtied between the end
2735 * of the write and the commit.
2736 */
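		/*
		 * For example (illustrative values only, PAGE_SIZE == 4096, an
		 * 8-page buffer with pages 1, 2, 4 and 7 still dirty): the loop
		 * below issues three FILESYNC writes, one for pages 1-2 (offset
		 * 4096, length 8192) and one each for pages 4 and 7, clipping
		 * the last write to EOF if necessary.
		 */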
2737 iomode = NFSV3WRITE_FILESYNC;
2738 uiop->uio_rw = UIO_WRITE;
2739
2740 SET(bp->nb_flags, NB_WRITEINPROG);
2741 npages = bp->nb_bufsize/PAGE_SIZE;
2742 for (pg=0; pg < npages; pg++) {
2743 if (!NBPGDIRTY(bp,pg))
2744 continue;
2745 cnt = 1;
2746 while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt))
2747 cnt++;
2748 /* write cnt pages starting with page pg */
2749 off = pg * PAGE_SIZE;
2750 len = cnt * PAGE_SIZE;
2751
2752 /* clip writes to EOF */
2753 if (NBOFF(bp) + off + len > np->n_size)
2754 len -= (NBOFF(bp) + off + len) - np->n_size;
2755 if (len > 0) {
2756 io.iov_len = uiop->uio_resid = len;
2757 uiop->uio_offset = NBOFF(bp) + off;
2758 io.iov_base = (char *)bp->nb_data + off;
2759 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2760 if (must_commit)
2761 nfs_clearcommit(vp->v_mount);
2762 if (error)
2763 break;
2764 }
2765 /* clear dirty bits */
2766 while (cnt--) {
2767 bp->nb_dirty &= ~(1 << pg);
2768 /* leave pg on last page */
2769 if (cnt) pg++;
2770 }
fa4905b1 2771 }
55e303ae
A
2772 if (!error) {
2773 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2774 np->n_needcommitcnt--;
2775 CHECK_NEEDCOMMITCNT(np);
2776 }
2777 CLR(bp->nb_flags, NB_NEEDCOMMIT);
fa4905b1 2778 }
55e303ae
A
2779 CLR(bp->nb_flags, NB_WRITEINPROG);
2780 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
fa4905b1 2781 np->n_size);
1c79356b 2782 }
1c79356b 2783
55e303ae
A
2784 if (error) {
2785 SET(bp->nb_flags, NB_ERROR);
2786 bp->nb_error = error;
2787 }
1c79356b 2788 }
1c79356b 2789
55e303ae
A
2790 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
2791
2792 nfs_buf_iodone(bp);
1c79356b
A
2793 return (error);
2794}