1 /*
2 * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1989, 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * This code is derived from software contributed to Berkeley by
28 * Rick Macklem at The University of Guelph.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
59 * FreeBSD-Id: nfs_bio.c,v 1.44 1997/09/10 19:52:25 phk Exp $
60 */
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/resourcevar.h>
64 #include <sys/signalvar.h>
65 #include <sys/proc.h>
66 #include <sys/malloc.h>
67 #include <sys/vnode.h>
68 #include <sys/dirent.h>
69 #include <sys/mount.h>
70 #include <sys/kernel.h>
71 #include <sys/sysctl.h>
72 #include <sys/ubc.h>
73
74 #include <sys/vm.h>
75 #include <sys/vmparam.h>
76
77 #include <sys/time.h>
78 #include <kern/clock.h>
79
80 #include <nfs/rpcv2.h>
81 #include <nfs/nfsproto.h>
82 #include <nfs/nfs.h>
83 #include <nfs/nfsmount.h>
84 #include <nfs/nqnfs.h>
85 #include <nfs/nfsnode.h>
86
87 #include <sys/kdebug.h>
88
89 #define FSDBG(A, B, C, D, E) \
90 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_NONE, \
91 (int)(B), (int)(C), (int)(D), (int)(E), 0)
92 #define FSDBG_TOP(A, B, C, D, E) \
93 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_START, \
94 (int)(B), (int)(C), (int)(D), (int)(E), 0)
95 #define FSDBG_BOT(A, B, C, D, E) \
96 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, (A))) | DBG_FUNC_END, \
97 (int)(B), (int)(C), (int)(D), (int)(E), 0)
98
99 extern int nfs_numasync;
100 extern int nfs_ioddelwri;
101 extern struct nfsstats nfsstats;
102
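/*
 * nfsbuf hash chains are indexed by combining the vnode pointer (scaled
 * down by its size) with the logical block number, masked with nfsbufhash
 * (the table-size mask returned by hashinit()).
 */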
103 #define NFSBUFHASH(dvp, lbn) \
104 (&nfsbufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & nfsbufhash])
105 LIST_HEAD(nfsbufhashhead, nfsbuf) *nfsbufhashtbl;
106 struct nfsbuffreehead nfsbuffree, nfsbuffreemeta, nfsbufdelwri;
107 u_long nfsbufhash;
108 int nfsbufhashlock, nfsbufcnt, nfsbufmin, nfsbufmax;
109 int nfsbuffreecnt, nfsbuffreemetacnt, nfsbufdelwricnt, nfsneedbuffer;
110 int nfs_nbdwrite;
111 time_t nfsbuffreeuptimestamp;
112
113 #define NFSBUFWRITE_THROTTLE 9
114 #define NFSBUF_LRU_STALE 120
115 #define NFSBUF_META_STALE 240
116
117 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list */
118 #define LRU_TO_FREEUP 6
119 /* number of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list */
120 #define META_TO_FREEUP 3
121 /* total number of nfsbufs nfs_buf_freeup() should attempt to free */
122 #define TOTAL_TO_FREEUP (LRU_TO_FREEUP+META_TO_FREEUP)
123 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffree list when called from nfs_timer() */
124 #define LRU_FREEUP_FRAC_ON_TIMER 8
125 /* fraction of nfsbufs nfs_buf_freeup() should attempt to free from nfsbuffreemeta list when called from nfs_timer() */
126 #define META_FREEUP_FRAC_ON_TIMER 16
127 /* fraction of total nfsbufs that nfsbuffreecnt should exceed before bothering to call nfs_buf_freeup() */
128 #define LRU_FREEUP_MIN_FRAC 4
129 /* fraction of total nfsbufs that nfsbuffreemetacnt should exceed before bothering to call nfs_buf_freeup() */
130 #define META_FREEUP_MIN_FRAC 2
131
132 #define NFS_BUF_FREEUP() \
133 do { \
134 /* only call nfs_buf_freeup() if it has work to do: */ \
135 if (((nfsbuffreecnt > nfsbufcnt/LRU_FREEUP_MIN_FRAC) || \
136 (nfsbuffreemetacnt > nfsbufcnt/META_FREEUP_MIN_FRAC)) && \
137 ((nfsbufcnt - TOTAL_TO_FREEUP) > nfsbufmin)) \
138 nfs_buf_freeup(0); \
139 } while (0)
140
141 /*
142 * Initialize nfsbuf lists
143 */
144 void
145 nfs_nbinit(void)
146 {
147 nfsbufhashlock = 0;
148 nfsbufhashtbl = hashinit(nbuf, M_TEMP, &nfsbufhash);
149 TAILQ_INIT(&nfsbuffree);
150 TAILQ_INIT(&nfsbuffreemeta);
151 TAILQ_INIT(&nfsbufdelwri);
152 nfsbufcnt = nfsbuffreecnt = nfsbuffreemetacnt = nfsbufdelwricnt = 0;
153 nfsbufmin = 128; // XXX tune me!
154 nfsbufmax = 8192; // XXX tune me!
155 nfsneedbuffer = 0;
156 nfs_nbdwrite = 0;
157 nfsbuffreeuptimestamp = 0;
158 }
159
160 /*
161 * try to free up some excess, unused nfsbufs
162 */
163 void
164 nfs_buf_freeup(int timer)
165 {
166 struct nfsbuf *fbp;
167 struct timeval now;
168 int count;
169
170 microuptime(&now);
171 nfsbuffreeuptimestamp = now.tv_sec;
172
    173 	count = timer ? nfsbuffreecnt/LRU_FREEUP_FRAC_ON_TIMER : LRU_TO_FREEUP;
    174 	FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count);
175 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
176 fbp = TAILQ_FIRST(&nfsbuffree);
177 if (!fbp)
178 break;
179 if ((fbp->nb_timestamp + (2*NFSBUF_LRU_STALE)) > now.tv_sec)
180 break;
181 nfs_buf_remfree(fbp);
182 /* disassociate buffer from any vnode */
183 if (fbp->nb_vp) {
184 struct vnode *oldvp;
185 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
186 LIST_REMOVE(fbp, nb_vnbufs);
187 fbp->nb_vnbufs.le_next = NFSNOLIST;
188 }
189 oldvp = fbp->nb_vp;
190 fbp->nb_vp = NULL;
191 HOLDRELE(oldvp);
192 }
193 LIST_REMOVE(fbp, nb_hash);
194 /* nuke any creds */
195 if (fbp->nb_rcred != NOCRED)
196 crfree(fbp->nb_rcred);
197 if (fbp->nb_wcred != NOCRED)
198 crfree(fbp->nb_wcred);
199 /* if buf was NB_META, dump buffer */
200 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
201 FREE(fbp->nb_data, M_TEMP);
202 }
203 FREE(fbp, M_TEMP);
204 nfsbufcnt--;
205 }
206
207 count = timer ? nfsbuffreemetacnt/META_FREEUP_FRAC_ON_TIMER : META_TO_FREEUP;
208 while ((nfsbufcnt > nfsbufmin) && (count-- > 0)) {
209 fbp = TAILQ_FIRST(&nfsbuffreemeta);
210 if (!fbp)
211 break;
212 if ((fbp->nb_timestamp + (2*NFSBUF_META_STALE)) > now.tv_sec)
213 break;
214 nfs_buf_remfree(fbp);
215 /* disassociate buffer from any vnode */
216 if (fbp->nb_vp) {
217 struct vnode *oldvp;
218 if (fbp->nb_vnbufs.le_next != NFSNOLIST) {
219 LIST_REMOVE(fbp, nb_vnbufs);
220 fbp->nb_vnbufs.le_next = NFSNOLIST;
221 }
222 oldvp = fbp->nb_vp;
223 fbp->nb_vp = NULL;
224 HOLDRELE(oldvp);
225 }
226 LIST_REMOVE(fbp, nb_hash);
227 /* nuke any creds */
228 if (fbp->nb_rcred != NOCRED)
229 crfree(fbp->nb_rcred);
230 if (fbp->nb_wcred != NOCRED)
231 crfree(fbp->nb_wcred);
232 /* if buf was NB_META, dump buffer */
233 if (ISSET(fbp->nb_flags, NB_META) && fbp->nb_data) {
234 FREE(fbp->nb_data, M_TEMP);
235 }
236 FREE(fbp, M_TEMP);
237 nfsbufcnt--;
238 }
239 FSDBG(320, nfsbufcnt, nfsbuffreecnt, nfsbuffreemetacnt, count);
240 }
241
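/*
 * Remove an nfsbuf from whichever free list it is currently on
 * (delayed-write, free-meta, or free), keeping the matching count in sync.
 */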
242 void
243 nfs_buf_remfree(struct nfsbuf *bp)
244 {
245 if (bp->nb_free.tqe_next == NFSNOLIST)
246 panic("nfsbuf not on free list");
247 if (ISSET(bp->nb_flags, NB_DELWRI)) {
248 nfsbufdelwricnt--;
249 TAILQ_REMOVE(&nfsbufdelwri, bp, nb_free);
250 } else if (ISSET(bp->nb_flags, NB_META) && !ISSET(bp->nb_flags, NB_INVAL)) {
251 nfsbuffreemetacnt--;
252 TAILQ_REMOVE(&nfsbuffreemeta, bp, nb_free);
253 } else {
254 nfsbuffreecnt--;
255 TAILQ_REMOVE(&nfsbuffree, bp, nb_free);
256 }
257 bp->nb_free.tqe_next = NFSNOLIST;
258 NFSBUFCNTCHK();
259 }
260
261 /*
262 * check for existence of nfsbuf in cache
263 */
264 struct nfsbuf *
265 nfs_buf_incore(struct vnode *vp, daddr_t blkno)
266 {
267 /* Search hash chain */
268 struct nfsbuf * bp = NFSBUFHASH(vp, blkno)->lh_first;
269 for (; bp != NULL; bp = bp->nb_hash.le_next)
270 if (bp->nb_lblkno == blkno && bp->nb_vp == vp) {
271 if (!ISSET(bp->nb_flags, NB_INVAL)) {
272 FSDBG(547, bp, blkno, bp->nb_flags, bp->nb_vp);
273 return (bp);
274 }
275 }
276 return (NULL);
277 }
278
279 /*
280 * Check if it's OK to drop a page.
281 *
282 * Called by vnode_pager() on pageout request of non-dirty page.
283 * We need to make sure that it's not part of a delayed write.
284 * If it is, we can't let the VM drop it because we may need it
285 * later when/if we need to write the data (again).
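 *
 * Returns 0 if the page can safely be dropped, or EBUSY if it must be kept.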
286 */
287 int
288 nfs_buf_page_inval(struct vnode *vp, off_t offset)
289 {
290 struct nfsbuf *bp;
291 bp = nfs_buf_incore(vp, ubc_offtoblk(vp, offset));
292 if (!bp)
293 return (0);
294 FSDBG(325, bp, bp->nb_flags, bp->nb_dirtyoff, bp->nb_dirtyend);
295 if (ISSET(bp->nb_flags, NB_BUSY))
296 return (EBUSY);
297 /*
298 * If there's a dirty range in the buffer, check to
299 * see if this page intersects with the dirty range.
300 * If it does, we can't let the pager drop the page.
301 */
302 if (bp->nb_dirtyend > 0) {
303 int start = offset - NBOFF(bp);
304 if (bp->nb_dirtyend <= start ||
305 bp->nb_dirtyoff >= (start + PAGE_SIZE))
306 return (0);
307 return (EBUSY);
308 }
309 return (0);
310 }
311
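/*
 * Create a UPL (page list) for the buffer's range in the vnode's VM object
 * so its pages can later be mapped, committed, or aborted individually.
 */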
312 int
313 nfs_buf_upl_setup(struct nfsbuf *bp)
314 {
315 kern_return_t kret;
316 upl_t upl;
317 int s;
318
319 if (ISSET(bp->nb_flags, NB_PAGELIST))
320 return (0);
321
322 kret = ubc_create_upl(bp->nb_vp, NBOFF(bp), bp->nb_bufsize,
323 &upl, NULL, UPL_PRECIOUS);
324 if (kret == KERN_INVALID_ARGUMENT) {
325 /* vm object probably doesn't exist any more */
326 bp->nb_pagelist = NULL;
327 return (EINVAL);
328 }
329 if (kret != KERN_SUCCESS) {
330 printf("nfs_buf_upl_setup(): failed to get pagelist %d\n", kret);
331 bp->nb_pagelist = NULL;
332 return (EIO);
333 }
334
335 FSDBG(538, bp, NBOFF(bp), bp->nb_bufsize, bp->nb_vp);
336
337 s = splbio();
338 bp->nb_pagelist = upl;
339 SET(bp->nb_flags, NB_PAGELIST);
340 splx(s);
341 return (0);
342 }
343
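/*
 * Scan a newly created UPL's page info to initialize the buffer's
 * nb_valid/nb_dirty page bitmaps and NB_CACHE/NB_WASDIRTY flags,
 * clipping the valid range at the current end of file.
 */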
344 void
345 nfs_buf_upl_check(struct nfsbuf *bp)
346 {
347 upl_page_info_t *pl;
348 off_t filesize, fileoffset;
349 int i, npages;
350
351 if (!ISSET(bp->nb_flags, NB_PAGELIST))
352 return;
353
354 npages = round_page_32(bp->nb_bufsize) / PAGE_SIZE;
355 filesize = ubc_getsize(bp->nb_vp);
356 fileoffset = NBOFF(bp);
357 if (fileoffset < filesize)
358 SET(bp->nb_flags, NB_CACHE);
359 else
360 CLR(bp->nb_flags, NB_CACHE);
361
362 pl = ubc_upl_pageinfo(bp->nb_pagelist);
363 bp->nb_valid = bp->nb_dirty = 0;
364
365 for (i=0; i < npages; i++, fileoffset += PAGE_SIZE_64) {
366 /* anything beyond the end of the file is not valid or dirty */
367 if (fileoffset >= filesize)
368 break;
369 if (!upl_valid_page(pl, i)) {
370 CLR(bp->nb_flags, NB_CACHE);
371 continue;
372 }
373 NBPGVALID_SET(bp,i);
374 if (upl_dirty_page(pl, i)) {
375 NBPGDIRTY_SET(bp, i);
376 if (!ISSET(bp->nb_flags, NB_WASDIRTY))
377 SET(bp->nb_flags, NB_WASDIRTY);
378 }
379 }
380 fileoffset = NBOFF(bp);
381 if (ISSET(bp->nb_flags, NB_CACHE)) {
382 bp->nb_validoff = 0;
383 bp->nb_validend = bp->nb_bufsize;
384 if (fileoffset + bp->nb_validend > filesize)
385 bp->nb_validend = filesize - fileoffset;
386 } else {
387 bp->nb_validoff = bp->nb_validend = -1;
388 }
389 FSDBG(539, bp, fileoffset, bp->nb_valid, bp->nb_dirty);
390 FSDBG(539, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
391 }
392
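/*
 * Map the buffer's UPL into kernel virtual memory and record the address
 * in nb_data.  The buffer must already have a page list (NB_PAGELIST).
 */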
393 static int
394 nfs_buf_map(struct nfsbuf *bp)
395 {
396 kern_return_t kret;
397
398 if (bp->nb_data)
399 return (0);
400 if (!ISSET(bp->nb_flags, NB_PAGELIST))
401 return (EINVAL);
402
403 kret = ubc_upl_map(bp->nb_pagelist, (vm_address_t *)&(bp->nb_data));
404 if (kret != KERN_SUCCESS)
405 panic("nfs_buf_map: ubc_upl_map() failed with (%d)", kret);
406 if (bp->nb_data == 0)
407 panic("ubc_upl_map mapped 0");
408 FSDBG(540, bp, bp->nb_flags, NBOFF(bp), bp->nb_data);
409 return (0);
410 }
411
412 /*
413 * check range of pages in nfsbuf's UPL for validity
414 */
415 static int
416 nfs_buf_upl_valid_range(struct nfsbuf *bp, int off, int size)
417 {
418 off_t fileoffset, filesize;
419 int pg, lastpg;
420 upl_page_info_t *pl;
421
422 if (!ISSET(bp->nb_flags, NB_PAGELIST))
423 return (0);
424 pl = ubc_upl_pageinfo(bp->nb_pagelist);
425
426 size += off & PAGE_MASK;
427 off &= ~PAGE_MASK;
428 fileoffset = NBOFF(bp);
429 filesize = VTONFS(bp->nb_vp)->n_size;
430 if ((fileoffset + off + size) > filesize)
431 size = filesize - (fileoffset + off);
432
433 pg = off/PAGE_SIZE;
434 lastpg = (off + size - 1)/PAGE_SIZE;
435 while (pg <= lastpg) {
436 if (!upl_valid_page(pl, pg))
437 return (0);
438 pg++;
439 }
440 return (1);
441 }
442
443 /*
444 * normalize an nfsbuf's valid range
445 *
446 * the read/write code guarantees that we'll always have a valid
447 * region that is an integral number of pages. If either end
448 * of the valid range isn't page-aligned, it gets corrected
449 * here as we extend the valid range through all of the
450 * contiguous valid pages.
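 *
 * For example (4K pages, illustrative): if only pages 1 and 2 are valid
 * and validoff/validend come in as 5000/9000, validoff is pulled back to
 * 4096 and validend is pushed forward to 12288 (then clipped at EOF).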
451 */
452 static void
453 nfs_buf_normalize_valid_range(struct nfsnode *np, struct nfsbuf *bp)
454 {
455 int pg, npg;
456 /* pull validoff back to start of contiguous valid page range */
457 pg = bp->nb_validoff/PAGE_SIZE;
458 while (pg >= 0 && NBPGVALID(bp,pg))
459 pg--;
460 bp->nb_validoff = (pg+1) * PAGE_SIZE;
461 /* push validend forward to end of contiguous valid page range */
462 npg = bp->nb_bufsize/PAGE_SIZE;
463 pg = bp->nb_validend/PAGE_SIZE;
464 while (pg < npg && NBPGVALID(bp,pg))
465 pg++;
466 bp->nb_validend = pg * PAGE_SIZE;
467 /* clip to EOF */
468 if (NBOFF(bp) + bp->nb_validend > np->n_size)
469 bp->nb_validend = np->n_size % bp->nb_bufsize;
470 }
471
472 /*
473 * try to push out some delayed/uncommitted writes
474 */
475 static void
476 nfs_buf_delwri_push(void)
477 {
478 struct nfsbuf *bp;
479 int i;
480
481 if (TAILQ_EMPTY(&nfsbufdelwri))
482 return;
483
484 /* first try to tell the nfsiods to do it */
485 if (nfs_asyncio(NULL, NULL) == 0)
486 return;
487
488 /* otherwise, try to do some of the work ourselves */
489 i = 0;
490 while (i < 8 && (bp = TAILQ_FIRST(&nfsbufdelwri)) != NULL) {
491 struct nfsnode *np = VTONFS(bp->nb_vp);
492 nfs_buf_remfree(bp);
493 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
494 /* put buffer at end of delwri list */
495 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
496 nfsbufdelwricnt++;
497 nfs_flushcommits(np->n_vnode, (struct proc *)0);
498 } else {
499 SET(bp->nb_flags, (NB_BUSY | NB_ASYNC));
500 nfs_buf_write(bp);
501 }
502 i++;
503 }
504 }
505
506 /*
507 * Get an nfs cache block.
508 * Allocate a new one if the block isn't currently in the cache
509 * and return the block marked busy. If the calling process is
510 * interrupted by a signal for an interruptible mount point, return
511 * NULL.
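 *
 * The operation argument (BLK_READ, BLK_WRITE, or BLK_META) determines how
 * the buffer's storage is set up: BLK_META buffers use a malloc'd data area,
 * while BLK_READ/BLK_WRITE buffers are backed by a UPL over the vnode's pages.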
512 */
513 struct nfsbuf *
514 nfs_buf_get(
515 struct vnode *vp,
516 daddr_t blkno,
517 int size,
518 struct proc *p,
519 int operation)
520 {
521 struct nfsnode *np = VTONFS(vp);
522 struct nfsbuf *bp;
523 int i, biosize, bufsize, rv;
524 struct ucred *cred;
525 int slpflag = PCATCH;
526
527 FSDBG_TOP(541, vp, blkno, size, operation);
528
529 bufsize = size;
530 if (bufsize > MAXBSIZE)
531 panic("nfs_buf_get: buffer larger than MAXBSIZE requested");
532
533 biosize = vp->v_mount->mnt_stat.f_iosize;
534
535 if (UBCINVALID(vp) || !UBCINFOEXISTS(vp))
536 operation = BLK_META;
537 else if (bufsize < biosize)
538 /* reg files should always have biosize blocks */
539 bufsize = biosize;
540
541 /* if BLK_WRITE, check for too many delayed/uncommitted writes */
542 if ((operation == BLK_WRITE) && (nfs_nbdwrite > ((nfsbufcnt*3)/4))) {
543 FSDBG_TOP(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
544
545 /* poke the delwri list */
546 nfs_buf_delwri_push();
547
548 /* sleep to let other threads run... */
549 tsleep(&nfs_nbdwrite, PCATCH, "nfs_nbdwrite", 1);
550 FSDBG_BOT(542, vp, blkno, nfs_nbdwrite, ((nfsbufcnt*3)/4));
551 }
552
553 loop:
554 /*
555 * Obtain a lock to prevent a race condition if the
556 * MALLOC() below happens to block.
557 */
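	/* nfsbufhashlock: 0 = unlocked, 1 = locked, -1 = locked with waiters */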
558 if (nfsbufhashlock) {
559 while (nfsbufhashlock) {
560 nfsbufhashlock = -1;
561 tsleep(&nfsbufhashlock, PCATCH, "nfsbufget", 0);
562 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))
563 return (NULL);
564 }
565 goto loop;
566 }
567 nfsbufhashlock = 1;
568
569 /* check for existence of nfsbuf in cache */
    570 	if ((bp = nfs_buf_incore(vp, blkno)) != NULL) {
571 /* if busy, set wanted and wait */
572 if (ISSET(bp->nb_flags, NB_BUSY)) {
573 FSDBG_TOP(543, vp, blkno, bp, bp->nb_flags);
574 SET(bp->nb_flags, NB_WANTED);
575 /* unlock hash */
576 if (nfsbufhashlock < 0) {
577 nfsbufhashlock = 0;
578 wakeup(&nfsbufhashlock);
579 } else
580 nfsbufhashlock = 0;
581 tsleep(bp, slpflag|(PRIBIO+1), "nfsbufget", (slpflag == PCATCH) ? 0 : 2*hz);
582 slpflag = 0;
583 FSDBG_BOT(543, vp, blkno, bp, bp->nb_flags);
584 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
585 FSDBG_BOT(541, vp, blkno, 0, EINTR);
586 return (NULL);
587 }
588 goto loop;
589 }
590 if (bp->nb_bufsize != bufsize)
591 panic("nfsbuf size mismatch");
592 SET(bp->nb_flags, (NB_BUSY | NB_CACHE));
593 nfs_buf_remfree(bp);
594 /* additional paranoia: */
595 if (ISSET(bp->nb_flags, NB_PAGELIST))
596 panic("pagelist buffer was not busy");
597 goto buffer_setup;
598 }
599
600 /*
601 * where to get a free buffer:
602 * - alloc new if we haven't reached min bufs
603 * - if free lists are NOT empty
604 * - if free list is stale, use it
605 * - else if freemeta list is stale, use it
606 * - else if max bufs allocated, use least-time-to-stale
607 * - alloc new if we haven't reached max allowed
608 * - start clearing out delwri list and try again
609 */
610
611 if ((nfsbufcnt > nfsbufmin) &&
612 (!TAILQ_EMPTY(&nfsbuffree) || !TAILQ_EMPTY(&nfsbuffreemeta))) {
613 /* try to pull an nfsbuf off a free list */
614 struct nfsbuf *lrubp, *metabp;
615 struct timeval now;
616 microuptime(&now);
617
618 /* if the next LRU or META buffer is stale, use it */
619 lrubp = TAILQ_FIRST(&nfsbuffree);
620 if (lrubp && ((lrubp->nb_timestamp + NFSBUF_LRU_STALE) < now.tv_sec))
621 bp = lrubp;
622 metabp = TAILQ_FIRST(&nfsbuffreemeta);
623 if (!bp && metabp && ((metabp->nb_timestamp + NFSBUF_META_STALE) < now.tv_sec))
624 bp = metabp;
625
626 if (!bp && (nfsbufcnt >= nfsbufmax)) {
627 /* we've already allocated all bufs, so */
628 /* choose the buffer that'll go stale first */
629 if (!metabp)
630 bp = lrubp;
631 else if (!lrubp)
632 bp = metabp;
633 else {
634 int32_t lru_stale_time, meta_stale_time;
635 lru_stale_time = lrubp->nb_timestamp + NFSBUF_LRU_STALE;
636 meta_stale_time = metabp->nb_timestamp + NFSBUF_META_STALE;
637 if (lru_stale_time <= meta_stale_time)
638 bp = lrubp;
639 else
640 bp = metabp;
641 }
642 }
643
644 if (bp) {
645 /* we have a buffer to reuse */
646 FSDBG(544, vp, blkno, bp, bp->nb_flags);
647 nfs_buf_remfree(bp);
648 if (ISSET(bp->nb_flags, NB_DELWRI))
649 panic("nfs_buf_get: delwri");
650 SET(bp->nb_flags, NB_BUSY);
651 /* disassociate buffer from previous vnode */
652 if (bp->nb_vp) {
653 struct vnode *oldvp;
654 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
655 LIST_REMOVE(bp, nb_vnbufs);
656 bp->nb_vnbufs.le_next = NFSNOLIST;
657 }
658 oldvp = bp->nb_vp;
659 bp->nb_vp = NULL;
660 HOLDRELE(oldvp);
661 }
662 LIST_REMOVE(bp, nb_hash);
663 /* nuke any creds we're holding */
664 cred = bp->nb_rcred;
665 if (cred != NOCRED) {
666 bp->nb_rcred = NOCRED;
667 crfree(cred);
668 }
669 cred = bp->nb_wcred;
670 if (cred != NOCRED) {
671 bp->nb_wcred = NOCRED;
672 crfree(cred);
673 }
674 /* if buf will no longer be NB_META, dump old buffer */
675 if ((operation != BLK_META) &&
676 ISSET(bp->nb_flags, NB_META) && bp->nb_data) {
677 FREE(bp->nb_data, M_TEMP);
678 bp->nb_data = NULL;
679 }
680 /* re-init buf fields */
681 bp->nb_error = 0;
682 bp->nb_validoff = bp->nb_validend = -1;
683 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
684 bp->nb_valid = 0;
685 bp->nb_dirty = 0;
686 }
687 }
688
689 if (!bp) {
690 if (nfsbufcnt < nfsbufmax) {
691 /* just alloc a new one */
692 MALLOC(bp, struct nfsbuf *, sizeof(struct nfsbuf), M_TEMP, M_WAITOK);
693 nfsbufcnt++;
694 NFSBUFCNTCHK();
695 /* init nfsbuf */
696 bzero(bp, sizeof(*bp));
697 bp->nb_free.tqe_next = NFSNOLIST;
698 bp->nb_validoff = bp->nb_validend = -1;
699 FSDBG(545, vp, blkno, bp, 0);
700 } else {
701 /* too many bufs... wait for buffers to free up */
702 FSDBG_TOP(546, vp, blkno, nfsbufcnt, nfsbufmax);
703 /* unlock hash */
704 if (nfsbufhashlock < 0) {
705 nfsbufhashlock = 0;
706 wakeup(&nfsbufhashlock);
707 } else
708 nfsbufhashlock = 0;
709
710 /* poke the delwri list */
711 nfs_buf_delwri_push();
712
713 nfsneedbuffer = 1;
714 tsleep(&nfsneedbuffer, PCATCH, "nfsbufget", 0);
715 FSDBG_BOT(546, vp, blkno, nfsbufcnt, nfsbufmax);
716 if (nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p)) {
717 FSDBG_BOT(541, vp, blkno, 0, EINTR);
718 return (NULL);
719 }
720 goto loop;
721 }
722 }
723
724 setup_nfsbuf:
725
726 /* setup nfsbuf */
727 bp->nb_flags = NB_BUSY;
728 bp->nb_lblkno = blkno;
729 /* insert buf in hash */
730 LIST_INSERT_HEAD(NFSBUFHASH(vp, blkno), bp, nb_hash);
731 /* associate buffer with new vnode */
732 VHOLD(vp);
733 bp->nb_vp = vp;
734 LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs);
735
736 buffer_setup:
737
738 switch (operation) {
739 case BLK_META:
740 SET(bp->nb_flags, NB_META);
741 if ((bp->nb_bufsize != bufsize) && bp->nb_data) {
742 FREE(bp->nb_data, M_TEMP);
743 bp->nb_data = NULL;
744 bp->nb_validoff = bp->nb_validend = -1;
745 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
746 bp->nb_valid = 0;
747 bp->nb_dirty = 0;
748 CLR(bp->nb_flags, NB_CACHE);
749 }
750 if (!bp->nb_data)
751 MALLOC(bp->nb_data, caddr_t, bufsize, M_TEMP, M_WAITOK);
752 if (!bp->nb_data)
753 panic("nfs_buf_get: null nb_data");
754 bp->nb_bufsize = bufsize;
755 break;
756
757 case BLK_READ:
758 case BLK_WRITE:
759 if (bufsize < PAGE_SIZE)
760 bufsize = PAGE_SIZE;
761 bp->nb_bufsize = bufsize;
762 bp->nb_validoff = bp->nb_validend = -1;
763
764 if (UBCISVALID(vp)) {
765 /* setup upl */
766 if (nfs_buf_upl_setup(bp)) {
767 /* unable to create upl */
768 /* vm object must no longer exist */
769 /* cleanup buffer and return NULL */
770 LIST_REMOVE(bp, nb_vnbufs);
771 bp->nb_vnbufs.le_next = NFSNOLIST;
772 bp->nb_vp = NULL;
773 /* clear usage timestamp to allow immediate freeing */
774 bp->nb_timestamp = 0;
775 HOLDRELE(vp);
776 if (bp->nb_free.tqe_next != NFSNOLIST)
777 panic("nfsbuf on freelist");
778 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
779 nfsbuffreecnt++;
780 FSDBG_BOT(541, vp, blkno, 0x2bc, EIO);
781 return (NULL);
782 }
783 nfs_buf_upl_check(bp);
784 }
785 break;
786
787 default:
788 panic("nfs_buf_get: %d unknown operation", operation);
789 }
790
791 /* unlock hash */
792 if (nfsbufhashlock < 0) {
793 nfsbufhashlock = 0;
794 wakeup(&nfsbufhashlock);
795 } else
796 nfsbufhashlock = 0;
797
798 FSDBG_BOT(541, vp, blkno, bp, bp->nb_flags);
799
800 return (bp);
801 }
802
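/*
 * Release a busy nfsbuf: unmap its UPL and commit/abort the pages as
 * appropriate, wake anyone waiting for a buffer, and requeue the buffer
 * on the free, free-meta, or delayed-write list (invalid or empty buffers
 * are dissociated from their vnode first).
 */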
803 void
804 nfs_buf_release(struct nfsbuf *bp, int freeup)
805 {
806 struct vnode *vp = bp->nb_vp;
807 struct timeval now;
808
809 FSDBG_TOP(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
810 FSDBG(548, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff, bp->nb_dirtyend);
811 FSDBG(548, bp->nb_valid, 0, bp->nb_dirty, 0);
812
813 if (UBCINFOEXISTS(vp) && bp->nb_bufsize) {
814 int upl_flags;
815 upl_t upl;
816 int i, rv;
817
818 if (!ISSET(bp->nb_flags, NB_PAGELIST) && !ISSET(bp->nb_flags, NB_INVAL)) {
819 rv = nfs_buf_upl_setup(bp);
820 if (rv)
821 printf("nfs_buf_release: upl create failed %d\n", rv);
822 else
823 nfs_buf_upl_check(bp);
824 }
825 upl = bp->nb_pagelist;
826 if (!upl)
827 goto pagelist_cleanup_done;
828 if (bp->nb_data) {
829 if (ubc_upl_unmap(upl) != KERN_SUCCESS)
830 panic("ubc_upl_unmap failed");
831 bp->nb_data = NULL;
832 }
833 if (bp->nb_flags & (NB_ERROR | NB_INVAL)) {
834 if (bp->nb_flags & (NB_READ | NB_INVAL))
835 upl_flags = UPL_ABORT_DUMP_PAGES;
836 else
837 upl_flags = 0;
838 ubc_upl_abort(upl, upl_flags);
839 goto pagelist_cleanup_done;
840 }
841 for (i=0; i <= (bp->nb_bufsize - 1)/PAGE_SIZE; i++) {
842 if (!NBPGVALID(bp,i))
843 ubc_upl_abort_range(upl,
844 i*PAGE_SIZE, PAGE_SIZE,
845 UPL_ABORT_DUMP_PAGES |
846 UPL_ABORT_FREE_ON_EMPTY);
847 else {
848 if (NBPGDIRTY(bp,i))
849 upl_flags = UPL_COMMIT_SET_DIRTY;
850 else
851 upl_flags = UPL_COMMIT_CLEAR_DIRTY;
852 ubc_upl_commit_range(upl,
853 i*PAGE_SIZE, PAGE_SIZE,
854 upl_flags |
855 UPL_COMMIT_INACTIVATE |
856 UPL_COMMIT_FREE_ON_EMPTY);
857 }
858 }
859 pagelist_cleanup_done:
860 /* was this the last buffer in the file? */
861 if (NBOFF(bp) + bp->nb_bufsize > VTONFS(vp)->n_size) {
862 /* if so, invalidate all pages of last buffer past EOF */
863 int biosize = vp->v_mount->mnt_stat.f_iosize;
864 off_t off, size;
865 off = trunc_page_64(VTONFS(vp)->n_size) + PAGE_SIZE_64;
866 size = trunc_page_64(NBOFF(bp) + biosize) - off;
867 if (size)
868 ubc_invalidate(vp, off, size);
869 }
870 CLR(bp->nb_flags, NB_PAGELIST);
871 bp->nb_pagelist = NULL;
872 }
873
874 /* Wake up any processes waiting for any buffer to become free. */
875 if (nfsneedbuffer) {
876 nfsneedbuffer = 0;
877 wakeup(&nfsneedbuffer);
878 }
879 /* Wake up any processes waiting for _this_ buffer to become free. */
880 if (ISSET(bp->nb_flags, NB_WANTED)) {
881 CLR(bp->nb_flags, NB_WANTED);
882 wakeup(bp);
883 }
884
885 /* If it's not cacheable, or an error, mark it invalid. */
886 if (ISSET(bp->nb_flags, (NB_NOCACHE|NB_ERROR)))
887 SET(bp->nb_flags, NB_INVAL);
888
889 if ((bp->nb_bufsize <= 0) || ISSET(bp->nb_flags, NB_INVAL)) {
890 /* If it's invalid or empty, dissociate it from its vnode */
891 if (bp->nb_vnbufs.le_next != NFSNOLIST) {
892 LIST_REMOVE(bp, nb_vnbufs);
893 bp->nb_vnbufs.le_next = NFSNOLIST;
894 }
895 bp->nb_vp = NULL;
896 HOLDRELE(vp);
897 /* if this was a delayed write, wakeup anyone */
898 /* waiting for delayed writes to complete */
899 if (ISSET(bp->nb_flags, NB_DELWRI)) {
900 CLR(bp->nb_flags, NB_DELWRI);
901 nfs_nbdwrite--;
902 NFSBUFCNTCHK();
903 wakeup((caddr_t)&nfs_nbdwrite);
904 }
905 /* clear usage timestamp to allow immediate freeing */
906 bp->nb_timestamp = 0;
907 /* put buffer at head of free list */
908 if (bp->nb_free.tqe_next != NFSNOLIST)
909 panic("nfsbuf on freelist");
910 SET(bp->nb_flags, NB_INVAL);
911 TAILQ_INSERT_HEAD(&nfsbuffree, bp, nb_free);
912 nfsbuffreecnt++;
913 if (freeup)
914 NFS_BUF_FREEUP();
915 } else if (ISSET(bp->nb_flags, NB_DELWRI)) {
916 /* put buffer at end of delwri list */
917 if (bp->nb_free.tqe_next != NFSNOLIST)
918 panic("nfsbuf on freelist");
919 TAILQ_INSERT_TAIL(&nfsbufdelwri, bp, nb_free);
920 nfsbufdelwricnt++;
921 } else {
922 /* update usage timestamp */
923 microuptime(&now);
924 bp->nb_timestamp = now.tv_sec;
925 /* put buffer at end of free list */
926 if (bp->nb_free.tqe_next != NFSNOLIST)
927 panic("nfsbuf on freelist");
928 if (ISSET(bp->nb_flags, NB_META)) {
929 TAILQ_INSERT_TAIL(&nfsbuffreemeta, bp, nb_free);
930 nfsbuffreemetacnt++;
931 } else {
932 TAILQ_INSERT_TAIL(&nfsbuffree, bp, nb_free);
933 nfsbuffreecnt++;
934 }
935 if (freeup)
936 NFS_BUF_FREEUP();
937 }
938
939 NFSBUFCNTCHK();
940
941 /* Unlock the buffer. */
942 CLR(bp->nb_flags, (NB_ASYNC | NB_BUSY | NB_NOCACHE | NB_STABLE | NB_IOD));
943
944 FSDBG_BOT(548, bp, NBOFF(bp), bp->nb_flags, bp->nb_data);
945 }
946
947 /*
948 * Wait for operations on the buffer to complete.
949 * When they do, extract and return the I/O's error value.
950 */
951 int
952 nfs_buf_iowait(struct nfsbuf *bp)
953 {
954 FSDBG_TOP(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
955
956 while (!ISSET(bp->nb_flags, NB_DONE))
957 tsleep(bp, PRIBIO + 1, "nfs_buf_iowait", 0);
958
959 FSDBG_BOT(549, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
960
961 /* check for interruption of I/O, then errors. */
962 if (ISSET(bp->nb_flags, NB_EINTR)) {
963 CLR(bp->nb_flags, NB_EINTR);
964 return (EINTR);
965 } else if (ISSET(bp->nb_flags, NB_ERROR))
966 return (bp->nb_error ? bp->nb_error : EIO);
967 return (0);
968 }
969
970 /*
971 * Mark I/O complete on a buffer.
972 */
973 void
974 nfs_buf_iodone(struct nfsbuf *bp)
975 {
976 struct vnode *vp;
977
978 FSDBG_TOP(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
979
980 if (ISSET(bp->nb_flags, NB_DONE))
981 panic("nfs_buf_iodone already");
982 SET(bp->nb_flags, NB_DONE); /* note that it's done */
983 /*
984 * I/O was done, so don't believe
985 * the DIRTY state from VM anymore
986 */
987 CLR(bp->nb_flags, NB_WASDIRTY);
988
989 if (!ISSET(bp->nb_flags, NB_READ)) {
990 CLR(bp->nb_flags, NB_WRITEINPROG);
991 vpwakeup(bp->nb_vp);
992 }
993
994 /* Wakeup the throttled write operations as needed */
995 vp = bp->nb_vp;
996 if (vp && (vp->v_flag & VTHROTTLED)
997 && (vp->v_numoutput <= (NFSBUFWRITE_THROTTLE / 3))) {
998 vp->v_flag &= ~VTHROTTLED;
999 wakeup((caddr_t)&vp->v_numoutput);
1000 }
1001
1002 if (ISSET(bp->nb_flags, NB_ASYNC)) /* if async, release it */
1003 nfs_buf_release(bp, 1);
1004 else { /* or just wakeup the buffer */
1005 CLR(bp->nb_flags, NB_WANTED);
1006 wakeup(bp);
1007 }
1008
1009 FSDBG_BOT(550, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1010 }
1011
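/*
 * Delayed write: mark the buffer NB_DELWRI and move it to the vnode's
 * dirty block list.  Throttle the caller if the vnode has too many writes
 * in progress, and fall back to an asynchronous write if too many delayed
 * writes are already outstanding.
 */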
1012 void
1013 nfs_buf_write_delayed(struct nfsbuf *bp)
1014 {
1015 struct proc *p = current_proc();
1016 struct vnode *vp = bp->nb_vp;
1017
1018 FSDBG_TOP(551, bp, NBOFF(bp), bp->nb_flags, 0);
1019 FSDBG(551, bp, bp->nb_dirtyoff, bp->nb_dirtyend, bp->nb_dirty);
1020
1021 /*
1022 * If the block hasn't been seen before:
1023 * (1) Mark it as having been seen,
   1024 	 *	(2) Charge for the write, and
   1025 	 *	(3) Make sure it's on its vnode's correct block list.
1026 */
1027 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
1028 SET(bp->nb_flags, NB_DELWRI);
1029 if (p && p->p_stats)
1030 p->p_stats->p_ru.ru_oublock++; /* XXX */
1031 nfs_nbdwrite++;
1032 NFSBUFCNTCHK();
1033 /* move to dirty list */
1034 if (bp->nb_vnbufs.le_next != NFSNOLIST)
1035 LIST_REMOVE(bp, nb_vnbufs);
1036 LIST_INSERT_HEAD(&VTONFS(vp)->n_dirtyblkhd, bp, nb_vnbufs);
1037 }
1038
1039 /*
   1040 	 * If the vnode has "too many" write operations in progress,
   1041 	 * wait for them to finish the I/O.
1042 */
1043 while (vp->v_numoutput >= NFSBUFWRITE_THROTTLE) {
1044 vp->v_flag |= VTHROTTLED;
1045 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "nfs_buf_write_delayed", 0);
1046 }
1047
1048 /*
1049 * If we have too many delayed write buffers,
1050 * more than we can "safely" handle, just fall back to
1051 * doing the async write
1052 */
1053 if (nfs_nbdwrite < 0)
1054 panic("nfs_buf_write_delayed: Negative nfs_nbdwrite");
1055
1056 if (nfs_nbdwrite > ((nfsbufcnt/4)*3)) {
1057 /* issue async write */
1058 SET(bp->nb_flags, NB_ASYNC);
1059 nfs_buf_write(bp);
1060 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, bp->nb_error);
1061 return;
1062 }
1063
1064 /* Otherwise, the "write" is done, so mark and release the buffer. */
1065 SET(bp->nb_flags, NB_DONE);
1066 nfs_buf_release(bp, 1);
1067 FSDBG_BOT(551, bp, NBOFF(bp), bp->nb_flags, 0);
1068 return;
1069 }
1070
1071
1072 /*
1073 * Vnode op for read using bio
1074 * Any similarity to readip() is purely coincidental
1075 */
1076 int
1077 nfs_bioread(vp, uio, ioflag, cred, getpages)
1078 register struct vnode *vp;
1079 register struct uio *uio;
1080 int ioflag;
1081 struct ucred *cred;
1082 int getpages; // XXX unused!
1083 {
1084 struct nfsnode *np = VTONFS(vp);
1085 int biosize, i;
1086 off_t diff;
1087 struct nfsbuf *bp = 0, *rabp;
1088 struct vattr vattr;
1089 struct proc *p;
1090 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1091 daddr_t lbn, rabn, lastrabn = -1;
1092 int bufsize;
1093 int nra, error = 0, n = 0, on = 0;
1094 int operation = (getpages? BLK_PAGEIN : BLK_READ);
1095 caddr_t dp;
1096 struct dirent *direntp;
1097
1098 FSDBG_TOP(514, vp, uio->uio_offset, uio->uio_resid, ioflag);
1099
1100 #if DIAGNOSTIC
1101 if (uio->uio_rw != UIO_READ)
1102 panic("nfs_read mode");
1103 #endif
1104 if (uio->uio_resid == 0) {
1105 FSDBG_BOT(514, vp, 0xd1e0001, 0, 0);
1106 return (0);
1107 }
1108 if (uio->uio_offset < 0) {
1109 FSDBG_BOT(514, vp, 0xd1e0002, 0, EINVAL);
1110 return (EINVAL);
1111 }
1112 p = uio->uio_procp;
1113 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1114 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1115 (void)nfs_fsinfo(nmp, vp, cred, p);
1116 biosize = vp->v_mount->mnt_stat.f_iosize;
1117 /*
1118 * For nfs, cache consistency can only be maintained approximately.
1119 * Although RFC1094 does not specify the criteria, the following is
1120 * believed to be compatible with the reference port.
1121 * For nqnfs, full cache consistency is maintained within the loop.
1122 * For nfs:
1123 * If the file's modify time on the server has changed since the
1124 * last read rpc or you have written to the file,
1125 * you may have lost data cache consistency with the
1126 * server, so flush all of the file's data out of the cache.
1127 * Then force a getattr rpc to ensure that you have up to date
1128 * attributes.
1129 * NB: This implies that cache data can be read when up to
1130 * NFS_MAXATTRTIMEO seconds out of date. If you find that you need
   1131 	 * current attributes, this could be forced by setting n_xid to 0
1132 * before the VOP_GETATTR() call.
1133 */
1134 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1135 if (np->n_flag & NMODIFIED) {
1136 if (vp->v_type != VREG) {
1137 if (vp->v_type != VDIR)
1138 panic("nfs: bioread, not dir");
1139 nfs_invaldir(vp);
1140 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1141 if (error) {
1142 FSDBG_BOT(514, vp, 0xd1e0003, 0, error);
1143 return (error);
1144 }
1145 }
1146 np->n_xid = 0;
1147 error = VOP_GETATTR(vp, &vattr, cred, p);
1148 if (error) {
1149 FSDBG_BOT(514, vp, 0xd1e0004, 0, error);
1150 return (error);
1151 }
1152 if (vp->v_type == VDIR) {
1153 /* if directory changed, purge any name cache entries */
1154 if (np->n_ncmtime != vattr.va_mtime.tv_sec)
1155 cache_purge(vp);
1156 np->n_ncmtime = vattr.va_mtime.tv_sec;
1157 }
1158 np->n_mtime = vattr.va_mtime.tv_sec;
1159 } else {
1160 error = VOP_GETATTR(vp, &vattr, cred, p);
1161 if (error) {
1162 FSDBG_BOT(514, vp, 0xd1e0005, 0, error);
1163 return (error);
1164 }
1165 if (np->n_mtime != vattr.va_mtime.tv_sec) {
1166 if (vp->v_type == VDIR) {
1167 nfs_invaldir(vp);
1168 /* purge name cache entries */
1169 if (np->n_ncmtime != vattr.va_mtime.tv_sec)
1170 cache_purge(vp);
1171 }
1172 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1173 if (error) {
1174 FSDBG_BOT(514, vp, 0xd1e0006, 0, error);
1175 return (error);
1176 }
1177 if (vp->v_type == VDIR)
1178 np->n_ncmtime = vattr.va_mtime.tv_sec;
1179 np->n_mtime = vattr.va_mtime.tv_sec;
1180 }
1181 }
1182 }
1183 do {
1184
1185 /*
1186 * Get a valid lease. If cached data is stale, flush it.
1187 */
1188 if (nmp->nm_flag & NFSMNT_NQNFS) {
1189 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
1190 do {
1191 error = nqnfs_getlease(vp, ND_READ, cred, p);
1192 } while (error == NQNFS_EXPIRED);
1193 if (error) {
1194 FSDBG_BOT(514, vp, 0xd1e0007, 0, error);
1195 return (error);
1196 }
1197 if (np->n_lrev != np->n_brev ||
1198 (np->n_flag & NQNFSNONCACHE) ||
1199 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
1200 if (vp->v_type == VDIR)
1201 nfs_invaldir(vp);
1202 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1203 if (error) {
1204 FSDBG_BOT(514, vp, 0xd1e0008, 0, error);
1205 return (error);
1206 }
1207 np->n_brev = np->n_lrev;
1208 }
1209 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
1210 nfs_invaldir(vp);
1211 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1212 if (error) {
1213 FSDBG_BOT(514, vp, 0xd1e0009, 0, error);
1214 return (error);
1215 }
1216 }
1217 }
1218 if ((np->n_flag & NQNFSNONCACHE) || (vp->v_flag & VNOCACHE_DATA)) {
1219 if ((vp->v_flag & VNOCACHE_DATA) &&
1220 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1221 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1222 if (error) {
1223 FSDBG_BOT(514, vp, 0xd1e000a, 0, error);
1224 return (error);
1225 }
1226 }
1227 switch (vp->v_type) {
1228 case VREG:
1229 error = nfs_readrpc(vp, uio, cred);
1230 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1231 return (error);
1232 case VLNK:
1233 error = nfs_readlinkrpc(vp, uio, cred);
1234 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1235 return (error);
1236 case VDIR:
1237 break;
1238 default:
1239 printf(" NQNFSNONCACHE: type %x unexpected\n", vp->v_type);
1240 };
1241 }
1242 switch (vp->v_type) {
1243 case VREG:
1244 lbn = uio->uio_offset / biosize;
1245
1246 /*
1247 * Copy directly from any cached pages without grabbing the bufs.
1248 */
1249 if (uio->uio_segflg == UIO_USERSPACE) {
1250 int io_resid = uio->uio_resid;
1251 diff = np->n_size - uio->uio_offset;
1252 if (diff < io_resid)
1253 io_resid = diff;
1254 if (io_resid > 0) {
1255 error = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1256 if (error) {
1257 FSDBG_BOT(514, vp, uio->uio_offset, 0xcacefeed, error);
1258 return (error);
1259 }
1260 }
1261 /* count any biocache reads that we just copied directly */
1262 if (lbn != uio->uio_offset / biosize) {
1263 nfsstats.biocache_reads += (uio->uio_offset / biosize) - lbn;
1264 FSDBG(514, vp, 0xcacefeed, uio->uio_offset, error);
1265 }
1266 }
1267
1268 lbn = uio->uio_offset / biosize;
1269 on = uio->uio_offset % biosize;
1270
1271 /*
1272 * Start the read ahead(s), as required.
1273 */
1274 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1275 for (nra = 0; nra < nmp->nm_readahead; nra++) {
1276 rabn = lbn + 1 + nra;
1277 if (rabn <= lastrabn) {
1278 /* we've already (tried to) read this block */
1279 /* no need to try it again... */
1280 continue;
1281 }
1282 lastrabn = rabn;
1283 if ((off_t)rabn * biosize >= np->n_size)
1284 break;
1285 /* check if block exists and is valid. */
1286 rabp = nfs_buf_incore(vp, rabn);
1287 if (rabp && nfs_buf_upl_valid_range(rabp, 0, rabp->nb_bufsize))
1288 continue;
1289 rabp = nfs_buf_get(vp, rabn, biosize, p, operation);
1290 if (!rabp) {
1291 FSDBG_BOT(514, vp, 0xd1e000b, 0, EINTR);
1292 return (EINTR);
1293 }
1294 if (!ISSET(rabp->nb_flags, (NB_CACHE|NB_DELWRI))) {
1295 SET(rabp->nb_flags, (NB_READ|NB_ASYNC));
1296 if (nfs_asyncio(rabp, cred)) {
1297 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1298 rabp->nb_error = EIO;
1299 nfs_buf_release(rabp, 1);
1300 }
1301 } else
1302 nfs_buf_release(rabp, 1);
1303 }
1304 }
1305
1306 if ((uio->uio_resid <= 0) || (uio->uio_offset >= np->n_size)) {
1307 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, 0xaaaaaaaa);
1308 return (0);
1309 }
1310
1311 nfsstats.biocache_reads++;
1312
1313 /*
1314 * If the block is in the cache and has the required data
1315 * in a valid region, just copy it out.
1316 * Otherwise, get the block and write back/read in,
1317 * as required.
1318 */
1319 again:
1320 bufsize = biosize;
1321 n = min((unsigned)(bufsize - on), uio->uio_resid);
1322 diff = np->n_size - uio->uio_offset;
1323 if (diff < n)
1324 n = diff;
1325
1326 bp = nfs_buf_get(vp, lbn, bufsize, p, operation);
1327 if (!bp) {
1328 FSDBG_BOT(514, vp, 0xd1e000c, 0, EINTR);
1329 return (EINTR);
1330 }
1331
1332 /* if any pages are valid... */
1333 if (bp->nb_valid) {
1334 /* ...check for any invalid pages in the read range */
1335 int pg, firstpg, lastpg, dirtypg;
1336 dirtypg = firstpg = lastpg = -1;
1337 pg = on/PAGE_SIZE;
1338 while (pg <= (on + n - 1)/PAGE_SIZE) {
1339 if (!NBPGVALID(bp,pg)) {
1340 if (firstpg < 0)
1341 firstpg = pg;
1342 lastpg = pg;
1343 } else if (firstpg >= 0 && dirtypg < 0 && NBPGDIRTY(bp,pg))
1344 dirtypg = pg;
1345 pg++;
1346 }
1347
1348 /* if there are no invalid pages, we're all set */
1349 if (firstpg < 0) {
1350 if (bp->nb_validoff < 0) {
1351 /* valid range isn't set up, so */
1352 /* set it to what we know is valid */
1353 bp->nb_validoff = trunc_page_32(on);
1354 bp->nb_validend = round_page_32(on+n);
1355 nfs_buf_normalize_valid_range(np, bp);
1356 }
1357 goto buffer_ready;
1358 }
1359
1360 /* there are invalid pages in the read range */
1361 if ((dirtypg > firstpg) && (dirtypg < lastpg)) {
1362 /* there are also dirty page(s) in the range, */
1363 /* so write the buffer out and try again */
1364 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1365 SET(bp->nb_flags, NB_ASYNC);
1366 /*
1367 * NFS has embedded ucred so crhold() risks zone corruption
1368 */
1369 if (bp->nb_wcred == NOCRED)
1370 bp->nb_wcred = crdup(cred);
1371 error = nfs_buf_write(bp);
1372 if (error) {
1373 FSDBG_BOT(514, vp, 0xd1e000d, 0, error);
1374 return (error);
1375 }
1376 goto again;
1377 }
1378 if (!bp->nb_dirty && bp->nb_dirtyend <= 0 &&
1379 (lastpg - firstpg + 1) > (bufsize/PAGE_SIZE)/2) {
1380 /* we need to read in more than half the buffer and the */
1381 /* buffer's not dirty, so just fetch the whole buffer */
1382 bp->nb_valid = 0;
1383 } else {
1384 /* read the page range in */
1385 struct iovec iov;
1386 struct uio auio;
1387 auio.uio_iov = &iov;
1388 auio.uio_iovcnt = 1;
1389 auio.uio_offset = NBOFF(bp) + firstpg * PAGE_SIZE_64;
1390 auio.uio_resid = (lastpg - firstpg + 1) * PAGE_SIZE;
1391 auio.uio_segflg = UIO_SYSSPACE;
1392 auio.uio_rw = UIO_READ;
1393 auio.uio_procp = p;
1394 NFS_BUF_MAP(bp);
1395 iov.iov_base = bp->nb_data + firstpg * PAGE_SIZE;
1396 iov.iov_len = auio.uio_resid;
1397 error = nfs_readrpc(vp, &auio, cred);
1398 if (error) {
1399 nfs_buf_release(bp, 1);
1400 FSDBG_BOT(514, vp, 0xd1e000e, 0, error);
1401 return (error);
1402 }
1403 /* Make sure that the valid range is set to cover this read. */
1404 bp->nb_validoff = trunc_page_32(on);
1405 bp->nb_validend = round_page_32(on+n);
1406 nfs_buf_normalize_valid_range(np, bp);
1407 if (auio.uio_resid > 0) {
1408 /* if short read, must have hit EOF, */
1409 /* so zero the rest of the range */
1410 bzero(iov.iov_base, auio.uio_resid);
1411 }
1412 /* mark the pages (successfully read) as valid */
1413 for (pg=firstpg; pg <= lastpg; pg++)
1414 NBPGVALID_SET(bp,pg);
1415 }
1416 }
1417 /* if no pages are valid, read the whole block */
1418 if (!bp->nb_valid) {
1419 SET(bp->nb_flags, NB_READ);
1420 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1421 error = nfs_doio(bp, cred, p);
1422 if (error) {
1423 nfs_buf_release(bp, 1);
1424 FSDBG_BOT(514, vp, 0xd1e000f, 0, error);
1425 return (error);
1426 }
1427 }
1428 buffer_ready:
1429 vp->v_lastr = lbn;
1430 /* validate read range against valid range and clip */
1431 if (bp->nb_validend > 0) {
1432 diff = (on >= bp->nb_validend) ? 0 : (bp->nb_validend - on);
1433 if (diff < n)
1434 n = diff;
1435 }
1436 if (n > 0)
1437 NFS_BUF_MAP(bp);
1438 break;
1439 case VLNK:
1440 nfsstats.biocache_readlinks++;
1441 bp = nfs_buf_get(vp, (daddr_t)0, NFS_MAXPATHLEN, p, operation);
1442 if (!bp) {
1443 FSDBG_BOT(514, vp, 0xd1e0010, 0, EINTR);
1444 return (EINTR);
1445 }
1446 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1447 SET(bp->nb_flags, NB_READ);
1448 error = nfs_doio(bp, cred, p);
1449 if (error) {
1450 SET(bp->nb_flags, NB_ERROR);
1451 nfs_buf_release(bp, 1);
1452 FSDBG_BOT(514, vp, 0xd1e0011, 0, error);
1453 return (error);
1454 }
1455 }
1456 n = min(uio->uio_resid, bp->nb_validend);
1457 on = 0;
1458 break;
1459 case VDIR:
1460 nfsstats.biocache_readdirs++;
1461 if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) {
1462 FSDBG_BOT(514, vp, 0xde0f0001, 0, 0);
1463 return (0);
1464 }
1465 lbn = uio->uio_offset / NFS_DIRBLKSIZ;
1466 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1467 bp = nfs_buf_get(vp, lbn, NFS_DIRBLKSIZ, p, operation);
1468 if (!bp) {
1469 FSDBG_BOT(514, vp, 0xd1e0012, 0, EINTR);
1470 return (EINTR);
1471 }
1472 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1473 SET(bp->nb_flags, NB_READ);
1474 error = nfs_doio(bp, cred, p);
1475 if (error) {
1476 nfs_buf_release(bp, 1);
1477 }
1478 while (error == NFSERR_BAD_COOKIE) {
1479 nfs_invaldir(vp);
1480 error = nfs_vinvalbuf(vp, 0, cred, p, 1);
1481 /*
1482 * Yuck! The directory has been modified on the
1483 * server. The only way to get the block is by
1484 * reading from the beginning to get all the
1485 * offset cookies.
1486 */
1487 for (i = 0; i <= lbn && !error; i++) {
1488 if (np->n_direofoffset
1489 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) {
1490 FSDBG_BOT(514, vp, 0xde0f0002, 0, 0);
1491 return (0);
1492 }
1493 bp = nfs_buf_get(vp, i, NFS_DIRBLKSIZ, p, operation);
1494 if (!bp) {
1495 FSDBG_BOT(514, vp, 0xd1e0013, 0, EINTR);
1496 return (EINTR);
1497 }
1498 if (!ISSET(bp->nb_flags, NB_CACHE)) {
1499 SET(bp->nb_flags, NB_READ);
1500 error = nfs_doio(bp, cred, p);
1501 /*
1502 * no error + NB_INVAL == directory EOF,
1503 * use the block.
1504 */
1505 if (error == 0 && (bp->nb_flags & NB_INVAL))
1506 break;
1507 }
1508 /*
1509 * An error will throw away the block and the
1510 * for loop will break out. If no error and this
1511 * is not the block we want, we throw away the
1512 * block and go for the next one via the for loop.
1513 */
1514 if (error || i < lbn)
1515 nfs_buf_release(bp, 1);
1516 }
1517 }
1518 /*
1519 * The above while is repeated if we hit another cookie
1520 * error. If we hit an error and it wasn't a cookie error,
1521 * we give up.
1522 */
1523 if (error) {
1524 FSDBG_BOT(514, vp, 0xd1e0014, 0, error);
1525 return (error);
1526 }
1527 }
1528
1529 /*
1530 * If not eof and read aheads are enabled, start one.
1531 * (You need the current block first, so that you have the
1532 * directory offset cookie of the next block.)
1533 */
1534 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
1535 (np->n_direofoffset == 0 ||
1536 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
1537 !(np->n_flag & NQNFSNONCACHE) &&
1538 !nfs_buf_incore(vp, lbn + 1)) {
1539 rabp = nfs_buf_get(vp, lbn + 1, NFS_DIRBLKSIZ, p,
1540 operation);
1541 if (rabp) {
1542 if (!ISSET(rabp->nb_flags, (NB_CACHE))) {
1543 SET(rabp->nb_flags, (NB_READ | NB_ASYNC));
1544 if (nfs_asyncio(rabp, cred)) {
1545 SET(rabp->nb_flags, (NB_INVAL|NB_ERROR));
1546 rabp->nb_error = EIO;
1547 nfs_buf_release(rabp, 1);
1548 }
1549 } else {
1550 nfs_buf_release(rabp, 1);
1551 }
1552 }
1553 }
1554 /*
1555 * Make sure we use a signed variant of min() since
1556 * the second term may be negative.
1557 */
1558 n = lmin(uio->uio_resid, bp->nb_validend - on);
1559 /*
1560 * We keep track of the directory eof in
1561 * np->n_direofoffset and chop it off as an
1562 * extra step right here.
1563 */
1564 if (np->n_direofoffset &&
1565 n > np->n_direofoffset - uio->uio_offset)
1566 n = np->n_direofoffset - uio->uio_offset;
1567 /*
1568 * Make sure that we return an integral number of entries so
1569 * that any subsequent calls will start copying from the start
1570 * of the next entry.
1571 *
1572 * If the current value of n has the last entry cut short,
1573 * set n to copy everything up to the last entry instead.
1574 */
1575 if (n > 0) {
1576 dp = bp->nb_data + on;
1577 while (dp < (bp->nb_data + on + n)) {
1578 direntp = (struct dirent *)dp;
1579 dp += direntp->d_reclen;
1580 }
1581 if (dp > (bp->nb_data + on + n))
1582 n = (dp - direntp->d_reclen) - (bp->nb_data + on);
1583 }
1584 break;
1585 default:
1586 printf("nfs_bioread: type %x unexpected\n",vp->v_type);
1587 FSDBG_BOT(514, vp, 0xd1e0015, 0, EINVAL);
1588 return (EINVAL);
1589 };
1590
1591 if (n > 0) {
1592 error = uiomove(bp->nb_data + on, (int)n, uio);
1593 }
1594 switch (vp->v_type) {
1595 case VREG:
1596 break;
1597 case VLNK:
1598 n = 0;
1599 break;
1600 case VDIR:
1601 if (np->n_flag & NQNFSNONCACHE)
1602 SET(bp->nb_flags, NB_INVAL);
1603 break;
1604 }
1605 nfs_buf_release(bp, 1);
1606 } while (error == 0 && uio->uio_resid > 0 && n > 0);
1607 FSDBG_BOT(514, vp, uio->uio_offset, uio->uio_resid, error);
1608 return (error);
1609 }
1610
1611
1612 /*
1613 * Vnode op for write using bio
1614 */
1615 int
1616 nfs_write(ap)
1617 struct vop_write_args /* {
1618 struct vnode *a_vp;
1619 struct uio *a_uio;
1620 int a_ioflag;
1621 struct ucred *a_cred;
1622 } */ *ap;
1623 {
1624 struct uio *uio = ap->a_uio;
1625 struct proc *p = uio->uio_procp;
1626 struct vnode *vp = ap->a_vp;
1627 struct nfsnode *np = VTONFS(vp);
1628 struct ucred *cred = ap->a_cred;
1629 int ioflag = ap->a_ioflag;
1630 struct nfsbuf *bp;
1631 struct vattr vattr;
1632 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1633 daddr_t lbn;
1634 int biosize, bufsize, writeop;
1635 int n, on, error = 0, iomode, must_commit;
1636 off_t boff, start, end, cureof;
1637 struct iovec iov;
1638 struct uio auio;
1639
1640 FSDBG_TOP(515, vp, uio->uio_offset, uio->uio_resid, ioflag);
1641
1642 #if DIAGNOSTIC
1643 if (uio->uio_rw != UIO_WRITE)
1644 panic("nfs_write mode");
1645 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != current_proc())
1646 panic("nfs_write proc");
1647 #endif
1648 if (vp->v_type != VREG)
1649 return (EIO);
1650 if (np->n_flag & NWRITEERR) {
1651 np->n_flag &= ~NWRITEERR;
1652 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, np->n_error);
1653 return (np->n_error);
1654 }
1655 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1656 !(nmp->nm_state & NFSSTA_GOTFSINFO))
1657 (void)nfs_fsinfo(nmp, vp, cred, p);
1658 if (ioflag & (IO_APPEND | IO_SYNC)) {
1659 if (np->n_flag & NMODIFIED) {
1660 np->n_xid = 0;
1661 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1662 if (error) {
1663 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad01, error);
1664 return (error);
1665 }
1666 }
1667 if (ioflag & IO_APPEND) {
1668 np->n_xid = 0;
1669 error = VOP_GETATTR(vp, &vattr, cred, p);
1670 if (error) {
1671 FSDBG_BOT(515, vp, uio->uio_offset, 0x10bad02, error);
1672 return (error);
1673 }
1674 uio->uio_offset = np->n_size;
1675 }
1676 }
1677 if (uio->uio_offset < 0) {
1678 FSDBG_BOT(515, vp, uio->uio_offset, 0xbad0ff, EINVAL);
1679 return (EINVAL);
1680 }
1681 if (uio->uio_resid == 0) {
1682 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
1683 return (0);
1684 }
1685 /*
1686 * Maybe this should be above the vnode op call, but so long as
   1687 	 * file servers have no limits, I don't think it matters
1688 */
1689 if (p && uio->uio_offset + uio->uio_resid >
1690 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
1691 psignal(p, SIGXFSZ);
1692 FSDBG_BOT(515, vp, uio->uio_offset, 0x2b1f, EFBIG);
1693 return (EFBIG);
1694 }
1695
1696 biosize = vp->v_mount->mnt_stat.f_iosize;
1697
1698 do {
1699 /*
1700 * Check for a valid write lease.
1701 */
1702 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1703 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1704 do {
1705 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1706 } while (error == NQNFS_EXPIRED);
1707 if (error) {
1708 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110001, error);
1709 return (error);
1710 }
1711 if (np->n_lrev != np->n_brev ||
1712 (np->n_flag & NQNFSNONCACHE)) {
1713 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1714 if (error) {
1715 FSDBG_BOT(515, vp, uio->uio_offset, 0x11110002, error);
1716 return (error);
1717 }
1718 np->n_brev = np->n_lrev;
1719 }
1720 }
1721 if (ISSET(vp->v_flag, VNOCACHE_DATA) &&
1722 (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)) {
1723 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
1724 if (error) {
1725 FSDBG_BOT(515, vp, 0, 0, error);
1726 return (error);
1727 }
1728 }
1729 if (((np->n_flag & NQNFSNONCACHE) ||
1730 ISSET(vp->v_flag, VNOCACHE_DATA)) &&
1731 uio->uio_iovcnt == 1) {
1732 iomode = NFSV3WRITE_FILESYNC;
1733 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
1734 if (must_commit)
1735 nfs_clearcommit(vp->v_mount);
1736 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1737 return (error);
1738 }
1739 nfsstats.biocache_writes++;
1740 lbn = uio->uio_offset / biosize;
1741 on = uio->uio_offset % biosize;
1742 n = min((unsigned)(biosize - on), uio->uio_resid);
1743 again:
1744 bufsize = biosize;
1745 /*
1746 * Get a cache block for writing. The range to be written is
1747 * (off..off+n) within the block. We ensure that the block
1748 * either has no dirty region or that the given range is
1749 * contiguous with the existing dirty region.
1750 */
1751 bp = nfs_buf_get(vp, lbn, bufsize, p, BLK_WRITE);
1752 if (!bp) {
1753 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, EINTR);
1754 return (EINTR);
1755 }
1756 /* map the block because we know we're going to write to it */
1757 NFS_BUF_MAP(bp);
1758
1759 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1760 SET(bp->nb_flags, (NB_NOCACHE|NB_INVAL));
1761
1762 /*
1763 * NFS has embedded ucred so crhold() risks zone corruption
1764 */
1765 if (bp->nb_wcred == NOCRED)
1766 bp->nb_wcred = crdup(cred);
1767
1768 /*
1769 * If there's already a dirty range AND dirty pages in this block we
1770 * need to send a commit AND write the dirty pages before continuing.
1771 *
1772 * If there's already a dirty range OR dirty pages in this block
1773 * and the new write range is not contiguous with the existing range,
1774 * then force the buffer to be written out now.
1775 * (We used to just extend the dirty range to cover the valid,
1776 * but unwritten, data in between also. But writing ranges
1777 * of data that weren't actually written by an application
1778 * risks overwriting some other client's data with stale data
1779 * that's just masquerading as new written data.)
1780 */
1781 if (bp->nb_dirtyend > 0) {
1782 if (on > bp->nb_dirtyend || (on + n) < bp->nb_dirtyoff || bp->nb_dirty) {
1783 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c001);
1784 /* write/commit buffer "synchronously" */
1785 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1786 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1787 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1788 error = nfs_buf_write(bp);
1789 if (error) {
1790 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1791 return (error);
1792 }
1793 goto again;
1794 }
1795 } else if (bp->nb_dirty) {
1796 int firstpg, lastpg;
1797 u_int32_t pagemask;
1798 /* calculate write range pagemask */
1799 firstpg = on/PAGE_SIZE;
1800 lastpg = (on+n-1)/PAGE_SIZE;
1801 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
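			/* e.g. with 4K pages: on = 4096, n = 8192 gives firstpg = 1,
			 * lastpg = 2, pagemask = 0x06 (pages 1 and 2) */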
1802 /* check if there are dirty pages outside the write range */
1803 if (bp->nb_dirty & ~pagemask) {
1804 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c002);
1805 /* write/commit buffer "synchronously" */
1806 /* (NB_STABLE indicates that data writes should be FILESYNC) */
1807 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL));
1808 SET(bp->nb_flags, (NB_ASYNC | NB_STABLE));
1809 error = nfs_buf_write(bp);
1810 if (error) {
1811 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
1812 return (error);
1813 }
1814 goto again;
1815 }
1816 /* if the first or last pages are already dirty */
1817 /* make sure that the dirty range encompasses those pages */
1818 if (NBPGDIRTY(bp,firstpg) || NBPGDIRTY(bp,lastpg)) {
1819 FSDBG(515, vp, uio->uio_offset, bp, 0xd15c003);
1820 bp->nb_dirtyoff = min(on, firstpg * PAGE_SIZE);
1821 if (NBPGDIRTY(bp,lastpg)) {
1822 bp->nb_dirtyend = (lastpg+1) * PAGE_SIZE;
1823 /* clip to EOF */
1824 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1825 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1826 } else
1827 bp->nb_dirtyend = on+n;
1828 }
1829 }
1830
1831 /*
1832 * Are we extending the size of the file with this write?
1833 * If so, update file size now that we have the block.
1834 * If there was a partial buf at the old eof, validate
1835 * and zero the new bytes.
1836 */
1837 cureof = (off_t)np->n_size;
1838 if (uio->uio_offset + n > np->n_size) {
1839 struct nfsbuf *eofbp = NULL;
1840 daddr_t eofbn = np->n_size / biosize;
1841 int eofoff = np->n_size % biosize;
1842 int neweofoff = (uio->uio_offset + n) % biosize;
1843
1844 FSDBG(515, 0xb1ffa000, uio->uio_offset + n, eofoff, neweofoff);
1845
1846 if (eofoff && eofbn < lbn && nfs_buf_incore(vp, eofbn))
1847 eofbp = nfs_buf_get(vp, eofbn, biosize, p, BLK_WRITE);
1848
1849 /* if we're extending within the same last block */
1850 /* and the block is flagged as being cached... */
1851 if ((lbn == eofbn) && ISSET(bp->nb_flags, NB_CACHE)) {
1852 /* ...check that all pages in buffer are valid */
1853 int endpg = ((neweofoff ? neweofoff : biosize) - 1)/PAGE_SIZE;
1854 u_int32_t pagemask;
1855 /* pagemask only has to extend to last page being written to */
1856 pagemask = (1 << (endpg+1)) - 1;
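/*
 * For illustration (example values, assuming 4 KB pages): if the new EOF
 * lands at neweofoff = 10000 within this block, endpg = 9999/4096 = 2 and
 * pagemask = (1 << 3) - 1 = 0x7, so only pages 0 through 2 (up to the last
 * page being written) need to be valid here.
 */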
1857 FSDBG(515, 0xb1ffa001, bp->nb_valid, pagemask, 0);
1858 if ((bp->nb_valid & pagemask) != pagemask) {
1859 /* zerofill any hole */
1860 if (on > bp->nb_validend) {
1861 int i;
1862 for (i=bp->nb_validend/PAGE_SIZE; i <= (on - 1)/PAGE_SIZE; i++)
1863 NBPGVALID_SET(bp, i);
1864 NFS_BUF_MAP(bp);
1865 FSDBG(516, bp, bp->nb_validend, on - bp->nb_validend, 0xf01e);
1866 bzero((char *)bp->nb_data + bp->nb_validend,
1867 on - bp->nb_validend);
1868 }
1869 /* zerofill any trailing data in the last page */
1870 if (neweofoff) {
1871 NFS_BUF_MAP(bp);
1872 FSDBG(516, bp, neweofoff, PAGE_SIZE - (neweofoff & PAGE_MASK), 0xe0f);
1873 bzero((char *)bp->nb_data + neweofoff,
1874 PAGE_SIZE - (neweofoff & PAGE_MASK));
1875 }
1876 }
1877 }
1878 np->n_flag |= NMODIFIED;
1879 np->n_size = uio->uio_offset + n;
1880 ubc_setsize(vp, (off_t)np->n_size); /* XXX errors */
1881 if (eofbp) {
1882 /*
1883 * We may need to zero any previously invalid data
1884 * after the old EOF in the previous EOF buffer.
1885 *
1886 * For the old last page, don't zero bytes if there
1887 * are invalid bytes in that page (i.e. the page isn't
1888 * currently valid).
1889 * For pages after the old last page, zero them and
1890 * mark them as valid.
1891 */
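/*
 * For illustration (example values, assuming 4 KB pages): if the old EOF
 * fell at eofoff = 5000 in the old last block, the loop below starts at
 * page i = 1 with poff = 904, so bytes 5000..8191 are zeroed only if that
 * page is already valid; pages 2 and up are whole pages (poff = 0), so
 * they are zeroed and marked valid unconditionally.
 */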
1892 char *d;
1893 int i;
1894 if (ISSET(vp->v_flag, VNOCACHE_DATA))
1895 SET(eofbp->nb_flags, (NB_NOCACHE|NB_INVAL));
1896 NFS_BUF_MAP(eofbp);
1897 FSDBG(516, eofbp, eofoff, biosize - eofoff, 0xe0fff01e);
1898 d = eofbp->nb_data;
1899 i = eofoff/PAGE_SIZE;
1900 while (eofoff < biosize) {
1901 int poff = eofoff & PAGE_MASK;
1902 if (!poff || NBPGVALID(eofbp,i)) {
1903 bzero(d + eofoff, PAGE_SIZE - poff);
1904 NBPGVALID_SET(eofbp, i);
1905 }
1906 if (bp->nb_validend == eofoff)
1907 bp->nb_validend += PAGE_SIZE - poff;
1908 eofoff += PAGE_SIZE - poff;
1909 i++;
1910 }
1911 nfs_buf_release(eofbp, 1);
1912 }
1913 }
1914 /*
1915 * If dirtyend exceeds file size, chop it down. This should
1916 * not occur unless there is a race.
1917 */
1918 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
1919 bp->nb_dirtyend = np->n_size - NBOFF(bp);
1920 /*
1921 * UBC doesn't handle partial pages, so we need to make sure
1922 * that any pages left in the page cache are completely valid.
1923 *
1924 * Writes that are smaller than a block are delayed if they
1925 * don't extend to the end of the block.
1926 *
1927 * If the block isn't (completely) cached, we may need to read
1928 * in some parts of pages that aren't covered by the write.
1929 * If the write offset (on) isn't page aligned, we'll need to
1930 * read the start of the first page being written to. Likewise,
1931 * if the offset of the end of the write (on+n) isn't page aligned,
1932 * we'll need to read the end of the last page being written to.
1933 *
1934 * Notes:
1935 * We don't want to read anything we're just going to write over.
1936 * We don't want to issue multiple I/Os if we don't have to
1937 * (because they're synchronous rpcs).
1938 * We don't want to read anything we already have modified in the
1939 * page cache.
1940 */
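/*
 * For illustration (example values, assuming 4 KB pages): for an uncached
 * block and a write with on = 6000, n = 4000 (on+n = 10000), firstpg = 1
 * and firstpgoff = 1904 give start = 4096, end = 6000; lastpg = 2 and
 * lastpgoff = 1808 then push end out to 12288. The logic below may split
 * that range into two reads around dirty pages, or widen it to pull in
 * adjacent invalid pages.
 */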
1941 if (!ISSET(bp->nb_flags, NB_CACHE) && n < biosize) {
1942 int firstpg, lastpg, dirtypg;
1943 int firstpgoff, lastpgoff;
1944 start = end = -1;
1945 firstpg = on/PAGE_SIZE;
1946 firstpgoff = on & PAGE_MASK;
1947 lastpg = (on+n-1)/PAGE_SIZE;
1948 lastpgoff = (on+n) & PAGE_MASK;
1949 if (firstpgoff && !NBPGVALID(bp,firstpg)) {
1950 /* need to read start of first page */
1951 start = firstpg * PAGE_SIZE;
1952 end = start + firstpgoff;
1953 }
1954 if (lastpgoff && !NBPGVALID(bp,lastpg)) {
1955 /* need to read end of last page */
1956 if (start < 0)
1957 start = (lastpg * PAGE_SIZE) + lastpgoff;
1958 end = (lastpg + 1) * PAGE_SIZE;
1959 }
1960 if (end > start) {
1961 /* need to read the data in range: start...end-1 */
1962
1963 /* first, check for dirty pages in between */
1964 /* if there are, we'll have to do two reads because */
1965 /* we don't want to overwrite the dirty pages. */
1966 for (dirtypg=start/PAGE_SIZE; dirtypg <= (end-1)/PAGE_SIZE; dirtypg++)
1967 if (NBPGDIRTY(bp,dirtypg))
1968 break;
1969
1970 /* if start is at beginning of page, try */
1971 /* to get any preceding pages as well. */
1972 if (!(start & PAGE_MASK)) {
1973 /* stop at next dirty/valid page or start of block */
1974 for (; start > 0; start-=PAGE_SIZE)
1975 if (NBPGVALID(bp,((start-1)/PAGE_SIZE)))
1976 break;
1977 }
1978
1979 NFS_BUF_MAP(bp);
1980 /* setup uio for read(s) */
1981 boff = NBOFF(bp);
1982 auio.uio_iov = &iov;
1983 auio.uio_iovcnt = 1;
1984 auio.uio_segflg = UIO_SYSSPACE;
1985 auio.uio_rw = UIO_READ;
1986 auio.uio_procp = p;
1987
1988 if (dirtypg <= (end-1)/PAGE_SIZE) {
1989 /* there's a dirty page in the way, so just do two reads */
1990 /* we'll read the preceding data here */
1991 auio.uio_offset = boff + start;
1992 auio.uio_resid = iov.iov_len = on - start;
1993 iov.iov_base = bp->nb_data + start;
1994 error = nfs_readrpc(vp, &auio, cred);
1995 if (error) {
1996 bp->nb_error = error;
1997 SET(bp->nb_flags, NB_ERROR);
1998 printf("nfs_write: readrpc %d\n", error);
1999 }
2000 if (auio.uio_resid > 0) {
2001 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee01);
2002 bzero(iov.iov_base, auio.uio_resid);
2003 }
2004 /* update validoff/validend if necessary */
2005 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2006 bp->nb_validoff = start;
2007 if ((bp->nb_validend < 0) || (bp->nb_validend < on))
2008 bp->nb_validend = on;
2009 if (np->n_size > boff + bp->nb_validend)
2010 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2011 /* validate any pages before the write offset */
2012 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2013 NBPGVALID_SET(bp, start/PAGE_SIZE);
2014 /* adjust start to read any trailing data */
2015 start = on+n;
2016 }
2017
2018 /* if end is at end of page, try to */
2019 /* get any following pages as well. */
2020 if (!(end & PAGE_MASK)) {
2021 /* stop at next valid page or end of block */
2022 for (; end < bufsize; end+=PAGE_SIZE)
2023 if (NBPGVALID(bp,end/PAGE_SIZE))
2024 break;
2025 }
2026
2027 if (((boff+start) >= cureof) || ((start >= on) && ((boff + on + n) >= cureof))) {
2028 /*
2029 * Either this entire read is beyond the current EOF
2030 * or the range that we won't be modifying (on+n...end)
2031 * is all beyond the current EOF.
2032 * No need to make a trip across the network to
2033 * read nothing. So, just zero the buffer instead.
2034 */
2035 FSDBG(516, bp, start, end - start, 0xd00dee00);
2036 bzero(bp->nb_data + start, end - start);
2037 } else {
2038 /* now we'll read the (rest of the) data */
2039 auio.uio_offset = boff + start;
2040 auio.uio_resid = iov.iov_len = end - start;
2041 iov.iov_base = bp->nb_data + start;
2042 error = nfs_readrpc(vp, &auio, cred);
2043 if (error) {
2044 bp->nb_error = error;
2045 SET(bp->nb_flags, NB_ERROR);
2046 printf("nfs_write: readrpc %d\n", error);
2047 }
2048 if (auio.uio_resid > 0) {
2049 FSDBG(516, bp, iov.iov_base - bp->nb_data, auio.uio_resid, 0xd00dee02);
2050 bzero(iov.iov_base, auio.uio_resid);
2051 }
2052 }
2053 /* update validoff/validend if necessary */
2054 if ((bp->nb_validoff < 0) || (bp->nb_validoff > start))
2055 bp->nb_validoff = start;
2056 if ((bp->nb_validend < 0) || (bp->nb_validend < end))
2057 bp->nb_validend = end;
2058 if (np->n_size > boff + bp->nb_validend)
2059 bp->nb_validend = min(np->n_size - (boff + start), biosize);
2060 /* validate any pages before the write offset's page */
2061 for (; start < trunc_page_32(on); start+=PAGE_SIZE)
2062 NBPGVALID_SET(bp, start/PAGE_SIZE);
2063 /* validate any pages after the range of pages being written to */
2064 for (; (end - 1) > round_page_32(on+n-1); end-=PAGE_SIZE)
2065 NBPGVALID_SET(bp, (end-1)/PAGE_SIZE);
2066 /* Note: pages being written to will be validated when written */
2067 }
2068 }
2069
2070 if (ISSET(bp->nb_flags, NB_ERROR)) {
2071 error = bp->nb_error;
2072 nfs_buf_release(bp, 1);
2073 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
2074 return (error);
2075 }
2076
2077 np->n_flag |= NMODIFIED;
2078
2079 /*
2080 * Check for valid write lease and get one as required.
2081 * In case nfs_buf_get() and/or nfs_buf_write() delayed us.
2082 */
2083 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
2084 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
2085 do {
2086 error = nqnfs_getlease(vp, ND_WRITE, cred, p);
2087 } while (error == NQNFS_EXPIRED);
2088 if (error) {
2089 nfs_buf_release(bp, 1);
2090 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220001, error);
2091 return (error);
2092 }
2093 if (np->n_lrev != np->n_brev ||
2094 (np->n_flag & NQNFSNONCACHE)) {
2095 nfs_buf_release(bp, 1);
2096 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2097 if (error) {
2098 FSDBG_BOT(515, vp, uio->uio_offset, 0x11220002, error);
2099 return (error);
2100 }
2101 np->n_brev = np->n_lrev;
2102 goto again;
2103 }
2104 }
2105 NFS_BUF_MAP(bp);
2106 error = uiomove((char *)bp->nb_data + on, n, uio);
2107 if (error) {
2108 SET(bp->nb_flags, NB_ERROR);
2109 nfs_buf_release(bp, 1);
2110 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, error);
2111 return (error);
2112 }
2113
2114 /* validate any pages written to */
2115 start = on & ~PAGE_MASK;
2116 for (; start < on+n; start += PAGE_SIZE) {
2117 NBPGVALID_SET(bp, start/PAGE_SIZE);
2118 /*
2119 * This may seem a little weird, but we don't actually set the
2120 * dirty bits for writes. This is because we keep the dirty range
2121 * in the nb_dirtyoff/nb_dirtyend fields. Also, particularly for
2122 * delayed writes, when we give the pages back to the VM we don't
2123 * want to keep them marked dirty, because when we later write the
2124 * buffer we won't be able to tell which pages were written dirty
2125 * and which pages were mmapped and dirtied.
2126 */
2127 }
2128 if (bp->nb_dirtyend > 0) {
2129 bp->nb_dirtyoff = min(on, bp->nb_dirtyoff);
2130 bp->nb_dirtyend = max((on + n), bp->nb_dirtyend);
2131 } else {
2132 bp->nb_dirtyoff = on;
2133 bp->nb_dirtyend = on + n;
2134 }
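/*
 * For illustration (example values): a first write of bytes 6000..9999
 * into a clean block sets nb_dirtyoff = 6000, nb_dirtyend = 10000; a later
 * write of bytes 10000..11999 is contiguous, so it only extends
 * nb_dirtyend to 12000. (Non-contiguous writes were already forced out
 * above via the "goto again" path.)
 */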
2135 if (bp->nb_validend <= 0 || bp->nb_validend < bp->nb_dirtyoff ||
2136 bp->nb_validoff > bp->nb_dirtyend) {
2137 bp->nb_validoff = bp->nb_dirtyoff;
2138 bp->nb_validend = bp->nb_dirtyend;
2139 } else {
2140 bp->nb_validoff = min(bp->nb_validoff, bp->nb_dirtyoff);
2141 bp->nb_validend = max(bp->nb_validend, bp->nb_dirtyend);
2142 }
2143 if (!ISSET(bp->nb_flags, NB_CACHE))
2144 nfs_buf_normalize_valid_range(np, bp);
2145
2146 /*
2147 * Since this block is being modified, it must be written
2148 * again and not just committed.
2149 */
2150 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2151 np->n_needcommitcnt--;
2152 CHECK_NEEDCOMMITCNT(np);
2153 }
2154 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2155
2156 if ((np->n_flag & NQNFSNONCACHE) ||
2157 (ioflag & IO_SYNC) || (vp->v_flag & VNOCACHE_DATA)) {
2158 bp->nb_proc = p;
2159 error = nfs_buf_write(bp);
2160 if (error) {
2161 FSDBG_BOT(515, vp, uio->uio_offset,
2162 uio->uio_resid, error);
2163 return (error);
2164 }
2165 if (np->n_flag & NQNFSNONCACHE) {
2166 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
2167 if (error) {
2168 FSDBG_BOT(515, vp, uio->uio_offset,
2169 uio->uio_resid, error);
2170 return (error);
2171 }
2172 }
2173 } else if ((n + on) == biosize && (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
2174 bp->nb_proc = (struct proc *)0;
2175 SET(bp->nb_flags, NB_ASYNC);
2176 nfs_buf_write(bp);
2177 } else
2178 nfs_buf_write_delayed(bp);
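/*
 * In short, the policy above is: a synchronous write for IO_SYNC,
 * non-cached (VNOCACHE_DATA), or NQNFS non-caching I/O; an immediate
 * asynchronous write when the write reaches the end of the block on a
 * non-NQNFS mount; and a delayed write otherwise.
 */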
2179
2180 if (np->n_needcommitcnt > (nbuf/16))
2181 nfs_flushcommits(vp, p);
2182
2183 } while (uio->uio_resid > 0 && n > 0);
2184
2185 FSDBG_BOT(515, vp, uio->uio_offset, uio->uio_resid, 0);
2186 return (0);
2187 }
2188
2189 /*
2190 * Flush out and invalidate all buffers associated with a vnode.
2191 * Called with the underlying object locked.
2192 */
2193 static int
2194 nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, slptimeo)
2195 register struct vnode *vp;
2196 int flags;
2197 struct ucred *cred;
2198 struct proc *p;
2199 int slpflag, slptimeo;
2200 {
2201 struct nfsbuf *bp;
2202 struct nfsbuf *nbp, *blist;
2203 int s, error = 0;
2204 struct nfsnode *np = VTONFS(vp);
2205
2206 if (flags & V_SAVE) {
2207 if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
2208 return (error);
2209 if (np->n_dirtyblkhd.lh_first)
2210 panic("nfs_vinvalbuf: dirty bufs (vp 0x%x, bp 0x%x)",
2211 vp, np->n_dirtyblkhd.lh_first);
2212 }
2213
2214 for (;;) {
2215 blist = np->n_cleanblkhd.lh_first;
2216 if (!blist)
2217 blist = np->n_dirtyblkhd.lh_first;
2218 if (!blist)
2219 break;
2220
2221 for (bp = blist; bp; bp = nbp) {
2222 nbp = bp->nb_vnbufs.le_next;
2223 s = splbio();
2224 if (ISSET(bp->nb_flags, NB_BUSY)) {
2225 SET(bp->nb_flags, NB_WANTED);
2226 FSDBG_TOP(556, vp, bp, NBOFF(bp), bp->nb_flags);
2227 error = tsleep((caddr_t)bp,
2228 slpflag | (PRIBIO + 1), "nfs_vinvalbuf",
2229 slptimeo);
2230 FSDBG_BOT(556, vp, bp, NBOFF(bp), bp->nb_flags);
2231 splx(s);
2232 if (error) {
2233 FSDBG(554, vp, bp, -1, error);
2234 return (error);
2235 }
2236 break;
2237 }
2238 FSDBG(554, vp, bp, NBOFF(bp), bp->nb_flags);
2239 nfs_buf_remfree(bp);
2240 SET(bp->nb_flags, NB_BUSY);
2241 splx(s);
2242 if ((flags & V_SAVE) && UBCINFOEXISTS(vp) && (NBOFF(bp) < np->n_size)) {
2243 /* XXX extra paranoia: make sure we're not */
2244 /* somehow leaving any dirty data around */
2245 int mustwrite = 0;
2246 int end = (NBOFF(bp) + bp->nb_bufsize >= np->n_size) ?
2247 bp->nb_bufsize : (np->n_size - NBOFF(bp));
2248 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2249 error = nfs_buf_upl_setup(bp);
2250 if (error == EINVAL) {
2251 /* vm object must no longer exist */
2252 /* hopefully we don't need to do */
2253 /* anything for this buffer */
2254 } else if (error)
2255 printf("nfs_vinvalbuf: upl setup failed %d\n",
2256 error);
2257 bp->nb_valid = bp->nb_dirty = 0;
2258 }
2259 nfs_buf_upl_check(bp);
2260 /* check for any dirty data before the EOF */
2261 if (bp->nb_dirtyend && bp->nb_dirtyoff < end) {
2262 /* clip dirty range to EOF */
2263 if (bp->nb_dirtyend > end)
2264 bp->nb_dirtyend = end;
2265 mustwrite++;
2266 }
2267 bp->nb_dirty &= (1 << (round_page_32(end)/PAGE_SIZE)) - 1;
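/*
 * The mask above keeps dirty bits only for pages that overlap the part
 * of the buffer before EOF. For example, assuming 4 KB pages, end = 6000
 * gives round_page_32(end) = 8192 and a mask of (1 << 2) - 1 = 0x3, so
 * only pages 0 and 1 can remain dirty.
 */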
2268 if (bp->nb_dirty)
2269 mustwrite++;
2270 if (mustwrite) {
2271 FSDBG(554, vp, bp, 0xd00dee, bp->nb_flags);
2272 if (!ISSET(bp->nb_flags, NB_PAGELIST))
2273 panic("nfs_vinvalbuf: dirty buffer without upl");
2274 /* gotta write out dirty data before invalidating */
2275 /* (NB_STABLE indicates that data writes should be FILESYNC) */
2276 /* (NB_NOCACHE indicates buffer should be discarded) */
2277 CLR(bp->nb_flags, (NB_DONE | NB_ERROR | NB_INVAL | NB_ASYNC));
2278 SET(bp->nb_flags, NB_STABLE | NB_NOCACHE);
2279 /*
2280 * NFS has embedded ucred so crhold() risks zone corruption
2281 */
2282 if (bp->nb_wcred == NOCRED)
2283 bp->nb_wcred = crdup(cred);
2284 error = nfs_buf_write(bp);
2285 // Note: bp has been released
2286 if (error) {
2287 FSDBG(554, bp, 0xd00dee, 0xbad, error);
2288 np->n_error = error;
2289 np->n_flag |= NWRITEERR;
2290 error = 0;
2291 }
2292 break;
2293 }
2294 }
2295 SET(bp->nb_flags, NB_INVAL);
2296 // Note: We don't want to do FREEUPs here because
2297 // that may modify the buffer chain we're iterating!
2298 nfs_buf_release(bp, 0);
2299 }
2300 }
2301 NFS_BUF_FREEUP();
2302 if (np->n_dirtyblkhd.lh_first || np->n_cleanblkhd.lh_first)
2303 panic("nfs_vinvalbuf: flush failed");
2304 return (0);
2305 }
2306
2307
2308 /*
2309 * Flush and invalidate all dirty buffers. If another process is already
2310 * doing the flush, just wait for completion.
2311 */
2312 int
2313 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
2314 struct vnode *vp;
2315 int flags;
2316 struct ucred *cred;
2317 struct proc *p;
2318 int intrflg;
2319 {
2320 register struct nfsnode *np = VTONFS(vp);
2321 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
2322 int error = 0, slpflag, slptimeo;
2323 int didhold = 0;
2324
2325 FSDBG_TOP(554, vp, flags, intrflg, 0);
2326
2327 if (nmp && ((nmp->nm_flag & NFSMNT_INT) == 0))
2328 intrflg = 0;
2329 if (intrflg) {
2330 slpflag = PCATCH;
2331 slptimeo = 2 * hz;
2332 } else {
2333 slpflag = 0;
2334 slptimeo = 0;
2335 }
2336 /*
2337 * First wait for any other process doing a flush to complete.
2338 */
2339 while (np->n_flag & NFLUSHINPROG) {
2340 np->n_flag |= NFLUSHWANT;
2341 FSDBG_TOP(555, vp, flags, intrflg, np->n_flag);
2342 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo);
2343 FSDBG_BOT(555, vp, flags, intrflg, np->n_flag);
2344 if (error && (error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p))) {
2345 FSDBG_BOT(554, vp, flags, intrflg, error);
2346 return (error);
2347 }
2348 }
2349
2350 /*
2351 * Now, flush as required.
2352 */
2353 np->n_flag |= NFLUSHINPROG;
2354 error = nfs_vinvalbuf_internal(vp, flags, cred, p, slpflag, 0);
2355 while (error) {
2356 FSDBG(554, vp, 0, 0, error);
2357 error = nfs_sigintr(VFSTONFS(vp->v_mount), NULL, p);
2358 if (error) {
2359 np->n_flag &= ~NFLUSHINPROG;
2360 if (np->n_flag & NFLUSHWANT) {
2361 np->n_flag &= ~NFLUSHWANT;
2362 wakeup((caddr_t)&np->n_flag);
2363 }
2364 FSDBG_BOT(554, vp, flags, intrflg, error);
2365 return (error);
2366 }
2367 error = nfs_vinvalbuf_internal(vp, flags, cred, p, 0, slptimeo);
2368 }
2369 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
2370 if (np->n_flag & NFLUSHWANT) {
2371 np->n_flag &= ~NFLUSHWANT;
2372 wakeup((caddr_t)&np->n_flag);
2373 }
2374 didhold = ubc_hold(vp);
2375 if (didhold) {
2376 int rv = ubc_clean(vp, 1); /* get the pages out of vm also */
2377 if (!rv)
2378 panic("nfs_vinvalbuf(): ubc_clean failed!");
2379 ubc_rele(vp);
2380 }
2381 FSDBG_BOT(554, vp, flags, intrflg, 0);
2382 return (0);
2383 }
2384
2385 /*
2386 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
2387 * This is mainly to avoid queueing async I/O requests when the nfsiods
2388 * are all hung on a dead server.
2389 */
2390 int
2391 nfs_asyncio(bp, cred)
2392 struct nfsbuf *bp;
2393 struct ucred *cred;
2394 {
2395 struct nfsmount *nmp;
2396 int i;
2397 int gotiod;
2398 int slpflag = 0;
2399 int slptimeo = 0;
2400 int error, error2;
2401
2402 if (nfs_numasync == 0)
2403 return (EIO);
2404
2405 FSDBG_TOP(552, bp, bp ? NBOFF(bp) : 0, bp ? bp->nb_flags : 0, 0);
2406
2407 nmp = ((bp != NULL) ? VFSTONFS(bp->nb_vp->v_mount) : NULL);
2408 again:
2409 if (nmp && nmp->nm_flag & NFSMNT_INT)
2410 slpflag = PCATCH;
2411 gotiod = FALSE;
2412
2413 /* no nfsbuf means tell nfsiod to process delwri list */
2414 if (!bp)
2415 nfs_ioddelwri = 1;
2416
2417 /*
2418 * Find a free iod to process this request.
2419 */
2420 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
2421 if (nfs_iodwant[i]) {
2422 /*
2423 * Found one, so wake it up and tell it which
2424 * mount to process.
2425 */
2426 NFS_DPF(ASYNCIO,
2427 ("nfs_asyncio: waking iod %d for mount %p\n",
2428 i, nmp));
2429 nfs_iodwant[i] = (struct proc *)0;
2430 nfs_iodmount[i] = nmp;
2431 if (nmp)
2432 nmp->nm_bufqiods++;
2433 wakeup((caddr_t)&nfs_iodwant[i]);
2434 gotiod = TRUE;
2435 break;
2436 }
2437
2438 /* if we're just poking the delwri list, we're done */
2439 if (!bp)
2440 return (0);
2441
2442 /*
2443 * If none are free, we may already have an iod working on this mount
2444 * point. If so, it will process our request.
2445 */
2446 if (!gotiod) {
2447 if (nmp->nm_bufqiods > 0) {
2448 NFS_DPF(ASYNCIO,
2449 ("nfs_asyncio: %d iods are already processing mount %p\n",
2450 nmp->nm_bufqiods, nmp));
2451 gotiod = TRUE;
2452 }
2453 }
2454
2455 /*
2456 * If we have an iod which can process the request, then queue
2457 * the buffer.
2458 */
2459 FSDBG(552, bp, gotiod, i, nmp->nm_bufqiods);
2460 if (gotiod) {
2461 /*
2462 * Ensure that the queue never grows too large.
2463 */
2464 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
2465 if (ISSET(bp->nb_flags, NB_IOD)) {
2466 /* An nfsiod is attempting this async operation so */
2467 /* we must not fall asleep on the bufq because we */
2468 /* could be waiting on ourself. Just return error */
2469 /* and we'll do this operation synchronously. */
2470 goto out;
2471 }
2472 FSDBG(552, bp, nmp->nm_bufqlen, 2*nfs_numasync, -1);
2473 NFS_DPF(ASYNCIO,
2474 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
2475 nmp->nm_bufqwant = TRUE;
2476 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
2477 "nfsaio", slptimeo);
2478 if (error) {
2479 error2 = nfs_sigintr(nmp, NULL, bp->nb_proc);
2480 if (error2) {
2481 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, error2);
2482 return (error2);
2483 }
2484 if (slpflag == PCATCH) {
2485 slpflag = 0;
2486 slptimeo = 2 * hz;
2487 }
2488 }
2489 /*
2490 * We might have lost our iod while sleeping,
2491 * so check and loop if necessary.
2492 */
2493 if (nmp->nm_bufqiods == 0) {
2494 NFS_DPF(ASYNCIO,
2495 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
2496 goto again;
2497 }
2498 }
2499
2500 if (ISSET(bp->nb_flags, NB_READ)) {
2501 if (bp->nb_rcred == NOCRED && cred != NOCRED) {
2502 /*
2503 * NFS has embedded ucred.
2504 * Can not crhold() here as that causes zone corruption
2505 */
2506 bp->nb_rcred = crdup(cred);
2507 }
2508 } else {
2509 SET(bp->nb_flags, NB_WRITEINPROG);
2510 if (bp->nb_wcred == NOCRED && cred != NOCRED) {
2511 /*
2512 * NFS has embedded ucred.
2513 * Can not crhold() here as that causes zone corruption
2514 */
2515 bp->nb_wcred = crdup(cred);
2516 }
2517 }
2518
2519 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, nb_free);
2520 nmp->nm_bufqlen++;
2521 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, 0);
2522 return (0);
2523 }
2524
2525 out:
2526 /*
2527 * All the iods are busy on other mounts, so return EIO to
2528 * force the caller to process the i/o synchronously.
2529 */
2530 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
2531 FSDBG_BOT(552, bp, NBOFF(bp), bp->nb_flags, EIO);
2532 return (EIO);
2533 }
2534
2535 /*
2536 * Do an I/O operation to/from a cache block. This may be called
2537 * synchronously or from an nfsiod.
2538 */
2539 int
2540 nfs_doio(bp, cr, p)
2541 struct nfsbuf *bp;
2542 struct ucred *cr;
2543 struct proc *p;
2544 {
2545 register struct uio *uiop;
2546 register struct vnode *vp;
2547 struct nfsnode *np;
2548 struct nfsmount *nmp;
2549 int error = 0, diff, len, iomode, must_commit = 0;
2550 struct uio uio;
2551 struct iovec io;
2552
2553 vp = bp->nb_vp;
2554 np = VTONFS(vp);
2555 nmp = VFSTONFS(vp->v_mount);
2556 uiop = &uio;
2557 uiop->uio_iov = &io;
2558 uiop->uio_iovcnt = 1;
2559 uiop->uio_segflg = UIO_SYSSPACE;
2560 uiop->uio_procp = p;
2561
2562 /*
2563 * we've decided to perform I/O for this block,
2564 * so it can't already be NB_DONE. So, clear it.
2565 */
2566 if (ISSET(bp->nb_flags, NB_DONE)) {
2567 if (!ISSET(bp->nb_flags, NB_ASYNC))
2568 panic("nfs_doio: done and not async");
2569 CLR(bp->nb_flags, NB_DONE);
2570 }
2571 FSDBG_TOP(256, np->n_size, NBOFF(bp), bp->nb_bufsize, bp->nb_flags);
2572 FSDBG(257, bp->nb_validoff, bp->nb_validend, bp->nb_dirtyoff,
2573 bp->nb_dirtyend);
2574
2575 if (ISSET(bp->nb_flags, NB_READ)) {
2576 if (vp->v_type == VREG)
2577 NFS_BUF_MAP(bp);
2578 io.iov_len = uiop->uio_resid = bp->nb_bufsize;
2579 io.iov_base = bp->nb_data;
2580 uiop->uio_rw = UIO_READ;
2581 switch (vp->v_type) {
2582 case VREG:
2583 uiop->uio_offset = NBOFF(bp);
2584 nfsstats.read_bios++;
2585 error = nfs_readrpc(vp, uiop, cr);
2586 FSDBG(262, np->n_size, NBOFF(bp), uiop->uio_resid, error);
2587 if (!error) {
2588 /* update valid range */
2589 bp->nb_validoff = 0;
2590 if (uiop->uio_resid) {
2591 /*
2592 * If len > 0, there is a hole in the file and
2593 * no writes after the hole have been pushed to
2594 * the server yet.
2595 * Just zero fill the rest of the valid area.
2596 */
2597 diff = bp->nb_bufsize - uiop->uio_resid;
2598 len = np->n_size - (NBOFF(bp) + diff);
2599 if (len > 0) {
2600 len = min(len, uiop->uio_resid);
2601 bzero((char *)bp->nb_data + diff, len);
2602 bp->nb_validend = diff + len;
2603 FSDBG(258, diff, len, 0, 1);
2604 } else
2605 bp->nb_validend = diff;
2606 } else
2607 bp->nb_validend = bp->nb_bufsize;
2608 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
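/*
 * For illustration (example values, assuming 4 KB pages): a 32 KB buffer
 * at offset 0 of a 5000 byte file reads back 5000 bytes, so
 * nb_validend = 5000, round_page_32(5000) = 8192, and
 * nb_valid = (1 << 2) - 1 = 0x3 (pages 0 and 1 valid); the rest of the
 * buffer past nb_validend is zeroed just below.
 */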
2609 if (bp->nb_validend & PAGE_MASK) {
2610 /* valid range ends in the middle of a page so we */
2611 /* need to zero-fill any invalid data at the end */
2612 /* of the last page */
2613 bzero((caddr_t)(bp->nb_data + bp->nb_validend),
2614 bp->nb_bufsize - bp->nb_validend);
2615 FSDBG(258, bp->nb_validend,
2616 bp->nb_bufsize - bp->nb_validend, 0, 2);
2617 }
2618 }
2619 if (p && (vp->v_flag & VTEXT) &&
2620 (((nmp->nm_flag & NFSMNT_NQNFS) &&
2621 NQNFS_CKINVALID(vp, np, ND_READ) &&
2622 np->n_lrev != np->n_brev) ||
2623 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
2624 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
2625 uprintf("Process killed due to text file modification\n");
2626 psignal(p, SIGKILL);
2627 p->p_flag |= P_NOSWAP;
2628 }
2629 break;
2630 case VLNK:
2631 uiop->uio_offset = (off_t)0;
2632 nfsstats.readlink_bios++;
2633 error = nfs_readlinkrpc(vp, uiop, cr);
2634 if (!error) {
2635 bp->nb_validoff = 0;
2636 bp->nb_validend = uiop->uio_offset;
2637 }
2638 break;
2639 case VDIR:
2640 nfsstats.readdir_bios++;
2641 uiop->uio_offset = NBOFF(bp);
2642 if (!(nmp->nm_flag & NFSMNT_NFSV3))
2643 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; /* dk@farm.org */
2644 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
2645 error = nfs_readdirplusrpc(vp, uiop, cr);
2646 if (error == NFSERR_NOTSUPP)
2647 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
2648 }
2649 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
2650 error = nfs_readdirrpc(vp, uiop, cr);
2651 if (!error) {
2652 bp->nb_validoff = 0;
2653 bp->nb_validend = uiop->uio_offset - NBOFF(bp);
2654 bp->nb_valid = (1 << (round_page_32(bp->nb_validend)/PAGE_SIZE)) - 1;
2655 }
2656 break;
2657 default:
2658 printf("nfs_doio: type %x unexpected\n", vp->v_type);
2659 break;
2660 }
2661 if (error) {
2662 SET(bp->nb_flags, NB_ERROR);
2663 bp->nb_error = error;
2664 }
2665
2666 } else {
2667 /* we're doing a write */
2668 int doff, dend = 0;
2669
2670 /* We need to make sure the pages are locked before doing I/O. */
2671 if (!ISSET(bp->nb_flags, NB_META) && UBCISVALID(vp)) {
2672 if (!ISSET(bp->nb_flags, NB_PAGELIST)) {
2673 error = nfs_buf_upl_setup(bp);
2674 if (error) {
2675 printf("nfs_doio: upl create failed %d\n", error);
2676 SET(bp->nb_flags, NB_ERROR);
2677 bp->nb_error = EIO;
2678 return (EIO);
2679 }
2680 nfs_buf_upl_check(bp);
2681 }
2682 }
2683
2684 if (ISSET(bp->nb_flags, NB_WASDIRTY)) {
2685 FSDBG(256, bp, NBOFF(bp), bp->nb_dirty, 0xd00dee);
2686 /*
2687 * There are pages marked dirty that need to be written out.
2688 *
2689 * We don't want to just combine the write range with the
2690 * range of pages that are dirty because that could cause us
2691 * to write out data that was never actually written by the application.
2692 * We also don't want to write data more than once.
2693 *
2694 * If the dirty range just needs to be committed, we do that.
2695 * Otherwise, we write the dirty range and clear the dirty bits
2696 * for any COMPLETE pages covered by that range.
2697 * If there are dirty pages left after that, we write out the
2698 * parts that we haven't written yet.
2699 */
2700 }
2701
2702 /*
2703 * If NB_NEEDCOMMIT is set, a commit rpc may do the trick. If not
2704 * an actual write will have to be done.
2705 * If NB_WRITEINPROG is already set, then push it with a write anyhow.
2706 */
2707 if ((bp->nb_flags & (NB_NEEDCOMMIT | NB_WRITEINPROG)) == NB_NEEDCOMMIT) {
2708 doff = NBOFF(bp) + bp->nb_dirtyoff;
2709 SET(bp->nb_flags, NB_WRITEINPROG);
2710 error = nfs_commit(vp, doff, bp->nb_dirtyend - bp->nb_dirtyoff,
2711 bp->nb_wcred, bp->nb_proc);
2712 CLR(bp->nb_flags, NB_WRITEINPROG);
2713 if (!error) {
2714 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2715 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2716 np->n_needcommitcnt--;
2717 CHECK_NEEDCOMMITCNT(np);
2718 } else if (error == NFSERR_STALEWRITEVERF)
2719 nfs_clearcommit(vp->v_mount);
2720 }
2721
2722 if (!error && bp->nb_dirtyend > 0) {
2723 /* there's a dirty range that needs to be written out */
2724 u_int32_t pagemask;
2725 int firstpg, lastpg;
2726
2727 if (NBOFF(bp) + bp->nb_dirtyend > np->n_size)
2728 bp->nb_dirtyend = np->n_size - NBOFF(bp);
2729
2730 NFS_BUF_MAP(bp);
2731
2732 doff = bp->nb_dirtyoff;
2733 dend = bp->nb_dirtyend;
2734
2735 /* if doff page is dirty, move doff to start of page */
2736 if (NBPGDIRTY(bp,doff/PAGE_SIZE))
2737 doff -= doff & PAGE_MASK;
2738 /* try to expand write range to include preceding dirty pages */
2739 if (!(doff & PAGE_MASK))
2740 while (doff > 0 && NBPGDIRTY(bp,(doff-1)/PAGE_SIZE))
2741 doff -= PAGE_SIZE;
2742 /* if dend page is dirty, move dend to start of next page */
2743 if ((dend & PAGE_MASK) && NBPGDIRTY(bp,dend/PAGE_SIZE))
2744 dend = round_page_32(dend);
2745 /* try to expand write range to include trailing dirty pages */
2746 if (!(dend & PAGE_MASK))
2747 while (dend < bp->nb_bufsize && NBPGDIRTY(bp,dend/PAGE_SIZE))
2748 dend += PAGE_SIZE;
2749 /* make sure to keep dend clipped to EOF */
2750 if (NBOFF(bp) + dend > np->n_size)
2751 dend = np->n_size - NBOFF(bp);
2752 /* calculate range of complete pages being written */
2753 firstpg = round_page_32(doff) / PAGE_SIZE;
2754 lastpg = (trunc_page_32(dend) - 1)/ PAGE_SIZE;
2755 /* calculate mask for that page range */
2756 pagemask = ((1 << (lastpg+1)) - 1) & ~((1 << firstpg) - 1);
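/*
 * For illustration (example values, assuming 4 KB pages): if the expanded
 * dirty range is doff = 4096, dend = 12288, then firstpg = 1, lastpg = 2,
 * and pagemask = 0x6, i.e. exactly the pages completely covered by this
 * write.
 */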
2757
2758 /* compare page mask to nb_dirty; if there are other dirty pages */
2759 /* then write FILESYNC; otherwise, write UNSTABLE if async and */
2760 /* not needcommit/nocache/stable; otherwise write FILESYNC */
2761 if (bp->nb_dirty & ~pagemask)
2762 iomode = NFSV3WRITE_FILESYNC;
2763 else if ((bp->nb_flags & (NB_ASYNC | NB_NEEDCOMMIT | NB_NOCACHE | NB_STABLE)) == NB_ASYNC)
2764 iomode = NFSV3WRITE_UNSTABLE;
2765 else
2766 iomode = NFSV3WRITE_FILESYNC;
2767
2768 /* write the dirty range */
2769 io.iov_len = uiop->uio_resid = dend - doff;
2770 uiop->uio_offset = NBOFF(bp) + doff;
2771 io.iov_base = (char *)bp->nb_data + doff;
2772 uiop->uio_rw = UIO_WRITE;
2773
2774 nfsstats.write_bios++;
2775
2776 SET(bp->nb_flags, NB_WRITEINPROG);
2777 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2778 if (must_commit)
2779 nfs_clearcommit(vp->v_mount);
2780 /* clear dirty bits for pages we've written */
2781 if (!error)
2782 bp->nb_dirty &= ~pagemask;
2783 /* set/clear needcommit flag */
2784 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
2785 if (!ISSET(bp->nb_flags, NB_NEEDCOMMIT))
2786 np->n_needcommitcnt++;
2787 SET(bp->nb_flags, NB_NEEDCOMMIT);
2788 /* make sure nb_dirtyoff/nb_dirtyend reflect actual range written */
2789 bp->nb_dirtyoff = doff;
2790 bp->nb_dirtyend = dend;
2791 } else {
2792 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2793 np->n_needcommitcnt--;
2794 CHECK_NEEDCOMMITCNT(np);
2795 }
2796 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2797 }
2798 CLR(bp->nb_flags, NB_WRITEINPROG);
2799 /*
2800 * For an interrupted write, the buffer is still valid and the write
2801 * hasn't been pushed to the server yet, so we can't set NB_ERROR;
2802 * instead we report the interruption by setting NB_EINTR. For the NB_ASYNC case,
2803 * NB_EINTR is not relevant.
2804 *
2805 * For the case of a V3 write rpc not being committed to stable
2806 * storage, the block is still dirty and requires either a commit rpc
2807 * or another write rpc with iomode == NFSV3WRITE_FILESYNC before the
2808 * block is reused. This is indicated by setting the NB_DELWRI and
2809 * NB_NEEDCOMMIT flags.
2810 */
2811 if (error == EINTR || (!error && bp->nb_flags & NB_NEEDCOMMIT)) {
2812 CLR(bp->nb_flags, NB_INVAL | NB_NOCACHE);
2813 if (!ISSET(bp->nb_flags, NB_DELWRI)) {
2814 SET(bp->nb_flags, NB_DELWRI);
2815 nfs_nbdwrite++;
2816 NFSBUFCNTCHK();
2817 }
2818 FSDBG(261, bp->nb_validoff, bp->nb_validend,
2819 bp->nb_bufsize, 0);
2820 /*
2821 * Since for the NB_ASYNC case, nfs_bwrite() has
2822 * reassigned the buffer to the clean list, we have to
2823 * reassign it back to the dirty one. Ugh.
2824 */
2825 if (ISSET(bp->nb_flags, NB_ASYNC)) {
2826 /* move to dirty list */
2827 int s = splbio();
2828 if (bp->nb_vnbufs.le_next != NFSNOLIST)
2829 LIST_REMOVE(bp, nb_vnbufs);
2830 LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs);
2831 splx(s);
2832 } else {
2833 SET(bp->nb_flags, NB_EINTR);
2834 }
2835 } else {
2836 /* either there's an error or we don't need to commit */
2837 if (error) {
2838 SET(bp->nb_flags, NB_ERROR);
2839 bp->nb_error = np->n_error = error;
2840 np->n_flag |= NWRITEERR;
2841 }
2842 /* clear the dirty range */
2843 bp->nb_dirtyoff = bp->nb_dirtyend = 0;
2844 }
2845 }
2846
2847 if (!error && bp->nb_dirty) {
2848 /* there are pages marked dirty that need to be written out */
2849 int pg, cnt, npages, off, len;
2850
2851 nfsstats.write_bios++;
2852
2853 NFS_BUF_MAP(bp);
2854
2855 /*
2856 * we do these writes synchronously because we can't really
2857 * support the unstable/needcommit method. We could write
2858 * them unstable, clear the dirty bits, and then commit the
2859 * whole block later, but if we need to rewrite the data, we
2860 * won't have any idea which pages were written because that
2861 * info can't be stored in the nb_dirtyoff/nb_dirtyend. We
2862 * also can't leave the dirty bits set because then we wouldn't
2863 * be able to tell if the pages were re-dirtied between the end
2864 * of the write and the commit.
2865 */
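/*
 * For illustration (example values, assuming 4 KB pages): with
 * nb_dirty = 0x6 (pages 1 and 2 dirty), the loop below finds the run at
 * pg = 1 with cnt = 2 and issues a single FILESYNC write of 8192 bytes at
 * buffer offset 4096 (clipped to EOF if necessary), then clears those two
 * dirty bits.
 */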
2866 iomode = NFSV3WRITE_FILESYNC;
2867 uiop->uio_rw = UIO_WRITE;
2868
2869 SET(bp->nb_flags, NB_WRITEINPROG);
2870 npages = bp->nb_bufsize/PAGE_SIZE;
2871 for (pg=0; pg < npages; pg++) {
2872 if (!NBPGDIRTY(bp,pg))
2873 continue;
2874 cnt = 1;
2875 while (((pg+cnt) < npages) && NBPGDIRTY(bp,pg+cnt))
2876 cnt++;
2877 /* write cnt pages starting with page pg */
2878 off = pg * PAGE_SIZE;
2879 len = cnt * PAGE_SIZE;
2880
2881 /* clip writes to EOF */
2882 if (NBOFF(bp) + off + len > np->n_size)
2883 len -= (NBOFF(bp) + off + len) - np->n_size;
2884 if (len > 0) {
2885 io.iov_len = uiop->uio_resid = len;
2886 uiop->uio_offset = NBOFF(bp) + off;
2887 io.iov_base = (char *)bp->nb_data + off;
2888 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
2889 if (must_commit)
2890 nfs_clearcommit(vp->v_mount);
2891 if (error)
2892 break;
2893 }
2894 /* clear dirty bits */
2895 while (cnt--) {
2896 bp->nb_dirty &= ~(1 << pg);
2897 /* leave pg on last page */
2898 if (cnt) pg++;
2899 }
2900 }
2901 if (!error) {
2902 if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) {
2903 np->n_needcommitcnt--;
2904 CHECK_NEEDCOMMITCNT(np);
2905 }
2906 CLR(bp->nb_flags, NB_NEEDCOMMIT);
2907 }
2908 CLR(bp->nb_flags, NB_WRITEINPROG);
2909 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize,
2910 np->n_size);
2911 }
2912
2913 if (error) {
2914 SET(bp->nb_flags, NB_ERROR);
2915 bp->nb_error = error;
2916 }
2917 }
2918
2919 FSDBG_BOT(256, bp->nb_validoff, bp->nb_validend, bp->nb_bufsize, error);
2920
2921 nfs_buf_iodone(bp);
2922 return (error);
2923 }