]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_bio.c
4ca0b0e67fa79e1e04caf755822b05abf8740246
[apple/xnu.git] / bsd / vfs / vfs_bio.c
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*-
27 * Copyright (c) 1994 Christopher G. Demetriou
28 * Copyright (c) 1982, 1986, 1989, 1993
29 * The Regents of the University of California. All rights reserved.
30 * (c) UNIX System Laboratories, Inc.
31 * All or some portions of this file are derived from material licensed
32 * to the University of California by American Telephone and Telegraph
33 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
34 * the permission of UNIX System Laboratories, Inc.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 3. All advertising materials mentioning features or use of this software
45 * must display the following acknowledgement:
46 * This product includes software developed by the University of
47 * California, Berkeley and its contributors.
48 * 4. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * The NEXTSTEP Software License Agreement specifies the terms
65 * and conditions for redistribution.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70 /*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
74 * UNIX Operating System (Addison Wesley, 1989)
75 */
76
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/proc.h>
80 #include <sys/buf.h>
81 #include <sys/vnode.h>
82 #include <sys/mount.h>
83 #include <sys/trace.h>
84 #include <sys/malloc.h>
85 #include <sys/resourcevar.h>
86 #include <miscfs/specfs/specdev.h>
87 #include <sys/ubc.h>
88 #include <vm/vm_pageout.h>
89 #if DIAGNOSTIC
90 #include <kern/assert.h>
91 #endif /* DIAGNOSTIC */
92 #include <kern/task.h>
93 #include <kern/zalloc.h>
94
95 #include <sys/kdebug.h>
96 #include <machine/spl.h>
97
98 static __inline__ void bufqinc(int q);
99 static __inline__ void bufqdec(int q);
100
101 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
102 static int bcleanbuf(struct buf *bp);
103 extern void vwakeup();
104
105 extern int niobuf; /* The number of IO buffer headers for cluster IO */
106 int blaundrycnt;
107
108 /* zone allocated buffer headers */
109 static zone_t buf_hdr_zone;
110 static int buf_hdr_count;
111
112 #if TRACE
113 struct proc *traceproc;
114 int tracewhich, tracebuf[TRCSIZ];
115 u_int tracex;
116 char traceflags[TR_NFLAGS];
117 #endif /* TRACE */
118
119 /*
120 * Definitions for the buffer hash lists.
121 */
122 #define BUFHASH(dvp, lbn) \
123 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
124 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
125 u_long bufhash;
126
127 /* Definitions for the buffer stats. */
128 struct bufstats bufstats;
129
130 /* Number of delayed write buffers */
131 int nbdwrite = 0;
132
133 /*
134 * Insq/Remq for the buffer hash lists.
135 */
136 #if 0
137 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
138 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
139 #endif /* 0 */
140
141
142 TAILQ_HEAD(ioqueue, buf) iobufqueue;
143 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
144 static int needbuffer;
145 static int need_iobuffer;
146
/*
 * Insertion helpers for the buffer free lists.
 *
 * binsheadfree() links bp at the head of queue dp, binstailfree() at
 * its tail.  Both then pass the queue index to bufqinc(), store it in
 * bp->b_whichq and stamp bp->b_timestamp with the current time.tv_sec.
 */
#define binsheadfree(bp, dp, whichq)	do { \
	TAILQ_INSERT_HEAD((dp), (bp), b_freelist); \
	bufqinc((whichq)); \
	(bp)->b_whichq = (whichq); \
	(bp)->b_timestamp = time.tv_sec; \
} while (0)

#define binstailfree(bp, dp, whichq)	do { \
	TAILQ_INSERT_TAIL((dp), (bp), b_freelist); \
	bufqinc((whichq)); \
	(bp)->b_whichq = (whichq); \
	(bp)->b_timestamp = time.tv_sec; \
} while (0)
163
/*
 * Hash-link sentinel helpers.
 *
 * A buffer that is on no hash chain has b_hash.le_next set to NULL and
 * b_hash.le_prev set to the marker value 0xdeadbeef.
 *
 * BHASHENTCHECK(bp) panics unless bp carries that marker.
 * BLISTNONE(bp) puts bp into that "on no chain" state.
 *
 * Both are wrapped in do { } while (0) so that each expands to exactly
 * one statement: they stay correct under an unbraced if/else, and
 * every use must be terminated with a semicolon.
 */
#define BHASHENTCHECK(bp)	do { \
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
		panic("%x: b_hash.le_prev is not deadbeef", (bp)); \
} while (0)

#define BLISTNONE(bp)	do { \
	(bp)->b_hash.le_next = (struct buf *)0; \
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef; \
} while (0)
171
/*
 * Insq/Remq for the vnode usage lists.
 *
 * bufinsvn(bp, dp) links bp at the head of the vnode buffer list dp.
 * bufremvn(bp) unlinks bp from whatever vnode list it is on and marks
 * it as being on none by storing NOLIST in b_vnbufs.le_next.
 *
 * bufremvn() is a do { } while (0) macro so that it is a single
 * statement and can be followed by an `else'.
 */
#define bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp)	do { \
	LIST_REMOVE(bp, b_vnbufs); \
	(bp)->b_vnbufs.le_next = NOLIST; \
} while (0)
180
181 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
182
183 /* number of per vnode, "in flight" buffer writes */
184 #define BUFWRITE_THROTTLE 9
185
186
187 /*
188 * Time in seconds before a buffer on a list is
189 * considered as a stale buffer
190 */
191 #define LRU_IS_STALE 120 /* default value for the LRU */
192 #define AGE_IS_STALE 60 /* default value for the AGE */
193 #define META_IS_STALE 180 /* default value for the BQ_META */
194
195 int lru_is_stale = LRU_IS_STALE;
196 int age_is_stale = AGE_IS_STALE;
197 int meta_is_stale = META_IS_STALE;
198
199 /* LIST_INSERT_HEAD() with assertions */
200 static __inline__ void
201 blistenterhead(struct bufhashhdr * head, struct buf * bp)
202 {
203 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
204 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
205 (head)->lh_first = bp;
206 bp->b_hash.le_prev = &(head)->lh_first;
207 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
208 panic("blistenterhead: le_prev is deadbeef");
209 }
210
211 static __inline__ void
212 binshash(struct buf *bp, struct bufhashhdr *dp)
213 {
214 struct buf *nbp;
215
216 simple_lock(&bufhashlist_slock);
217
218 #if 0
219 if((bad = incore(bp->b_vp, bp->b_lblkno)))
220 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
221 #endif /* 0 */
222
223 BHASHENTCHECK(bp);
224
225 nbp = dp->lh_first;
226 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
227 if(nbp == bp)
228 panic("buf already in hashlist");
229 }
230
231 blistenterhead(dp, bp);
232 simple_unlock(&bufhashlist_slock);
233 }
234
235 static __inline__ void
236 bremhash(struct buf *bp)
237 {
238 simple_lock(&bufhashlist_slock);
239 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
240 panic("bremhash le_prev is deadbeef");
241 if (bp->b_hash.le_next == bp)
242 panic("bremhash: next points to self");
243
244 if (bp->b_hash.le_next != NULL)
245 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
246 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
247 simple_unlock(&bufhashlist_slock);
248 }
249
250 /*
251 * Remove a buffer from the free list it's on
252 */
253 void
254 bremfree(bp)
255 struct buf *bp;
256 {
257 struct bqueues *dp = NULL;
258 int whichq = -1;
259
260 /*
261 * We only calculate the head of the freelist when removing
262 * the last element of the list as that is the only time that
263 * it is needed (e.g. to reset the tail pointer).
264 *
265 * NB: This makes an assumption about how tailq's are implemented.
266 */
267 if (bp->b_freelist.tqe_next == NULL) {
268 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
269 if (dp->tqh_last == &bp->b_freelist.tqe_next)
270 break;
271 if (dp == &bufqueues[BQUEUES])
272 panic("bremfree: lost tail");
273 }
274 TAILQ_REMOVE(dp, bp, b_freelist);
275 whichq = bp->b_whichq;
276 bufqdec(whichq);
277 bp->b_whichq = -1;
278 bp->b_timestamp = 0;
279 }
280
281 /*
282 * Associate a buffer with a vnode.
283 */
284 static void
285 bgetvp(vp, bp)
286 register struct vnode *vp;
287 register struct buf *bp;
288 {
289
290 if (bp->b_vp != vp)
291 panic("bgetvp: not free");
292 VHOLD(vp);
293 bp->b_vp = vp;
294 if (vp->v_type == VBLK || vp->v_type == VCHR)
295 bp->b_dev = vp->v_rdev;
296 else
297 bp->b_dev = NODEV;
298 /*
299 * Insert onto list for new vnode.
300 */
301 bufinsvn(bp, &vp->v_cleanblkhd);
302 }
303
304 /*
305 * Disassociate a buffer from a vnode.
306 */
307 static void
308 brelvp(bp)
309 register struct buf *bp;
310 {
311 struct vnode *vp;
312
313 if (bp->b_vp == (struct vnode *) 0)
314 panic("brelvp: NULL vp");
315 /*
316 * Delete from old vnode list, if on one.
317 */
318 if (bp->b_vnbufs.le_next != NOLIST)
319 bufremvn(bp);
320 vp = bp->b_vp;
321 bp->b_vp = (struct vnode *) 0;
322 HOLDRELE(vp);
323 }
324
325 /*
326 * Reassign a buffer from one vnode to another.
327 * Used to assign file specific control information
328 * (indirect blocks) to the vnode to which they belong.
329 */
330 void
331 reassignbuf(bp, newvp)
332 register struct buf *bp;
333 register struct vnode *newvp;
334 {
335 register struct buflists *listheadp;
336
337 if (newvp == NULL) {
338 printf("reassignbuf: NULL");
339 return;
340 }
341 /*
342 * Delete from old vnode list, if on one.
343 */
344 if (bp->b_vnbufs.le_next != NOLIST)
345 bufremvn(bp);
346 /*
347 * If dirty, put on list of dirty buffers;
348 * otherwise insert onto list of clean buffers.
349 */
350 if (ISSET(bp->b_flags, B_DELWRI))
351 listheadp = &newvp->v_dirtyblkhd;
352 else
353 listheadp = &newvp->v_cleanblkhd;
354 bufinsvn(bp, listheadp);
355 }
356
357 static __inline__ void
358 bufhdrinit(struct buf *bp)
359 {
360 bzero((char *)bp, sizeof *bp);
361 bp->b_dev = NODEV;
362 bp->b_rcred = NOCRED;
363 bp->b_wcred = NOCRED;
364 bp->b_vnbufs.le_next = NOLIST;
365 bp->b_flags = B_INVAL;
366
367 return;
368 }
369
370 /*
371 * Initialize buffers and hash links for buffers.
372 */
373 __private_extern__ void
374 bufinit()
375 {
376 register struct buf *bp;
377 register struct bqueues *dp;
378 register int i;
379 int metabuf;
380 long whichq;
381 static void bufzoneinit();
382 static void bcleanbuf_thread_init();
383
384 /* Initialize the buffer queues ('freelists') and the hash table */
385 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
386 TAILQ_INIT(dp);
387 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
388
389 simple_lock_init(&bufhashlist_slock );
390
391 metabuf = nbuf/8; /* reserved for meta buf */
392
393 /* Initialize the buffer headers */
394 for (i = 0; i < nbuf; i++) {
395 bp = &buf[i];
396 bufhdrinit(bp);
397
398 /*
399 * metabuf buffer headers on the meta-data list and
400 * rest of the buffer headers on the empty list
401 */
402 if (--metabuf)
403 whichq = BQ_META;
404 else
405 whichq = BQ_EMPTY;
406
407 BLISTNONE(bp);
408 dp = &bufqueues[whichq];
409 binsheadfree(bp, dp, whichq);
410 binshash(bp, &invalhash);
411 }
412
413 for (; i < nbuf + niobuf; i++) {
414 bp = &buf[i];
415 bufhdrinit(bp);
416 binsheadfree(bp, &iobufqueue, -1);
417 }
418
419 printf("using %d buffer headers and %d cluster IO buffer headers\n",
420 nbuf, niobuf);
421
422 /* Set up zones used by the buffer cache */
423 bufzoneinit();
424
425 /* start the bcleanbuf() thread */
426 bcleanbuf_thread_init();
427
428 #if 0 /* notyet */
429 {
430 static void bufq_balance_thread_init();
431 /* create a thread to do dynamic buffer queue balancing */
432 bufq_balance_thread_init();
433 }
434 #endif /* notyet */
435 }
436
437 static struct buf *
438 bio_doread(vp, blkno, size, cred, async, queuetype)
439 struct vnode *vp;
440 daddr_t blkno;
441 int size;
442 struct ucred *cred;
443 int async;
444 int queuetype;
445 {
446 register struct buf *bp;
447 struct proc *p = current_proc();
448
449 bp = getblk(vp, blkno, size, 0, 0, queuetype);
450
451 /*
452 * If buffer does not have data valid, start a read.
453 * Note that if buffer is B_INVAL, getblk() won't return it.
454 * Therefore, it's valid if it's I/O has completed or been delayed.
455 */
456 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
457 /* Start I/O for the buffer (keeping credentials). */
458 SET(bp->b_flags, B_READ | async);
459 if (cred != NOCRED && bp->b_rcred == NOCRED) {
460 /*
461 * NFS has embedded ucred.
462 * Can not crhold() here as that causes zone corruption
463 */
464 bp->b_rcred = crdup(cred);
465 }
466
467 VOP_STRATEGY(bp);
468
469 trace(TR_BREADMISS, pack(vp, size), blkno);
470
471 /* Pay for the read. */
472 if (p && p->p_stats)
473 p->p_stats->p_ru.ru_inblock++; /* XXX */
474 } else if (async) {
475 brelse(bp);
476 }
477
478 trace(TR_BREADHIT, pack(vp, size), blkno);
479
480 return (bp);
481 }
482 /*
483 * Read a disk block.
484 * This algorithm described in Bach (p.54).
485 */
486 int
487 bread(vp, blkno, size, cred, bpp)
488 struct vnode *vp;
489 daddr_t blkno;
490 int size;
491 struct ucred *cred;
492 struct buf **bpp;
493 {
494 register struct buf *bp;
495
496 /* Get buffer for block. */
497 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
498
499 /* Wait for the read to complete, and return result. */
500 return (biowait(bp));
501 }
502
503 /*
504 * Read a disk block. [bread() for meta-data]
505 * This algorithm described in Bach (p.54).
506 */
507 int
508 meta_bread(vp, blkno, size, cred, bpp)
509 struct vnode *vp;
510 daddr_t blkno;
511 int size;
512 struct ucred *cred;
513 struct buf **bpp;
514 {
515 register struct buf *bp;
516
517 /* Get buffer for block. */
518 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
519
520 /* Wait for the read to complete, and return result. */
521 return (biowait(bp));
522 }
523
524 /*
525 * Read-ahead multiple disk blocks. The first is sync, the rest async.
526 * Trivial modification to the breada algorithm presented in Bach (p.55).
527 */
528 int
529 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
530 struct vnode *vp;
531 daddr_t blkno; int size;
532 daddr_t rablks[]; int rasizes[];
533 int nrablks;
534 struct ucred *cred;
535 struct buf **bpp;
536 {
537 register struct buf *bp;
538 int i;
539
540 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
541
542 /*
543 * For each of the read-ahead blocks, start a read, if necessary.
544 */
545 for (i = 0; i < nrablks; i++) {
546 /* If it's in the cache, just go on to next one. */
547 if (incore(vp, rablks[i]))
548 continue;
549
550 /* Get a buffer for the read-ahead block */
551 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
552 }
553
554 /* Otherwise, we had to start a read for it; wait until it's valid. */
555 return (biowait(bp));
556 }
557
558 /*
559 * Read with single-block read-ahead. Defined in Bach (p.55), but
560 * implemented as a call to breadn().
561 * XXX for compatibility with old file systems.
562 */
563 int
564 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
565 struct vnode *vp;
566 daddr_t blkno; int size;
567 daddr_t rablkno; int rabsize;
568 struct ucred *cred;
569 struct buf **bpp;
570 {
571
572 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
573 }
574
575 /*
576 * Block write. Described in Bach (p.56)
577 */
578 int
579 bwrite(bp)
580 struct buf *bp;
581 {
582 int rv, sync, wasdelayed;
583 struct proc *p = current_proc();
584 struct vnode *vp = bp->b_vp;
585
586 /* Remember buffer type, to switch on it later. */
587 sync = !ISSET(bp->b_flags, B_ASYNC);
588 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
589 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
590 if (wasdelayed) {
591 nbdwrite--;
592 wakeup((caddr_t)&nbdwrite);
593 }
594
595 if (!sync) {
596 /*
597 * If not synchronous, pay for the I/O operation and make
598 * sure the buf is on the correct vnode queue. We have
599 * to do this now, because if we don't, the vnode may not
600 * be properly notified that its I/O has completed.
601 */
602 if (wasdelayed)
603 reassignbuf(bp, vp);
604 else
605 if (p && p->p_stats)
606 p->p_stats->p_ru.ru_oublock++; /* XXX */
607 }
608
609 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
610
611 /* Initiate disk write. Make sure the appropriate party is charged. */
612 SET(bp->b_flags, B_WRITEINPROG);
613 vp->v_numoutput++;
614
615 VOP_STRATEGY(bp);
616
617 if (sync) {
618 /*
619 * If I/O was synchronous, wait for it to complete.
620 */
621 rv = biowait(bp);
622
623 /*
624 * Pay for the I/O operation, if it's not been paid for, and
625 * make sure it's on the correct vnode queue. (async operatings
626 * were payed for above.)
627 */
628 if (wasdelayed)
629 reassignbuf(bp, vp);
630 else
631 if (p && p->p_stats)
632 p->p_stats->p_ru.ru_oublock++; /* XXX */
633
634 /* Release the buffer. */
635 // XXXdbg - only if the unused bit is set
636 if (!ISSET(bp->b_flags, B_NORELSE)) {
637 brelse(bp);
638 } else {
639 CLR(bp->b_flags, B_NORELSE);
640 }
641
642 return (rv);
643 } else {
644 return (0);
645 }
646 }
647
648 int
649 vn_bwrite(ap)
650 struct vop_bwrite_args *ap;
651 {
652 return (bwrite(ap->a_bp));
653 }
654
655 /*
656 * Delayed write.
657 *
658 * The buffer is marked dirty, but is not queued for I/O.
659 * This routine should be used when the buffer is expected
660 * to be modified again soon, typically a small write that
661 * partially fills a buffer.
662 *
663 * NB: magnetic tapes cannot be delayed; they must be
664 * written in the order that the writes are requested.
665 *
666 * Described in Leffler, et al. (pp. 208-213).
667 *
668 * Note: With the abilitty to allocate additional buffer
669 * headers, we can get in to the situation where "too" many
670 * bdwrite()s can create situation where the kernel can create
671 * buffers faster than the disks can service. Doing a bawrite() in
672 * cases were we have "too many" outstanding bdwrite()s avoids that.
673 */
674 __private_extern__ int
675 bdwrite_internal(bp, return_error)
676 struct buf *bp;
677 int return_error;
678 {
679 struct proc *p = current_proc();
680 struct vnode *vp = bp->b_vp;
681
682 /*
683 * If the block hasn't been seen before:
684 * (1) Mark it as having been seen,
685 * (2) Charge for the write.
686 * (3) Make sure it's on its vnode's correct block list,
687 */
688 if (!ISSET(bp->b_flags, B_DELWRI)) {
689 SET(bp->b_flags, B_DELWRI);
690 if (p && p->p_stats)
691 p->p_stats->p_ru.ru_oublock++; /* XXX */
692 nbdwrite ++;
693 reassignbuf(bp, vp);
694 }
695
696 /* If this is a tape block, write it the block now. */
697 if (ISSET(bp->b_flags, B_TAPE)) {
698 /* bwrite(bp); */
699 VOP_BWRITE(bp);
700 return (0);
701 }
702
703 /*
704 * If the vnode has "too many" write operations in progress
705 * wait for them to finish the IO
706 */
707 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
708 vp->v_flag |= VTHROTTLED;
709 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
710 }
711
712 /*
713 * If we have too many delayed write buffers,
714 * more than we can "safely" handle, just fall back to
715 * doing the async write
716 */
717 if (nbdwrite < 0)
718 panic("bdwrite: Negative nbdwrite");
719
720 // can't do a bawrite() if the LOCKED bit is set because the
721 // buffer is part of a transaction and can't go to disk until
722 // the LOCKED bit is cleared.
723 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
724 if (return_error)
725 return (EAGAIN);
726 else
727 bawrite(bp);
728 return (0);
729 }
730
731 /* Otherwise, the "write" is done, so mark and release the buffer. */
732 SET(bp->b_flags, B_DONE);
733 brelse(bp);
734 return (0);
735 }
736
/*
 * Delayed write of a buffer: bdwrite_internal() with return_error
 * set to 0; its return value is discarded.
 */
void
bdwrite(bp)
	struct buf *bp;
{
	const int return_error = 0;

	(void) bdwrite_internal(bp, return_error);
}
743
744
745 /*
746 * Asynchronous block write; just an asynchronous bwrite().
747 *
748 * Note: With the abilitty to allocate additional buffer
749 * headers, we can get in to the situation where "too" many
750 * bawrite()s can create situation where the kernel can create
751 * buffers faster than the disks can service.
752 * We limit the number of "in flight" writes a vnode can have to
753 * avoid this.
754 */
755 static int
756 bawrite_internal(bp, throttle)
757 struct buf *bp;
758 int throttle;
759 {
760 struct vnode *vp = bp->b_vp;
761
762 if (vp) {
763 /*
764 * If the vnode has "too many" write operations in progress
765 * wait for them to finish the IO
766 */
767 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
768 if (throttle) {
769 vp->v_flag |= VTHROTTLED;
770 (void)tsleep((caddr_t)&vp->v_numoutput,
771 PRIBIO + 1, "bawrite", 0);
772 } else
773 return (EWOULDBLOCK);
774 }
775 }
776
777 SET(bp->b_flags, B_ASYNC);
778 VOP_BWRITE(bp);
779 return (0);
780 }
781
/*
 * Asynchronous write of a buffer: bawrite_internal() with throttling
 * enabled; its return value is discarded.
 */
void
bawrite(bp)
	struct buf *bp;
{
	const int throttle = 1;

	(void) bawrite_internal(bp, throttle);
}
788
/*
 * bwillwrite:
 *
 *	Intended to be called prior to the locking of any vnodes when
 *	a write is expected, so that the caller can be blocked before
 *	the buffer cache is starved by too many dirty buffers.  Blocking
 *	before any vnode is locked is meant to avoid the situation where
 *	a locked vnode prevents the system daemons from flushing related
 *	buffers.
 *
 *	Currently a stub that does nothing.
 */

void
bwillwrite(void)
{
	/* XXX To be implemented later */
	return;
}
804
805 /*
806 * Release a buffer on to the free lists.
807 * Described in Bach (p. 46).
808 */
809 void
810 brelse(bp)
811 struct buf *bp;
812 {
813 struct bqueues *bufq;
814 int s;
815 long whichq;
816
817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
818 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
819 bp->b_flags, 0);
820
821 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
822
823 // if we're invalidating a buffer that has the B_CALL bit
824 // set then call the b_iodone function so it gets cleaned
825 // up properly.
826 //
827 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
828 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
829 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
830 }
831 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
832 void (*iodone_func)(struct buf *) = bp->b_iodone;
833
834 CLR(bp->b_flags, B_CALL); /* but note callout done */
835 bp->b_iodone = NULL;
836
837 if (iodone_func == NULL) {
838 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
839 }
840 (*iodone_func)(bp);
841 }
842 }
843
844 /* IO is done. Cleanup the UPL state */
845 if (!ISSET(bp->b_flags, B_META)
846 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
847 kern_return_t kret;
848 upl_t upl;
849 int upl_flags;
850
851 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
852 if ( !ISSET(bp->b_flags, B_INVAL)) {
853 kret = ubc_create_upl(bp->b_vp,
854 ubc_blktooff(bp->b_vp, bp->b_lblkno),
855 bp->b_bufsize,
856 &upl,
857 NULL,
858 UPL_PRECIOUS);
859 if (kret != KERN_SUCCESS)
860 panic("brelse: Failed to get pagelists");
861 #ifdef UBC_DEBUG
862 upl_ubc_alias_set(upl, bp, 5);
863 #endif /* UBC_DEBUG */
864 } else
865 upl = (upl_t) 0;
866 } else {
867 upl = bp->b_pagelist;
868 kret = ubc_upl_unmap(upl);
869
870 if (kret != KERN_SUCCESS)
871 panic("kernel_upl_unmap failed");
872 bp->b_data = 0;
873 }
874 if (upl) {
875 if (bp->b_flags & (B_ERROR | B_INVAL)) {
876 if (bp->b_flags & (B_READ | B_INVAL))
877 upl_flags = UPL_ABORT_DUMP_PAGES;
878 else
879 upl_flags = 0;
880 ubc_upl_abort(upl, upl_flags);
881 } else {
882 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
883 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
884 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
885 upl_flags = UPL_COMMIT_SET_DIRTY ;
886 else
887 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
888 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
889 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
890 }
891 s = splbio();
892 CLR(bp->b_flags, B_PAGELIST);
893 bp->b_pagelist = 0;
894 splx(s);
895 }
896 } else {
897 if(ISSET(bp->b_flags, B_PAGELIST))
898 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
899 }
900
901 /* Wake up any processes waiting for any buffer to become free. */
902 if (needbuffer) {
903 needbuffer = 0;
904 wakeup(&needbuffer);
905 }
906
907 /* Wake up any proceeses waiting for _this_ buffer to become free. */
908 if (ISSET(bp->b_flags, B_WANTED)) {
909 CLR(bp->b_flags, B_WANTED);
910 wakeup(bp);
911 }
912
913 /* Block disk interrupts. */
914 s = splbio();
915
916 /*
917 * Determine which queue the buffer should be on, then put it there.
918 */
919
920 /* If it's locked, don't report an error; try again later. */
921 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
922 CLR(bp->b_flags, B_ERROR);
923
924 /* If it's not cacheable, or an error, mark it invalid. */
925 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
926 SET(bp->b_flags, B_INVAL);
927
928 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
929 /*
930 * If it's invalid or empty, dissociate it from its vnode
931 * and put on the head of the appropriate queue.
932 */
933 if (bp->b_vp)
934 brelvp(bp);
935 if (ISSET(bp->b_flags, B_DELWRI)) {
936 CLR(bp->b_flags, B_DELWRI);
937 nbdwrite--;
938 wakeup((caddr_t)&nbdwrite);
939 }
940 if (bp->b_bufsize <= 0)
941 whichq = BQ_EMPTY; /* no data */
942 else if (ISSET(bp->b_flags, B_META))
943 whichq = BQ_META; /* meta-data */
944 else
945 whichq = BQ_AGE; /* invalid data */
946
947 bufq = &bufqueues[whichq];
948 binsheadfree(bp, bufq, whichq);
949 } else {
950 /*
951 * It has valid data. Put it on the end of the appropriate
952 * queue, so that it'll stick around for as long as possible.
953 */
954 if (ISSET(bp->b_flags, B_LOCKED))
955 whichq = BQ_LOCKED; /* locked in core */
956 else if (ISSET(bp->b_flags, B_META))
957 whichq = BQ_META; /* meta-data */
958 else if (ISSET(bp->b_flags, B_AGE))
959 whichq = BQ_AGE; /* stale but valid data */
960 else
961 whichq = BQ_LRU; /* valid data */
962
963 bufq = &bufqueues[whichq];
964 binstailfree(bp, bufq, whichq);
965 }
966
967 /* Unlock the buffer. */
968 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
969
970 /* Allow disk interrupts. */
971 splx(s);
972
973 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
974 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
975 }
976
977 /*
978 * Determine if a block is in the cache.
979 * Just look on what would be its hash chain. If it's there, return
980 * a pointer to it, unless it's marked invalid. If it's marked invalid,
981 * we normally don't return the buffer, unless the caller explicitly
982 * wants us to.
983 */
984 struct buf *
985 incore(vp, blkno)
986 struct vnode *vp;
987 daddr_t blkno;
988 {
989 struct buf *bp;
990
991 bp = BUFHASH(vp, blkno)->lh_first;
992
993 /* Search hash chain */
994 for (; bp != NULL; bp = bp->b_hash.le_next) {
995 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
996 !ISSET(bp->b_flags, B_INVAL))
997 return (bp);
998 }
999
1000 return (0);
1001 }
1002
1003
1004 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1005 /*
1006 * Get a block of requested size that is associated with
1007 * a given vnode and block offset. If it is found in the
1008 * block cache, mark it as having been found, make it busy
1009 * and return it. Otherwise, return an empty block of the
1010 * correct size. It is up to the caller to insure that the
1011 * cached blocks be of the correct size.
1012 */
1013 struct buf *
1014 getblk(vp, blkno, size, slpflag, slptimeo, operation)
1015 register struct vnode *vp;
1016 daddr_t blkno;
1017 int size, slpflag, slptimeo, operation;
1018 {
1019 struct buf *bp;
1020 int s, err;
1021 upl_t upl;
1022 upl_page_info_t *pl;
1023 kern_return_t kret;
1024 int error=0;
1025 int pagedirty = 0;
1026
1027 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1028 blkno * PAGE_SIZE, size, operation, 0, 0);
1029 start:
1030
1031 s = splbio();
1032 if ((bp = incore(vp, blkno))) {
1033 /* Found in the Buffer Cache */
1034 if (ISSET(bp->b_flags, B_BUSY)) {
1035 /* but is busy */
1036 switch (operation) {
1037 case BLK_READ:
1038 case BLK_WRITE:
1039 case BLK_META:
1040 SET(bp->b_flags, B_WANTED);
1041 bufstats.bufs_busyincore++;
1042 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1043 slptimeo);
1044 splx(s);
1045 /*
1046 * Callers who call with PCATCH or timeout are
1047 * willing to deal with the NULL pointer
1048 */
1049 if (err && ((slpflag & PCATCH) ||
1050 ((err == EWOULDBLOCK) && slptimeo)))
1051 return (NULL);
1052 goto start;
1053 /*NOTREACHED*/
1054 break;
1055
1056 case BLK_PAGEIN:
1057 /* pagein operation must not use getblk */
1058 panic("getblk: pagein for incore busy buffer");
1059 splx(s);
1060 /*NOTREACHED*/
1061 break;
1062
1063 case BLK_PAGEOUT:
1064 /* pageout operation must not use getblk */
1065 panic("getblk: pageout for incore busy buffer");
1066 splx(s);
1067 /*NOTREACHED*/
1068 break;
1069
1070 default:
1071 panic("getblk: %d unknown operation 1", operation);
1072 /*NOTREACHED*/
1073 break;
1074 }
1075 } else {
1076 /* not busy */
1077 SET(bp->b_flags, (B_BUSY | B_CACHE));
1078 bremfree(bp);
1079 bufstats.bufs_incore++;
1080 splx(s);
1081
1082 allocbuf(bp, size);
1083 if (ISSET(bp->b_flags, B_PAGELIST))
1084 panic("pagelist buffer is not busy");
1085
1086 switch (operation) {
1087 case BLK_READ:
1088 case BLK_WRITE:
1089 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1090 kret = ubc_create_upl(vp,
1091 ubc_blktooff(vp, bp->b_lblkno),
1092 bp->b_bufsize,
1093 &upl,
1094 &pl,
1095 UPL_PRECIOUS);
1096 if (kret != KERN_SUCCESS)
1097 panic("Failed to get pagelists");
1098
1099 SET(bp->b_flags, B_PAGELIST);
1100 bp->b_pagelist = upl;
1101
1102 if (!upl_valid_page(pl, 0)) {
1103 if (vp->v_tag != VT_NFS)
1104 panic("getblk: incore buffer without valid page");
1105 CLR(bp->b_flags, B_CACHE);
1106 }
1107
1108 if (upl_dirty_page(pl, 0))
1109 SET(bp->b_flags, B_WASDIRTY);
1110 else
1111 CLR(bp->b_flags, B_WASDIRTY);
1112
1113 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1114 if (kret != KERN_SUCCESS)
1115 panic("getblk: ubc_upl_map() failed with (%d)",
1116 kret);
1117 if (bp->b_data == 0)
1118 panic("ubc_upl_map mapped 0");
1119 }
1120 break;
1121
1122 case BLK_META:
1123 /*
1124 * VM is not involved in IO for the meta data
1125 * buffer already has valid data
1126 */
1127 if(bp->b_data == 0)
1128 panic("bp->b_data null incore buf=%x", bp);
1129 break;
1130
1131 case BLK_PAGEIN:
1132 case BLK_PAGEOUT:
1133 panic("getblk: paging operation 1");
1134 break;
1135
1136 default:
1137 panic("getblk: %d unknown operation 2", operation);
1138 /*NOTREACHED*/
1139 break;
1140 }
1141 }
1142 } else { /* not incore() */
1143 int queue = BQ_EMPTY; /* Start with no preference */
1144 splx(s);
1145
1146 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1147 !(UBCINFOEXISTS(vp))) {
1148 operation = BLK_META;
1149 }
1150 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1151 goto start;
1152 if (incore(vp, blkno)) {
1153 SET(bp->b_flags, B_INVAL);
1154 binshash(bp, &invalhash);
1155 brelse(bp);
1156 goto start;
1157 }
1158 /*
1159 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1160 * CALLED! BE CAREFUL.
1161 */
1162
1163 /*
1164 * if it is meta, the queue may be set to other
1165 * type so reset as well as mark it to be B_META
1166 * so that when buffer is released it will goto META queue
1167 * Also, if the vnode is not VREG, then it is META
1168 */
1169 if (operation == BLK_META) {
1170 SET(bp->b_flags, B_META);
1171 queue = BQ_META;
1172 }
1173
1174 bp->b_blkno = bp->b_lblkno = blkno;
1175 bp->b_vp = vp;
1176
1177 /*
1178 * Insert in the hash so that incore() can find it
1179 */
1180 binshash(bp, BUFHASH(vp, blkno));
1181
1182 s = splbio();
1183 bgetvp(vp, bp);
1184 splx(s);
1185
1186 allocbuf(bp, size);
1187
1188 switch (operation) {
1189 case BLK_META:
1190 /* buffer data is invalid */
1191
1192 if(bp->b_data == 0)
1193 panic("bp->b_data is null %x",bp);
1194
1195 bufstats.bufs_miss++;
1196
1197 /* wakeup the buffer */
1198 CLR(bp->b_flags, B_WANTED);
1199 wakeup(bp);
1200 break;
1201
1202 case BLK_READ:
1203 case BLK_WRITE:
1204
1205 if (ISSET(bp->b_flags, B_PAGELIST))
1206 panic("B_PAGELIST in bp=%x",bp);
1207
1208 kret = ubc_create_upl(vp,
1209 ubc_blktooff(vp, blkno),
1210 bp->b_bufsize,
1211 &upl,
1212 &pl,
1213 UPL_PRECIOUS);
1214 if (kret != KERN_SUCCESS)
1215 panic("Failed to get pagelists");
1216
1217 #ifdef UBC_DEBUG
1218 upl_ubc_alias_set(upl, bp, 4);
1219 #endif /* UBC_DEBUG */
1220 bp->b_pagelist = upl;
1221
1222 SET(bp->b_flags, B_PAGELIST);
1223
1224 if (upl_valid_page(pl, 0)) {
1225 SET(bp->b_flags, B_CACHE | B_DONE);
1226 bufstats.bufs_vmhits++;
1227
1228 pagedirty = upl_dirty_page(pl, 0);
1229
1230 if (pagedirty)
1231 SET(bp->b_flags, B_WASDIRTY);
1232
1233 if (vp->v_tag == VT_NFS) {
1234 off_t f_offset;
1235 int valid_size;
1236
1237 bp->b_validoff = 0;
1238 bp->b_dirtyoff = 0;
1239
1240 f_offset = ubc_blktooff(vp, blkno);
1241
1242 if (f_offset > vp->v_ubcinfo->ui_size) {
1243 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1244 bp->b_validend = 0;
1245 bp->b_dirtyend = 0;
1246 } else {
1247 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1248 bp->b_validend = valid_size;
1249
1250 if (pagedirty)
1251 bp->b_dirtyend = valid_size;
1252 else
1253 bp->b_dirtyend = 0;
1254
1255 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1256 bp->b_validend, bp->b_dirtyend,
1257 (int)vp->v_ubcinfo->ui_size, 0, 0);
1258 }
1259 } else {
1260 bp->b_validoff = 0;
1261 bp->b_dirtyoff = 0;
1262
1263 if (pagedirty) {
1264 /* page is dirty */
1265 bp->b_validend = bp->b_bcount;
1266 bp->b_dirtyend = bp->b_bcount;
1267 } else {
1268 /* page is clean */
1269 bp->b_validend = bp->b_bcount;
1270 bp->b_dirtyend = 0;
1271 }
1272 }
1273 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1274 if(error) {
1275 panic("getblk: VOP_BMAP failed");
1276 /*NOTREACHED*/
1277 /*
1278 * XXX: We probably should invalidate the VM Page
1279 */
1280 bp->b_error = error;
1281 SET(bp->b_flags, (B_ERROR | B_INVAL));
1282 /* undo B_DONE that was set before upl_commit() */
1283 CLR(bp->b_flags, B_DONE);
1284 brelse(bp);
1285 return (0);
1286 }
1287 } else {
1288 bufstats.bufs_miss++;
1289 }
1290 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1291 if (kret != KERN_SUCCESS) {
1292 panic("getblk: ubc_upl_map() "
1293 "failed with (%d)", kret);
1294 }
1295 if (bp->b_data == 0)
1296 panic("kernel_upl_map mapped 0");
1297
1298 break;
1299
1300 case BLK_PAGEIN:
1301 case BLK_PAGEOUT:
1302 panic("getblk: paging operation 2");
1303 break;
1304 default:
1305 panic("getblk: %d unknown operation 3", operation);
1306 /*NOTREACHED*/
1307 break;
1308 }
1309 }
1310
1311 if (bp->b_data == NULL)
1312 panic("getblk: bp->b_addr is null");
1313
1314 if (bp->b_bufsize & 0xfff) {
1315 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1316 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1317 }
1318
1319 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1320 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1321
1322 return (bp);
1323 }
1324
1325 /*
1326 * Get an empty, disassociated buffer of given size.
1327 */
1328 struct buf *
1329 geteblk(size)
1330 int size;
1331 {
1332 struct buf *bp;
1333 int queue = BQ_EMPTY;
1334
1335 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1336 ;
1337 SET(bp->b_flags, (B_META|B_INVAL));
1338
1339 #if DIAGNOSTIC
1340 assert(queue == BQ_EMPTY);
1341 #endif /* DIAGNOSTIC */
1342 /* XXX need to implement logic to deal with other queues */
1343
1344 binshash(bp, &invalhash);
1345 allocbuf(bp, size);
1346 bufstats.bufs_eblk++;
1347
1348 return (bp);
1349 }
1350
1351 /*
1352 * Zones for the meta data buffers
1353 */
1354
1355 #define MINMETA 512
1356 #define MAXMETA 4096
1357
1358 struct meta_zone_entry {
1359 zone_t mz_zone;
1360 vm_size_t mz_size;
1361 vm_size_t mz_max;
1362 char *mz_name;
1363 };
1364
1365 struct meta_zone_entry meta_zones[] = {
1366 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1367 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1368 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1369 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1370 {NULL, 0, 0, "" } /* End */
1371 };
1372
1373 /*
1374 * Initialize the meta data zones
1375 */
1376 static void
1377 bufzoneinit(void)
1378 {
1379 int i;
1380
1381 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1382 meta_zones[i].mz_zone =
1383 zinit(meta_zones[i].mz_size,
1384 meta_zones[i].mz_max,
1385 PAGE_SIZE,
1386 meta_zones[i].mz_name);
1387 }
1388 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1389 }
1390
1391 static __inline__ zone_t
1392 getbufzone(size_t size)
1393 {
1394 int i;
1395
1396 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1397 panic("getbufzone: incorect size = %d", size);
1398
1399 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1400 if (meta_zones[i].mz_size >= size)
1401 break;
1402 }
1403
1404 return (meta_zones[i].mz_zone);
1405 }
1406
1407 /*
1408 * With UBC, there is no need to expand / shrink the file data
1409 * buffer. The VM uses the same pages, hence no waste.
1410 * All the file data buffers can have one size.
1411 * In fact expand / shrink would be an expensive operation.
1412 *
1413 * Only exception to this is meta-data buffers. Most of the
1414 * meta data operations are smaller than PAGE_SIZE. Having the
1415 * meta-data buffers grow and shrink as needed, optimizes use
1416 * of the kernel wired memory.
1417 */
1418
1419 int
1420 allocbuf(bp, size)
1421 struct buf *bp;
1422 int size;
1423 {
1424 vm_size_t desired_size;
1425
1426 desired_size = roundup(size, CLBYTES);
1427
1428 if(desired_size < PAGE_SIZE)
1429 desired_size = PAGE_SIZE;
1430 if (desired_size > MAXBSIZE)
1431 panic("allocbuf: buffer larger than MAXBSIZE requested");
1432
1433 if (ISSET(bp->b_flags, B_META)) {
1434 kern_return_t kret;
1435 zone_t zprev, z;
1436 size_t nsize = roundup(size, MINMETA);
1437
1438 if (bp->b_data) {
1439 vm_offset_t elem = (vm_offset_t)bp->b_data;
1440
1441 if (ISSET(bp->b_flags, B_ZALLOC))
1442 if (bp->b_bufsize <= MAXMETA) {
1443 if (bp->b_bufsize < nsize) {
1444 /* reallocate to a bigger size */
1445 desired_size = nsize;
1446
1447 zprev = getbufzone(bp->b_bufsize);
1448 z = getbufzone(nsize);
1449 bp->b_data = (caddr_t)zalloc(z);
1450 if(bp->b_data == 0)
1451 panic("allocbuf: zalloc() returned NULL");
1452 bcopy(elem, bp->b_data, bp->b_bufsize);
1453 zfree(zprev, elem);
1454 } else {
1455 desired_size = bp->b_bufsize;
1456 }
1457 } else
1458 panic("allocbuf: B_ZALLOC set incorrectly");
1459 else
1460 if (bp->b_bufsize < desired_size) {
1461 /* reallocate to a bigger size */
1462 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1463 if (kret != KERN_SUCCESS)
1464 panic("allocbuf: kmem_alloc() returned %d", kret);
1465 if(bp->b_data == 0)
1466 panic("allocbuf: null b_data");
1467 bcopy(elem, bp->b_data, bp->b_bufsize);
1468 kmem_free(kernel_map, elem, bp->b_bufsize);
1469 } else {
1470 desired_size = bp->b_bufsize;
1471 }
1472 } else {
1473 /* new allocation */
1474 if (nsize <= MAXMETA) {
1475 desired_size = nsize;
1476 z = getbufzone(nsize);
1477 bp->b_data = (caddr_t)zalloc(z);
1478 if(bp->b_data == 0)
1479 panic("allocbuf: zalloc() returned NULL 2");
1480 SET(bp->b_flags, B_ZALLOC);
1481 } else {
1482 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1483 if (kret != KERN_SUCCESS)
1484 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1485 if(bp->b_data == 0)
1486 panic("allocbuf: null b_data 2");
1487 }
1488 }
1489 }
1490
1491 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1492 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1493
1494 bp->b_bufsize = desired_size;
1495 bp->b_bcount = size;
1496 return (0);
1497 }
1498
1499 /*
1500 * Get a new buffer from one of the free lists.
1501 *
1502 * Request for a queue is passes in. The queue from which the buffer was taken
1503 * from is returned. Out of range queue requests get BQ_EMPTY. Request for
1504 * BQUEUE means no preference. Use heuristics in that case.
1505 * Heuristics is as follows:
1506 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1507 * If none available block till one is made available.
1508 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
1509 * Pick the most stale buffer.
1510 * If found buffer was marked delayed write, start the async. write
1511 * and restart the search.
1512 * Initialize the fields and disassociate the buffer from the vnode.
1513 * Remove the buffer from the hash. Return the buffer and the queue
1514 * on which it was found.
1515 */
1516
1517 static struct buf *
1518 getnewbuf(slpflag, slptimeo, queue)
1519 int slpflag, slptimeo;
1520 int *queue;
1521 {
1522 register struct buf *bp;
1523 register struct buf *lru_bp;
1524 register struct buf *age_bp;
1525 register struct buf *meta_bp;
1526 register int age_time, lru_time, bp_time, meta_time;
1527 int s;
1528 int req = *queue; /* save it for restarts */
1529
1530 start:
1531 s = splbio();
1532
1533 /* invalid request gets empty queue */
1534 if ((*queue > BQUEUES) || (*queue < 0)
1535 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1536 *queue = BQ_EMPTY;
1537
1538 /* (*queue == BQUEUES) means no preference */
1539 if (*queue != BQUEUES) {
1540 /* Try for the requested queue first */
1541 bp = bufqueues[*queue].tqh_first;
1542 if (bp)
1543 goto found;
1544 }
1545
1546 /* Unable to use requested queue */
1547 age_bp = bufqueues[BQ_AGE].tqh_first;
1548 lru_bp = bufqueues[BQ_LRU].tqh_first;
1549 meta_bp = bufqueues[BQ_META].tqh_first;
1550
1551 if (!age_bp && !lru_bp && !meta_bp) {
1552 /*
1553 * Unavailble on AGE or LRU or META queues
1554 * Try the empty list first
1555 */
1556 bp = bufqueues[BQ_EMPTY].tqh_first;
1557 if (bp) {
1558 *queue = BQ_EMPTY;
1559 goto found;
1560 }
1561
1562 /* Create a new temparory buffer header */
1563 bp = (struct buf *)zalloc(buf_hdr_zone);
1564
1565 if (bp) {
1566 bufhdrinit(bp);
1567 BLISTNONE(bp);
1568 binshash(bp, &invalhash);
1569 SET(bp->b_flags, B_HDRALLOC);
1570 *queue = BQ_EMPTY;
1571 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1572 buf_hdr_count++;
1573 goto found;
1574 }
1575
1576 /* Log this error condition */
1577 printf("getnewbuf: No useful buffers");
1578
1579 /* wait for a free buffer of any kind */
1580 needbuffer = 1;
1581 bufstats.bufs_sleeps++;
1582 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1583 splx(s);
1584 return (0);
1585 }
1586
1587 /* Buffer available either on AGE or LRU or META */
1588 bp = NULL;
1589 *queue = -1;
1590
1591 /* Buffer available either on AGE or LRU */
1592 if (!age_bp) {
1593 bp = lru_bp;
1594 *queue = BQ_LRU;
1595 } else if (!lru_bp) {
1596 bp = age_bp;
1597 *queue = BQ_AGE;
1598 } else { /* buffer available on both AGE and LRU */
1599 age_time = time.tv_sec - age_bp->b_timestamp;
1600 lru_time = time.tv_sec - lru_bp->b_timestamp;
1601 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1602 bp = age_bp;
1603 *queue = BQ_AGE;
1604 /*
1605 * we should probably re-timestamp eveything in the
1606 * queues at this point with the current time
1607 */
1608 } else {
1609 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1610 bp = lru_bp;
1611 *queue = BQ_LRU;
1612 } else {
1613 bp = age_bp;
1614 *queue = BQ_AGE;
1615 }
1616 }
1617 }
1618
1619 if (!bp) { /* Neither on AGE nor on LRU */
1620 bp = meta_bp;
1621 *queue = BQ_META;
1622 } else if (meta_bp) {
1623 bp_time = time.tv_sec - bp->b_timestamp;
1624 meta_time = time.tv_sec - meta_bp->b_timestamp;
1625
1626 if (!(bp_time < 0) && !(meta_time < 0)) {
1627 /* time not set backwards */
1628 int bp_is_stale;
1629 bp_is_stale = (*queue == BQ_LRU) ?
1630 lru_is_stale : age_is_stale;
1631
1632 if ((meta_time >= meta_is_stale) &&
1633 (bp_time < bp_is_stale)) {
1634 bp = meta_bp;
1635 *queue = BQ_META;
1636 }
1637 }
1638 }
1639
1640 if (bp == NULL)
1641 panic("getnewbuf: null bp");
1642
1643 found:
1644 if (ISSET(bp->b_flags, B_LOCKED)) {
1645 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1646 }
1647
1648 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1649 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1650
1651 if(ISSET(bp->b_flags, B_BUSY))
1652 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1653
1654 /* Clean it */
1655 if (bcleanbuf(bp)) {
1656 /* bawrite() issued, buffer not ready */
1657 splx(s);
1658 *queue = req;
1659 goto start;
1660 }
1661 splx(s);
1662 return (bp);
1663 }
1664
1665 #include <mach/mach_types.h>
1666 #include <mach/memory_object_types.h>
1667 #include <kern/sched_prim.h>
1668
1669 /*
1670 * Clean a buffer.
1671 * Returns 0 is buffer is ready to use,
1672 * Returns 1 if issued a bawrite() to indicate
1673 * that the buffer is not ready.
1674 */
1675 static int
1676 bcleanbuf(struct buf *bp)
1677 {
1678 int s;
1679 struct ucred *cred;
1680 int hdralloc = 0;
1681
1682 s = splbio();
1683
1684 /* Remove from the queue */
1685 bremfree(bp);
1686
1687 /* Buffer is no longer on free lists. */
1688 SET(bp->b_flags, B_BUSY);
1689
1690 /* Check whether the buffer header was "allocated" */
1691 if (ISSET(bp->b_flags, B_HDRALLOC))
1692 hdralloc = 1;
1693
1694 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1695 panic("bcleanbuf: le_prev is deadbeef");
1696
1697 /*
1698 * If buffer was a delayed write, start the IO by queuing
1699 * it on the LAUNDRY queue, and return 1
1700 */
1701 if (ISSET(bp->b_flags, B_DELWRI)) {
1702 splx(s);
1703 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1704 blaundrycnt++;
1705 wakeup(&blaundrycnt);
1706 /* and give it a chance to run */
1707 (void)thread_block(THREAD_CONTINUE_NULL);
1708 return (1);
1709 }
1710
1711 if (bp->b_vp)
1712 brelvp(bp);
1713 bremhash(bp);
1714 BLISTNONE(bp);
1715
1716 splx(s);
1717
1718 if (ISSET(bp->b_flags, B_META)) {
1719 vm_offset_t elem = (vm_offset_t)bp->b_data;
1720 if (elem == 0)
1721 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1722
1723 if (ISSET(bp->b_flags, B_ZALLOC)) {
1724 if (bp->b_bufsize <= MAXMETA) {
1725 zone_t z;
1726
1727 z = getbufzone(bp->b_bufsize);
1728 bp->b_data = (caddr_t)0xdeadbeef;
1729 zfree(z, elem);
1730 CLR(bp->b_flags, B_ZALLOC);
1731 } else
1732 panic("bcleanbuf: B_ZALLOC set incorrectly");
1733 } else {
1734 bp->b_data = (caddr_t)0xdeadbeef;
1735 kmem_free(kernel_map, elem, bp->b_bufsize);
1736 }
1737 }
1738
1739 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1740
1741 /* disassociate us from our vnode, if we had one... */
1742 s = splbio();
1743
1744 /* clear out various other fields */
1745 bp->b_bufsize = 0;
1746 bp->b_data = 0;
1747 bp->b_flags = B_BUSY;
1748 if (hdralloc)
1749 SET(bp->b_flags, B_HDRALLOC);
1750 bp->b_dev = NODEV;
1751 bp->b_blkno = bp->b_lblkno = 0;
1752 bp->b_iodone = 0;
1753 bp->b_error = 0;
1754 bp->b_resid = 0;
1755 bp->b_bcount = 0;
1756 bp->b_dirtyoff = bp->b_dirtyend = 0;
1757 bp->b_validoff = bp->b_validend = 0;
1758
1759 /* nuke any credentials we were holding */
1760 cred = bp->b_rcred;
1761 if (cred != NOCRED) {
1762 bp->b_rcred = NOCRED;
1763 crfree(cred);
1764 }
1765 cred = bp->b_wcred;
1766 if (cred != NOCRED) {
1767 bp->b_wcred = NOCRED;
1768 crfree(cred);
1769 }
1770 splx(s);
1771 return (0);
1772 }
1773
1774
1775 /*
1776 * Wait for operations on the buffer to complete.
1777 * When they do, extract and return the I/O's error value.
1778 */
1779 int
1780 biowait(bp)
1781 struct buf *bp;
1782 {
1783 int s;
1784
1785 s = splbio();
1786 while (!ISSET(bp->b_flags, B_DONE))
1787 tsleep(bp, PRIBIO + 1, "biowait", 0);
1788 splx(s);
1789
1790 /* check for interruption of I/O (e.g. via NFS), then errors. */
1791 if (ISSET(bp->b_flags, B_EINTR)) {
1792 CLR(bp->b_flags, B_EINTR);
1793 return (EINTR);
1794 } else if (ISSET(bp->b_flags, B_ERROR))
1795 return (bp->b_error ? bp->b_error : EIO);
1796 else
1797 return (0);
1798 }
1799
1800 /*
1801 * Mark I/O complete on a buffer.
1802 *
1803 * If a callback has been requested, e.g. the pageout
1804 * daemon, do so. Otherwise, awaken waiting processes.
1805 *
1806 * [ Leffler, et al., says on p.247:
1807 * "This routine wakes up the blocked process, frees the buffer
1808 * for an asynchronous write, or, for a request by the pagedaemon
1809 * process, invokes a procedure specified in the buffer structure" ]
1810 *
1811 * In real life, the pagedaemon (or other system processes) wants
1812 * to do async stuff to, and doesn't want the buffer brelse()'d.
1813 * (for swap pager, that puts swap buffers on the free lists (!!!),
1814 * for the vn device, that puts malloc'd buffers on the free lists!)
1815 */
1816 void
1817 biodone(bp)
1818 struct buf *bp;
1819 {
1820 boolean_t funnel_state;
1821 struct vnode *vp;
1822
1823 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1824
1825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1826 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1827
1828 if (ISSET(bp->b_flags, B_DONE))
1829 panic("biodone already");
1830 SET(bp->b_flags, B_DONE); /* note that it's done */
1831 /*
1832 * I/O was done, so don't believe
1833 * the DIRTY state from VM anymore
1834 */
1835 CLR(bp->b_flags, B_WASDIRTY);
1836
1837 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1838 vwakeup(bp); /* wake up reader */
1839
1840 if (kdebug_enable) {
1841 int code = DKIO_DONE;
1842
1843 if (bp->b_flags & B_READ)
1844 code |= DKIO_READ;
1845 if (bp->b_flags & B_ASYNC)
1846 code |= DKIO_ASYNC;
1847
1848 if (bp->b_flags & B_META)
1849 code |= DKIO_META;
1850 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1851 code |= DKIO_PAGING;
1852
1853 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1854 bp, bp->b_vp, bp->b_resid, bp->b_error, 0);
1855 }
1856
1857 /* Wakeup the throttled write operations as needed */
1858 vp = bp->b_vp;
1859 if (vp
1860 && (vp->v_flag & VTHROTTLED)
1861 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1862 vp->v_flag &= ~VTHROTTLED;
1863 wakeup((caddr_t)&vp->v_numoutput);
1864 }
1865
1866 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1867 void (*iodone_func)(struct buf *) = bp->b_iodone;
1868
1869 CLR(bp->b_flags, B_CALL); /* but note callout done */
1870 bp->b_iodone = NULL;
1871
1872 if (iodone_func == NULL) {
1873 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1874 } else {
1875 (*iodone_func)(bp);
1876 }
1877 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1878 brelse(bp);
1879 else { /* or just wakeup the buffer */
1880 CLR(bp->b_flags, B_WANTED);
1881 wakeup(bp);
1882 }
1883
1884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1885 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1886
1887 thread_funnel_set(kernel_flock, funnel_state);
1888 }
1889
1890 /*
1891 * Return a count of buffers on the "locked" queue.
1892 */
1893 int
1894 count_lock_queue()
1895 {
1896 register struct buf *bp;
1897 register int n = 0;
1898
1899 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1900 bp = bp->b_freelist.tqe_next)
1901 n++;
1902 return (n);
1903 }
1904
1905 /*
1906 * Return a count of 'busy' buffers. Used at the time of shutdown.
1907 */
1908 int
1909 count_busy_buffers()
1910 {
1911 register struct buf *bp;
1912 register int nbusy = 0;
1913
1914 for (bp = &buf[nbuf]; --bp >= buf; )
1915 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1916 nbusy++;
1917 return (nbusy);
1918 }
1919
#if DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 *
 * For every free queue one line is printed: the number of buffers on
 * the queue followed by a "size-count" pair for each buffer size (in
 * CLBYTES steps) that occurs at least once.
 */
void
vfs_bufstats()
{
	int s, q, slot, total;
	register struct buf *bp;
	register struct bqueues *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] =
		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };

	for (q = 0; q < BQUEUES; q++) {
		dp = &bufqueues[q];

		/* start every queue with a clean histogram */
		total = 0;
		for (slot = 0; slot <= MAXBSIZE/CLBYTES; slot++)
			counts[slot] = 0;

		/* tally the queue while it cannot change under us */
		s = splbio();
		bp = dp->tqh_first;
		while (bp) {
			counts[bp->b_bufsize/CLBYTES]++;
			total++;
			bp = bp->b_freelist.tqe_next;
		}
		splx(s);

		printf("%s: total-%d", bname[q], total);
		for (slot = 0; slot <= MAXBSIZE/CLBYTES; slot++) {
			if (counts[slot] != 0)
				printf(", %d-%d", slot * CLBYTES, counts[slot]);
		}
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */
1954
1955 #define NRESERVEDIOBUFS 64
1956
1957 __private_extern__ struct buf *
1958 alloc_io_buf(vp, priv)
1959 struct vnode *vp;
1960 int priv;
1961 {
1962 register struct buf *bp;
1963 int s;
1964
1965 s = splbio();
1966
1967 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1968 need_iobuffer = 1;
1969 bufstats.bufs_iobufsleeps++;
1970 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1971 }
1972
1973 while ((bp = iobufqueue.tqh_first) == NULL) {
1974 need_iobuffer = 1;
1975 bufstats.bufs_iobufsleeps++;
1976 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1977 }
1978
1979 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1980 bp->b_timestamp = 0;
1981
1982 /* clear out various fields */
1983 bp->b_flags = B_BUSY;
1984 bp->b_blkno = bp->b_lblkno = 0;
1985
1986 bp->b_iodone = 0;
1987 bp->b_error = 0;
1988 bp->b_resid = 0;
1989 bp->b_bcount = 0;
1990 bp->b_bufsize = 0;
1991 bp->b_vp = vp;
1992
1993 if (vp->v_type == VBLK || vp->v_type == VCHR)
1994 bp->b_dev = vp->v_rdev;
1995 else
1996 bp->b_dev = NODEV;
1997 bufstats.bufs_iobufinuse++;
1998 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1999 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
2000 splx(s);
2001
2002 return (bp);
2003 }
2004
2005 __private_extern__ void
2006 free_io_buf(bp)
2007 struct buf *bp;
2008 {
2009 int s;
2010
2011 s = splbio();
2012 /* put buffer back on the head of the iobufqueue */
2013 bp->b_vp = NULL;
2014 bp->b_flags = B_INVAL;
2015
2016 binsheadfree(bp, &iobufqueue, -1);
2017
2018 /* Wake up any processes waiting for any buffer to become free. */
2019 if (need_iobuffer) {
2020 need_iobuffer = 0;
2021 wakeup(&need_iobuffer);
2022 }
2023 bufstats.bufs_iobufinuse--;
2024 splx(s);
2025 }
2026
2027 /* disabled for now */
2028
2029 /* XXX move this to a separate file */
2030 /*
2031 * Dynamic Scaling of the Buffer Queues
2032 */
2033
/* count of buffer headers; 64 bits wide */
typedef long long blsize_t;

/* set to (mem_size / PAGE_SIZE) by bufq_balance_thread_init() */
blsize_t MAXNBUF;

/* Global tunable limits */
blsize_t nbufh;			/* number of buffer headers */
blsize_t nbuflow;		/* minimum number of buffer headers required */
blsize_t nbufhigh;		/* maximum number of buffer headers allowed */
blsize_t nbuftarget;		/* preferred number of buffer headers */

/*
 * assertions:
 *
 * 1.	0 < nbuflow <= nbufh <= nbufhigh
 * 2.	nbufhigh <= MAXNBUF
 * 3.	0 < nbuflow <= nbuftarget <= nbufhigh
 * 4.	nbufh can not be set by sysctl().
 */

/* Per queue tunable limits, indexed by queue number */

struct bufqlim {
	blsize_t	bl_nlow;	/* minimum number of buffer headers required */
	blsize_t	bl_num;		/* number of buffer headers on the queue */
	blsize_t	bl_nlhigh;	/* maximum number of buffer headers allowed */
	blsize_t	bl_target;	/* preferred number of buffer headers */
	long	bl_stale;		/* Seconds after which a buffer is considered stale */
} bufqlim[BQUEUES];
2061
2062 /*
2063 * assertions:
2064 *
2065 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2066 * 2. bl_nlhigh <= MAXNBUF
2067 * 3. bufqlim[BQ_META].bl_nlow != 0
2068 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2069 * file system IO operations)
2070 * 5. bl_num can not be set by sysctl().
2071 * 6. bl_nhigh <= nbufhigh
2072 */
2073
/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * The low and high parameters are initialized at compile time,
 * and boot arguments can be used to override them. sysctl()
 * would not change those values. sysctl() can get all the values
 * but can set only target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 * Keep enough bufs on BQ_EMPTY.
 *	getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *	getnewbuf() performs best if a buffer was found there.
 *	Also this minimizes the possibility of starting IO
 *	from getnewbuf(). That's a performance win, too.
 *
 * Localize complex logic [balancing as well as time aging]
 *	to balancebufq().
 *
 * Simplify getnewbuf() logic by elimination of time aging code.
 */
2102
/*
 * Algorithm:
 * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be:	AGE, LRU, META, EMPTY.
 */
2115
/*
 * Sleep channel of the balancing thread; also counts the calls to
 * bufq_balance_thread_init() so that the limits are set up only once.
 */
long bufqscanwait = 0;

/* the balancing thread and its helpers */
static void bufqscan_thread();
static __inline__ int initbufqscan(void);
static __inline__ int nextbufq(int q);
static int balancebufq(int q);
static int btrimempty(int n);
static void buqlimprt(int all);
2124
2125 static void
2126 bufq_balance_thread_init()
2127 {
2128
2129 if (bufqscanwait++ == 0) {
2130
2131 /* Initalize globals */
2132 MAXNBUF = (mem_size / PAGE_SIZE);
2133 nbufh = nbuf;
2134 nbuflow = min(nbufh, 100);
2135 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2136 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2137 nbuftarget = max(nbuflow, nbuftarget);
2138 nbuftarget = min(nbufhigh, nbuftarget);
2139
2140 /*
2141 * Initialize the bufqlim
2142 */
2143
2144 /* LOCKED queue */
2145 bufqlim[BQ_LOCKED].bl_nlow = 0;
2146 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2147 bufqlim[BQ_LOCKED].bl_target = 0;
2148 bufqlim[BQ_LOCKED].bl_stale = 30;
2149
2150 /* LRU queue */
2151 bufqlim[BQ_LRU].bl_nlow = 0;
2152 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2153 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2154 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2155
2156 /* AGE queue */
2157 bufqlim[BQ_AGE].bl_nlow = 0;
2158 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2159 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2160 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2161
2162 /* EMPTY queue */
2163 bufqlim[BQ_EMPTY].bl_nlow = 0;
2164 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2165 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2166 bufqlim[BQ_EMPTY].bl_stale = 600000;
2167
2168 /* META queue */
2169 bufqlim[BQ_META].bl_nlow = 0;
2170 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2171 bufqlim[BQ_META].bl_target = nbuftarget/4;
2172 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2173
2174 /* LAUNDRY queue */
2175 bufqlim[BQ_LOCKED].bl_nlow = 0;
2176 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2177 bufqlim[BQ_LOCKED].bl_target = 0;
2178 bufqlim[BQ_LOCKED].bl_stale = 30;
2179
2180 buqlimprt(1);
2181 }
2182
2183 /* create worker thread */
2184 kernel_thread(kernel_task, bufqscan_thread);
2185 }
2186
2187 /* The workloop for the buffer balancing thread */
2188 static void
2189 bufqscan_thread()
2190 {
2191 boolean_t funnel_state;
2192 int moretodo = 0;
2193
2194 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2195
2196 for(;;) {
2197 do {
2198 int q; /* buffer queue to process */
2199
2200 q = initbufqscan();
2201 for (; q; ) {
2202 moretodo |= balancebufq(q);
2203 q = nextbufq(q);
2204 }
2205 } while (moretodo);
2206
2207 #if DIAGNOSTIC
2208 vfs_bufstats();
2209 buqlimprt(0);
2210 #endif
2211 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2212 moretodo = 0;
2213 }
2214
2215 (void) thread_funnel_set(kernel_flock, FALSE);
2216 }
2217
2218 /* Seed for the buffer queue balancing */
2219 static __inline__ int
2220 initbufqscan()
2221 {
2222 /* Start with AGE queue */
2223 return (BQ_AGE);
2224 }
2225
2226 /* Pick next buffer queue to balance */
2227 static __inline__ int
2228 nextbufq(int q)
2229 {
2230 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2231
2232 q++;
2233 q %= sizeof(order);
2234 return (order[q]);
2235 }
2236
2237 /* function to balance the buffer queues */
2238 static int
2239 balancebufq(int q)
2240 {
2241 int moretodo = 0;
2242 int s = splbio();
2243 int n;
2244
2245 /* reject invalid q */
2246 if ((q < 0) || (q >= BQUEUES))
2247 goto out;
2248
2249 /* LOCKED or LAUNDRY queue MUST not be balanced */
2250 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2251 goto out;
2252
2253 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2254
2255 /* If queue has less than target nothing more to do */
2256 if (n < 0)
2257 goto out;
2258
2259 if ( n > 8 ) {
2260 /* Balance only a small amount (12.5%) at a time */
2261 n >>= 3;
2262 }
2263
2264 /* EMPTY queue needs special handling */
2265 if (q == BQ_EMPTY) {
2266 moretodo |= btrimempty(n);
2267 goto out;
2268 }
2269
2270 for (; n > 0; n--) {
2271 struct buf *bp = bufqueues[q].tqh_first;
2272 if (!bp)
2273 break;
2274
2275 /* check if it's stale */
2276 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2277 if (bcleanbuf(bp)) {
2278 /* bawrite() issued, bp not ready */
2279 moretodo = 1;
2280 } else {
2281 /* release the cleaned buffer to BQ_EMPTY */
2282 SET(bp->b_flags, B_INVAL);
2283 brelse(bp);
2284 }
2285 } else
2286 break;
2287 }
2288
2289 out:
2290 splx(s);
2291 return (moretodo);
2292 }
2293
/*
 * btrimempty: placeholder.  When struct buf are allocated dynamically,
 * this would reclaim up to 'n' struct buf from the empty queue.
 * For now nothing is reclaimed and 0 ("nothing more to do") is returned.
 */
static int
btrimempty(int n)
{
	int moretodo = 0;

	return (moretodo);
}
2304
2305 static __inline__ void
2306 bufqinc(int q)
2307 {
2308 if ((q < 0) || (q >= BQUEUES))
2309 return;
2310
2311 bufqlim[q].bl_num++;
2312 return;
2313 }
2314
2315 static __inline__ void
2316 bufqdec(int q)
2317 {
2318 if ((q < 0) || (q >= BQUEUES))
2319 return;
2320
2321 bufqlim[q].bl_num--;
2322 return;
2323 }
2324
2325 static void
2326 buqlimprt(int all)
2327 {
2328 int i;
2329 static char *bname[BQUEUES] =
2330 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2331
2332 if (all)
2333 for (i = 0; i < BQUEUES; i++) {
2334 printf("%s : ", bname[i]);
2335 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2336 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2337 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2338 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2339 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2340 }
2341 else
2342 for (i = 0; i < BQUEUES; i++) {
2343 printf("%s : ", bname[i]);
2344 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2345 }
2346 }
2347
/*
 * If getnewbuf() were to call bcleanbuf() on the same thread,
 * there would be a potential for stack overrun and deadlocks.
 * So we always hand off the work to a worker thread for completion.
 */
2353
2354 static void
2355 bcleanbuf_thread_init()
2356 {
2357 static void bcleanbuf_thread();
2358
2359 /* create worker thread */
2360 kernel_thread(kernel_task, bcleanbuf_thread);
2361 }
2362
2363 static void
2364 bcleanbuf_thread()
2365 {
2366 boolean_t funnel_state;
2367 struct buf *bp;
2368 int error = 0;
2369 int loopcnt = 0;
2370
2371 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2372
2373 doit:
2374 while (blaundrycnt == 0)
2375 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2376 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2377 /* Remove from the queue */
2378 bremfree(bp);
2379 blaundrycnt--;
2380 /* do the IO */
2381 error = bawrite_internal(bp, 0);
2382 if (error) {
2383 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2384 blaundrycnt++;
2385 if (loopcnt > 10) {
2386 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2387 loopcnt = 0;
2388 } else {
2389 (void)thread_block(THREAD_CONTINUE_NULL);
2390 loopcnt++;
2391 }
2392 }
2393 /* start again */
2394 goto doit;
2395
2396 (void) thread_funnel_set(kernel_flock, funnel_state);
2397 }
2398
2399
2400 static int
2401 bp_cmp(void *a, void *b)
2402 {
2403 struct buf *bp_a = *(struct buf **)a,
2404 *bp_b = *(struct buf **)b;
2405 daddr_t res;
2406
2407 // don't have to worry about negative block
2408 // numbers so this is ok to do.
2409 //
2410 res = (bp_a->b_blkno - bp_b->b_blkno);
2411
2412 return (int)res;
2413 }
2414
2415 #define NFLUSH 32
2416
2417 int
2418 bflushq(int whichq, struct mount *mp)
2419 {
2420 struct buf *bp, *next;
2421 int i, buf_count, s;
2422 int counter=0, total_writes=0;
2423 static struct buf *flush_table[NFLUSH];
2424
2425 if (whichq < 0 || whichq >= BQUEUES) {
2426 return;
2427 }
2428
2429
2430 restart:
2431 bp = TAILQ_FIRST(&bufqueues[whichq]);
2432 for(buf_count=0; bp; bp=next) {
2433 next = bp->b_freelist.tqe_next;
2434
2435 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2436 continue;
2437 }
2438
2439 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2440 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2441 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2442 }
2443
2444 bremfree(bp);
2445 bp->b_flags |= B_BUSY;
2446 flush_table[buf_count] = bp;
2447 buf_count++;
2448 total_writes++;
2449
2450 if (buf_count >= NFLUSH) {
2451 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2452
2453 for(i=0; i < buf_count; i++) {
2454 bawrite(flush_table[i]);
2455 }
2456
2457 goto restart;
2458 }
2459 }
2460 }
2461
2462 if (buf_count > 0) {
2463 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2464 for(i=0; i < buf_count; i++) {
2465 bawrite(flush_table[i]);
2466 }
2467 }
2468
2469 return total_writes;
2470 }