[apple/xnu.git] / bsd / vfs / vfs_bio.c
commit 57c206760d1eb5aa2c5cac31e2cea63915a52fdd
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67 /*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
73
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>
77 #include <sys/buf.h>
78 #include <sys/vnode.h>
79 #include <sys/mount.h>
80 #include <sys/trace.h>
81 #include <sys/malloc.h>
82 #include <sys/resourcevar.h>
83 #include <miscfs/specfs/specdev.h>
84 #include <sys/ubc.h>
85 #include <vm/vm_pageout.h>
86 #if DIAGNOSTIC
87 #include <kern/assert.h>
88 #endif /* DIAGNOSTIC */
89 #include <kern/task.h>
90 #include <kern/zalloc.h>
91
92 #include <sys/kdebug.h>
93 #include <machine/spl.h>
94
95 static __inline__ void bufqinc(int q);
96 static __inline__ void bufqdec(int q);
97
98 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
99 static int bcleanbuf(struct buf *bp);
100 extern void vwakeup();
101
102 extern int niobuf; /* The number of IO buffer headers for cluster IO */
103 int blaundrycnt;
104
105 /* zone allocated buffer headers */
106 static zone_t buf_hdr_zone;
107 static int buf_hdr_count;
108
109 #if TRACE
110 struct proc *traceproc;
111 int tracewhich, tracebuf[TRCSIZ];
112 u_int tracex;
113 char traceflags[TR_NFLAGS];
114 #endif /* TRACE */
115
116 /*
117 * Definitions for the buffer hash lists.
118 */
119 #define BUFHASH(dvp, lbn) \
120 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
121 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
122 u_long bufhash;
123
124 /* Definitions for the buffer stats. */
125 struct bufstats bufstats;
126
127 /* Number of delayed write buffers */
128 int nbdwrite = 0;
129
130 /*
131 * Insq/Remq for the buffer hash lists.
132 */
133 #if 0
134 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
135 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
136 #endif /* 0 */
137
138
139 TAILQ_HEAD(ioqueue, buf) iobufqueue;
140 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
141 static int needbuffer;
142 static int need_iobuffer;
143
144 /*
145 * Insq/Remq for the buffer free lists.
146 */
147 #define binsheadfree(bp, dp, whichq) do { \
148 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
149 bufqinc((whichq)); \
150 (bp)->b_whichq = whichq; \
151 (bp)->b_timestamp = time.tv_sec; \
152 } while (0)
153
154 #define binstailfree(bp, dp, whichq) do { \
155 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
156 bufqinc((whichq)); \
157 (bp)->b_whichq = whichq; \
158 (bp)->b_timestamp = time.tv_sec; \
159 } while (0)
160
161 #define BHASHENTCHECK(bp) \
162 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
163 panic("%x: b_hash.le_prev is not deadbeef", (bp));
164
165 #define BLISTNONE(bp) \
166 (bp)->b_hash.le_next = (struct buf *)0; \
167 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
168
169 /*
170 * Insq/Remq for the vnode usage lists.
171 */
172 #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
173 #define bufremvn(bp) { \
174 LIST_REMOVE(bp, b_vnbufs); \
175 (bp)->b_vnbufs.le_next = NOLIST; \
176 }
177
178 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
179
180 /* number of per vnode, "in flight" buffer writes */
181 #define BUFWRITE_THROTTLE 9
182
183
184 /*
185 * Time in seconds before a buffer on a list is
186 * considered as a stale buffer
187 */
188 #define LRU_IS_STALE 120 /* default value for the LRU */
189 #define AGE_IS_STALE 60 /* default value for the AGE */
190 #define META_IS_STALE 180 /* default value for the BQ_META */
191
192 int lru_is_stale = LRU_IS_STALE;
193 int age_is_stale = AGE_IS_STALE;
194 int meta_is_stale = META_IS_STALE;
195
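/*
 * Illustrative note (added comment, not part of the original source):
 * staleness is computed in getnewbuf() as (time.tv_sec - bp->b_timestamp)
 * and compared against the per-queue threshold above.  For example, with
 * the defaults, a buffer that has sat on BQ_LRU for 150 seconds satisfies
 *
 *	(time.tv_sec - bp->b_timestamp) >= lru_is_stale		(150 >= 120)
 *
 * and is therefore eligible to be recycled ahead of a fresher AGE buffer.
 */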
196 /* LIST_INSERT_HEAD() with assertions */
197 static __inline__ void
198 blistenterhead(struct bufhashhdr * head, struct buf * bp)
199 {
200 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
201 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
202 (head)->lh_first = bp;
203 bp->b_hash.le_prev = &(head)->lh_first;
204 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
205 panic("blistenterhead: le_prev is deadbeef");
206 }
207
208 static __inline__ void
209 binshash(struct buf *bp, struct bufhashhdr *dp)
210 {
211 struct buf *nbp;
212
213 simple_lock(&bufhashlist_slock);
214
215 #if 0
216 if((bad = incore(bp->b_vp, bp->b_lblkno)))
217 panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
218 #endif /* 0 */
219
220 BHASHENTCHECK(bp);
221
222 nbp = dp->lh_first;
223 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
224 if(nbp == bp)
225 panic("buf already in hashlist");
226 }
227
228 blistenterhead(dp, bp);
229 simple_unlock(&bufhashlist_slock);
230 }
231
232 static __inline__ void
233 bremhash(struct buf *bp)
234 {
235 simple_lock(&bufhashlist_slock);
236 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
237 panic("bremhash le_prev is deadbeef");
238 if (bp->b_hash.le_next == bp)
239 panic("bremhash: next points to self");
240
241 if (bp->b_hash.le_next != NULL)
242 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
243 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
244 simple_unlock(&bufhashlist_slock);
245 }
246
247 /*
248 * Remove a buffer from the free list it's on
249 */
250 void
251 bremfree(bp)
252 struct buf *bp;
253 {
254 struct bqueues *dp = NULL;
255 int whichq = -1;
256
257 /*
258 * We only calculate the head of the freelist when removing
259 * the last element of the list as that is the only time that
260 * it is needed (e.g. to reset the tail pointer).
261 *
262 * NB: This makes an assumption about how tailq's are implemented.
263 */
264 if (bp->b_freelist.tqe_next == NULL) {
265 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
266 if (dp->tqh_last == &bp->b_freelist.tqe_next)
267 break;
268 if (dp == &bufqueues[BQUEUES])
269 panic("bremfree: lost tail");
270 }
271 TAILQ_REMOVE(dp, bp, b_freelist);
272 whichq = bp->b_whichq;
273 bufqdec(whichq);
274 bp->b_whichq = -1;
275 bp->b_timestamp = 0;
276 }
277
278 /*
279 * Associate a buffer with a vnode.
280 */
281 static void
282 bgetvp(vp, bp)
283 register struct vnode *vp;
284 register struct buf *bp;
285 {
286
287 if (bp->b_vp != vp)
288 panic("bgetvp: not free");
289 VHOLD(vp);
290 bp->b_vp = vp;
291 if (vp->v_type == VBLK || vp->v_type == VCHR)
292 bp->b_dev = vp->v_rdev;
293 else
294 bp->b_dev = NODEV;
295 /*
296 * Insert onto list for new vnode.
297 */
298 bufinsvn(bp, &vp->v_cleanblkhd);
299 }
300
301 /*
302 * Disassociate a buffer from a vnode.
303 */
304 static void
305 brelvp(bp)
306 register struct buf *bp;
307 {
308 struct vnode *vp;
309
310 if (bp->b_vp == (struct vnode *) 0)
311 panic("brelvp: NULL vp");
312 /*
313 * Delete from old vnode list, if on one.
314 */
315 if (bp->b_vnbufs.le_next != NOLIST)
316 bufremvn(bp);
317 vp = bp->b_vp;
318 bp->b_vp = (struct vnode *) 0;
319 HOLDRELE(vp);
320 }
321
322 /*
323 * Reassign a buffer from one vnode to another.
324 * Used to assign file specific control information
325 * (indirect blocks) to the vnode to which they belong.
326 */
327 void
328 reassignbuf(bp, newvp)
329 register struct buf *bp;
330 register struct vnode *newvp;
331 {
332 register struct buflists *listheadp;
333
334 if (newvp == NULL) {
335 printf("reassignbuf: NULL");
336 return;
337 }
338 /*
339 * Delete from old vnode list, if on one.
340 */
341 if (bp->b_vnbufs.le_next != NOLIST)
342 bufremvn(bp);
343 /*
344 * If dirty, put on list of dirty buffers;
345 * otherwise insert onto list of clean buffers.
346 */
347 if (ISSET(bp->b_flags, B_DELWRI))
348 listheadp = &newvp->v_dirtyblkhd;
349 else
350 listheadp = &newvp->v_cleanblkhd;
351 bufinsvn(bp, listheadp);
352 }
353
354 static __inline__ void
355 bufhdrinit(struct buf *bp)
356 {
357 bzero((char *)bp, sizeof *bp);
358 bp->b_dev = NODEV;
359 bp->b_rcred = NOCRED;
360 bp->b_wcred = NOCRED;
361 bp->b_vnbufs.le_next = NOLIST;
362 bp->b_flags = B_INVAL;
363
364 return;
365 }
366
367 /*
368 * Initialize buffers and hash links for buffers.
369 */
370 __private_extern__ void
371 bufinit()
372 {
373 register struct buf *bp;
374 register struct bqueues *dp;
375 register int i;
376 int metabuf;
377 long whichq;
378 static void bufzoneinit();
379 static void bcleanbuf_thread_init();
380
381 /* Initialize the buffer queues ('freelists') and the hash table */
382 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
383 TAILQ_INIT(dp);
384 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
385
386 simple_lock_init(&bufhashlist_slock );
387
388 metabuf = nbuf/8; /* reserved for meta buf */
389
390 /* Initialize the buffer headers */
391 for (i = 0; i < nbuf; i++) {
392 bp = &buf[i];
393 bufhdrinit(bp);
394
395 /*
396 * metabuf buffer headers on the meta-data list and
397 * rest of the buffer headers on the empty list
398 */
399 if (--metabuf)
400 whichq = BQ_META;
401 else
402 whichq = BQ_EMPTY;
403
404 BLISTNONE(bp);
405 dp = &bufqueues[whichq];
406 binsheadfree(bp, dp, whichq);
407 binshash(bp, &invalhash);
408 }
409
410 for (; i < nbuf + niobuf; i++) {
411 bp = &buf[i];
412 bufhdrinit(bp);
413 binsheadfree(bp, &iobufqueue, -1);
414 }
415
416 printf("using %d buffer headers and %d cluster IO buffer headers\n",
417 nbuf, niobuf);
418
419 /* Set up zones used by the buffer cache */
420 bufzoneinit();
421
422 /* start the bcleanbuf() thread */
423 bcleanbuf_thread_init();
424
425 #if 0 /* notyet */
426 {
427 static void bufq_balance_thread_init();
428 /* create a thread to do dynamic buffer queue balancing */
429 bufq_balance_thread_init();
430 }
431 #endif /* notyet */
432 }
433
434 static struct buf *
435 bio_doread(vp, blkno, size, cred, async, queuetype)
436 struct vnode *vp;
437 daddr_t blkno;
438 int size;
439 struct ucred *cred;
440 int async;
441 int queuetype;
442 {
443 register struct buf *bp;
444 struct proc *p = current_proc();
445
446 bp = getblk(vp, blkno, size, 0, 0, queuetype);
447
448 /*
449 * If buffer does not have data valid, start a read.
450 * Note that if buffer is B_INVAL, getblk() won't return it.
451 * Therefore, it's valid if its I/O has completed or been delayed.
452 */
453 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
454 /* Start I/O for the buffer (keeping credentials). */
455 SET(bp->b_flags, B_READ | async);
456 if (cred != NOCRED && bp->b_rcred == NOCRED) {
457 /*
458 * NFS has embedded ucred.
459 * Can not crhold() here as that causes zone corruption
460 */
461 bp->b_rcred = crdup(cred);
462 }
463
464 VOP_STRATEGY(bp);
465
466 trace(TR_BREADMISS, pack(vp, size), blkno);
467
468 /* Pay for the read. */
469 if (p && p->p_stats)
470 p->p_stats->p_ru.ru_inblock++; /* XXX */
471 } else if (async) {
472 brelse(bp);
473 }
474
475 trace(TR_BREADHIT, pack(vp, size), blkno);
476
477 return (bp);
478 }
479 /*
480 * Read a disk block.
481 * This algorithm is described in Bach (p. 54).
482 */
483 int
484 bread(vp, blkno, size, cred, bpp)
485 struct vnode *vp;
486 daddr_t blkno;
487 int size;
488 struct ucred *cred;
489 struct buf **bpp;
490 {
491 register struct buf *bp;
492
493 /* Get buffer for block. */
494 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
495
496 /* Wait for the read to complete, and return result. */
497 return (biowait(bp));
498 }
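
/*
 * Illustrative caller sketch (added comment, not from the original file;
 * "vp", "blkno" and "size" are hypothetical): the usual bread()/brelse()
 * pattern used by a filesystem to read and examine a block.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... inspect bp->b_data ...
 *	brelse(bp);	// give the buffer back to the cache
 */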
499
500 /*
501 * Read a disk block. [bread() for meta-data]
502 * This algorithm is described in Bach (p. 54).
503 */
504 int
505 meta_bread(vp, blkno, size, cred, bpp)
506 struct vnode *vp;
507 daddr_t blkno;
508 int size;
509 struct ucred *cred;
510 struct buf **bpp;
511 {
512 register struct buf *bp;
513
514 /* Get buffer for block. */
515 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
516
517 /* Wait for the read to complete, and return result. */
518 return (biowait(bp));
519 }
520
521 /*
522 * Read-ahead multiple disk blocks. The first is sync, the rest async.
523 * Trivial modification to the breada algorithm presented in Bach (p.55).
524 */
525 int
526 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
527 struct vnode *vp;
528 daddr_t blkno; int size;
529 daddr_t rablks[]; int rasizes[];
530 int nrablks;
531 struct ucred *cred;
532 struct buf **bpp;
533 {
534 register struct buf *bp;
535 int i;
536
537 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
538
539 /*
540 * For each of the read-ahead blocks, start a read, if necessary.
541 */
542 for (i = 0; i < nrablks; i++) {
543 /* If it's in the cache, just go on to next one. */
544 if (incore(vp, rablks[i]))
545 continue;
546
547 /* Get a buffer for the read-ahead block */
548 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
549 }
550
551 /* Otherwise, we had to start a read for it; wait until it's valid. */
552 return (biowait(bp));
553 }
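
/*
 * Illustrative caller sketch (added comment, not from the original file;
 * names are hypothetical): read block "blkno" synchronously while firing
 * off asynchronous read-ahead for the next two logical blocks.
 *
 *	daddr_t rablks[2] = { blkno + 1, blkno + 2 };
 *	int rasizes[2] = { size, size };
 *	struct buf *bp;
 *	int error;
 *
 *	error = breadn(vp, blkno, size, rablks, rasizes, 2, NOCRED, &bp);
 */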
554
555 /*
556 * Read with single-block read-ahead. Defined in Bach (p.55), but
557 * implemented as a call to breadn().
558 * XXX for compatibility with old file systems.
559 */
560 int
561 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
562 struct vnode *vp;
563 daddr_t blkno; int size;
564 daddr_t rablkno; int rabsize;
565 struct ucred *cred;
566 struct buf **bpp;
567 {
568
569 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
570 }
571
572 /*
573 * Block write. Described in Bach (p.56)
574 */
575 int
576 bwrite(bp)
577 struct buf *bp;
578 {
579 int rv, sync, wasdelayed;
580 struct proc *p = current_proc();
581 struct vnode *vp = bp->b_vp;
582
583 /* Remember buffer type, to switch on it later. */
584 sync = !ISSET(bp->b_flags, B_ASYNC);
585 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
586 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
587 if (wasdelayed) {
588 nbdwrite--;
589 wakeup((caddr_t)&nbdwrite);
590 }
591
592 if (!sync) {
593 /*
594 * If not synchronous, pay for the I/O operation and make
595 * sure the buf is on the correct vnode queue. We have
596 * to do this now, because if we don't, the vnode may not
597 * be properly notified that its I/O has completed.
598 */
599 if (wasdelayed)
600 reassignbuf(bp, vp);
601 else
602 if (p && p->p_stats)
603 p->p_stats->p_ru.ru_oublock++; /* XXX */
604 }
605
606 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
607
608 /* Initiate disk write. Make sure the appropriate party is charged. */
609 SET(bp->b_flags, B_WRITEINPROG);
610 vp->v_numoutput++;
611
612 VOP_STRATEGY(bp);
613
614 if (sync) {
615 /*
616 * If I/O was synchronous, wait for it to complete.
617 */
618 rv = biowait(bp);
619
620 /*
621 * Pay for the I/O operation, if it hasn't been paid for already, and
622 * make sure the buffer is on the correct vnode queue. (Async operations
623 * were paid for above.)
624 */
625 if (wasdelayed)
626 reassignbuf(bp, vp);
627 else
628 if (p && p->p_stats)
629 p->p_stats->p_ru.ru_oublock++; /* XXX */
630
631 /* Release the buffer. */
632 // XXXdbg - only if the unused bit is set
633 if (!ISSET(bp->b_flags, B_NORELSE)) {
634 brelse(bp);
635 } else {
636 CLR(bp->b_flags, B_NORELSE);
637 }
638
639 return (rv);
640 } else {
641 return (0);
642 }
643 }
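
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * a synchronous update.  On the synchronous path bwrite() itself waits for
 * the I/O and releases the buffer, so the caller must not brelse() it again.
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify bp->b_data ...
 *	error = bwrite(bp);	// waits for completion, releases bp
 */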
644
645 int
646 vn_bwrite(ap)
647 struct vop_bwrite_args *ap;
648 {
649 return (bwrite(ap->a_bp));
650 }
651
652 /*
653 * Delayed write.
654 *
655 * The buffer is marked dirty, but is not queued for I/O.
656 * This routine should be used when the buffer is expected
657 * to be modified again soon, typically a small write that
658 * partially fills a buffer.
659 *
660 * NB: magnetic tapes cannot be delayed; they must be
661 * written in the order that the writes are requested.
662 *
663 * Described in Leffler, et al. (pp. 208-213).
664 *
665 * Note: With the ability to allocate additional buffer
666 * headers, we can get into a situation where "too" many
667 * bdwrite()s let the kernel create dirty buffers faster
668 * than the disks can service them. Doing a bawrite() in
669 * cases where we have "too many" outstanding bdwrite()s avoids that.
670 */
671 __private_extern__ int
672 bdwrite_internal(bp, return_error)
673 struct buf *bp;
674 int return_error;
675 {
676 struct proc *p = current_proc();
677 struct vnode *vp = bp->b_vp;
678
679 /*
680 * If the block hasn't been seen before:
681 * (1) Mark it as having been seen,
682 * (2) Charge for the write.
683 * (3) Make sure it's on its vnode's correct block list,
684 */
685 if (!ISSET(bp->b_flags, B_DELWRI)) {
686 SET(bp->b_flags, B_DELWRI);
687 if (p && p->p_stats)
688 p->p_stats->p_ru.ru_oublock++; /* XXX */
689 nbdwrite ++;
690 reassignbuf(bp, vp);
691 }
692
693 /* If this is a tape block, write the block now. */
694 if (ISSET(bp->b_flags, B_TAPE)) {
695 /* bwrite(bp); */
696 VOP_BWRITE(bp);
697 return (0);
698 }
699
700 /*
701 * If the vnode has "too many" write operations in progress,
702 * wait for them to finish their I/O.
703 */
704 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
705 vp->v_flag |= VTHROTTLED;
706 (void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
707 }
708
709 /*
710 * If we have too many delayed write buffers,
711 * more than we can "safely" handle, just fall back to
712 * doing the async write
713 */
714 if (nbdwrite < 0)
715 panic("bdwrite: Negative nbdwrite");
716
717 // can't do a bawrite() if the LOCKED bit is set because the
718 // buffer is part of a transaction and can't go to disk until
719 // the LOCKED bit is cleared.
720 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
721 if (return_error)
722 return (EAGAIN);
723 else
724 bawrite(bp);
725 return (0);
726 }
727
728 /* Otherwise, the "write" is done, so mark and release the buffer. */
729 SET(bp->b_flags, B_DONE);
730 brelse(bp);
731 return (0);
732 }
733
734 void
735 bdwrite(bp)
736 struct buf *bp;
737 {
738 (void) bdwrite_internal(bp, 0);
739 }
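
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * the delayed-write pattern for a block that will likely be modified
 * again soon (e.g. a partially filled directory or bitmap block).
 *
 *	if ((error = bread(vp, blkno, size, NOCRED, &bp))) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify part of bp->b_data ...
 *	bdwrite(bp);	// marks B_DELWRI and releases; I/O happens later
 */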
740
741
742 /*
743 * Asynchronous block write; just an asynchronous bwrite().
744 *
745 * Note: With the ability to allocate additional buffer
746 * headers, we can get into a situation where "too" many
747 * bawrite()s let the kernel create dirty buffers faster
748 * than the disks can service them.
749 * We limit the number of "in flight" writes a vnode can have to
750 * avoid this.
751 */
752 static int
753 bawrite_internal(bp, throttle)
754 struct buf *bp;
755 int throttle;
756 {
757 struct vnode *vp = bp->b_vp;
758
759 if (vp) {
760 /*
761 * If the vnode has "too many" write operations in progress,
762 * wait for them to finish their I/O.
763 */
764 while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
765 if (throttle) {
766 vp->v_flag |= VTHROTTLED;
767 (void)tsleep((caddr_t)&vp->v_numoutput,
768 PRIBIO + 1, "bawrite", 0);
769 } else
770 return (EWOULDBLOCK);
771 }
772 }
773
774 SET(bp->b_flags, B_ASYNC);
775 VOP_BWRITE(bp);
776 return (0);
777 }
778
779 void
780 bawrite(bp)
781 struct buf *bp;
782 {
783 (void) bawrite_internal(bp, 1);
784 }
785
786 /*
787 * bwillwrite:
788 *
789 * Called prior to the locking of any vnodes when we are expecting to
790 * write. We do not want to starve the buffer cache with too many
791 * dirty buffers so we block here. By blocking prior to the locking
792 * of any vnodes we attempt to avoid the situation where a locked vnode
793 * prevents the various system daemons from flushing related buffers.
794 */
795
796 void
797 bwillwrite(void)
798 {
799 /* XXX To be implemented later */
800 }
801
802 /*
803 * Release a buffer onto the free lists.
804 * Described in Bach (p. 46).
805 */
806 void
807 brelse(bp)
808 struct buf *bp;
809 {
810 struct bqueues *bufq;
811 int s;
812 long whichq;
813
814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
815 bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
816 bp->b_flags, 0);
817
818 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
819
820 // if we're invalidating a buffer that has the B_CALL bit
821 // set then call the b_iodone function so it gets cleaned
822 // up properly.
823 //
824 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
825 if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
826 panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
827 }
828 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
829 void (*iodone_func)(struct buf *) = bp->b_iodone;
830
831 CLR(bp->b_flags, B_CALL); /* but note callout done */
832 bp->b_iodone = NULL;
833
834 if (iodone_func == NULL) {
835 panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
836 }
837 (*iodone_func)(bp);
838 }
839 }
840
841 /* IO is done. Cleanup the UPL state */
842 if (!ISSET(bp->b_flags, B_META)
843 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
844 kern_return_t kret;
845 upl_t upl;
846 int upl_flags;
847
848 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
849 if ( !ISSET(bp->b_flags, B_INVAL)) {
850 kret = ubc_create_upl(bp->b_vp,
851 ubc_blktooff(bp->b_vp, bp->b_lblkno),
852 bp->b_bufsize,
853 &upl,
854 NULL,
855 UPL_PRECIOUS);
856 if (kret != KERN_SUCCESS)
857 panic("brelse: Failed to get pagelists");
858 #ifdef UBC_DEBUG
859 upl_ubc_alias_set(upl, bp, 5);
860 #endif /* UBC_DEBUG */
861 } else
862 upl = (upl_t) 0;
863 } else {
864 upl = bp->b_pagelist;
865 kret = ubc_upl_unmap(upl);
866
867 if (kret != KERN_SUCCESS)
868 panic("kernel_upl_unmap failed");
869 bp->b_data = 0;
870 }
871 if (upl) {
872 if (bp->b_flags & (B_ERROR | B_INVAL)) {
873 if (bp->b_flags & (B_READ | B_INVAL))
874 upl_flags = UPL_ABORT_DUMP_PAGES;
875 else
876 upl_flags = 0;
877 ubc_upl_abort(upl, upl_flags);
878 } else {
879 if (ISSET(bp->b_flags, B_NEEDCOMMIT))
880 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
881 else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
882 upl_flags = UPL_COMMIT_SET_DIRTY ;
883 else
884 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
885 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
886 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
887 }
888 s = splbio();
889 CLR(bp->b_flags, B_PAGELIST);
890 bp->b_pagelist = 0;
891 splx(s);
892 }
893 } else {
894 if(ISSET(bp->b_flags, B_PAGELIST))
895 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
896 }
897
898 /* Wake up any processes waiting for any buffer to become free. */
899 if (needbuffer) {
900 needbuffer = 0;
901 wakeup(&needbuffer);
902 }
903
904 /* Wake up any processes waiting for _this_ buffer to become free. */
905 if (ISSET(bp->b_flags, B_WANTED)) {
906 CLR(bp->b_flags, B_WANTED);
907 wakeup(bp);
908 }
909
910 /* Block disk interrupts. */
911 s = splbio();
912
913 /*
914 * Determine which queue the buffer should be on, then put it there.
915 */
916
917 /* If it's locked, don't report an error; try again later. */
918 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
919 CLR(bp->b_flags, B_ERROR);
920
921 /* If it's not cacheable, or an error, mark it invalid. */
922 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
923 SET(bp->b_flags, B_INVAL);
924
925 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
926 /*
927 * If it's invalid or empty, dissociate it from its vnode
928 * and put on the head of the appropriate queue.
929 */
930 if (bp->b_vp)
931 brelvp(bp);
932 if (ISSET(bp->b_flags, B_DELWRI)) {
933 CLR(bp->b_flags, B_DELWRI);
934 nbdwrite--;
935 wakeup((caddr_t)&nbdwrite);
936 }
937 if (bp->b_bufsize <= 0)
938 whichq = BQ_EMPTY; /* no data */
939 else if (ISSET(bp->b_flags, B_META))
940 whichq = BQ_META; /* meta-data */
941 else
942 whichq = BQ_AGE; /* invalid data */
943
944 bufq = &bufqueues[whichq];
945 binsheadfree(bp, bufq, whichq);
946 } else {
947 /*
948 * It has valid data. Put it on the end of the appropriate
949 * queue, so that it'll stick around for as long as possible.
950 */
951 if (ISSET(bp->b_flags, B_LOCKED))
952 whichq = BQ_LOCKED; /* locked in core */
953 else if (ISSET(bp->b_flags, B_META))
954 whichq = BQ_META; /* meta-data */
955 else if (ISSET(bp->b_flags, B_AGE))
956 whichq = BQ_AGE; /* stale but valid data */
957 else
958 whichq = BQ_LRU; /* valid data */
959
960 bufq = &bufqueues[whichq];
961 binstailfree(bp, bufq, whichq);
962 }
963
964 /* Unlock the buffer. */
965 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
966
967 /* Allow disk interrupts. */
968 splx(s);
969
970 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
971 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
972 }
973
974 /*
975 * Determine if a block is in the cache.
976 * Just look on what would be its hash chain. If it's there, return
977 * a pointer to it, unless it's marked invalid. If it's marked invalid,
978 * we normally don't return the buffer, unless the caller explicitly
979 * wants us to.
980 */
981 struct buf *
982 incore(vp, blkno)
983 struct vnode *vp;
984 daddr_t blkno;
985 {
986 struct buf *bp;
987
988 bp = BUFHASH(vp, blkno)->lh_first;
989
990 /* Search hash chain */
991 for (; bp != NULL; bp = bp->b_hash.le_next) {
992 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
993 !ISSET(bp->b_flags, B_INVAL))
994 return (bp);
995 }
996
997 return (0);
998 }
999
1000
1001 /* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
1002 /*
1003 * Get a block of requested size that is associated with
1004 * a given vnode and block offset. If it is found in the
1005 * block cache, mark it as having been found, make it busy
1006 * and return it. Otherwise, return an empty block of the
1007 * correct size. It is up to the caller to ensure that the
1008 * cached blocks are of the correct size.
1009 */
1010 struct buf *
1011 getblk(vp, blkno, size, slpflag, slptimeo, operation)
1012 register struct vnode *vp;
1013 daddr_t blkno;
1014 int size, slpflag, slptimeo, operation;
1015 {
1016 struct buf *bp;
1017 int s, err;
1018 upl_t upl;
1019 upl_page_info_t *pl;
1020 kern_return_t kret;
1021 int error=0;
1022 int pagedirty = 0;
1023
1024 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
1025 blkno * PAGE_SIZE, size, operation, 0, 0);
1026 start:
1027
1028 s = splbio();
1029 if ((bp = incore(vp, blkno))) {
1030 /* Found in the Buffer Cache */
1031 if (ISSET(bp->b_flags, B_BUSY)) {
1032 /* but is busy */
1033 switch (operation) {
1034 case BLK_READ:
1035 case BLK_WRITE:
1036 case BLK_META:
1037 SET(bp->b_flags, B_WANTED);
1038 bufstats.bufs_busyincore++;
1039 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
1040 slptimeo);
1041 splx(s);
1042 /*
1043 * Callers who call with PCATCH or timeout are
1044 * willing to deal with the NULL pointer
1045 */
1046 if (err && ((slpflag & PCATCH) ||
1047 ((err == EWOULDBLOCK) && slptimeo)))
1048 return (NULL);
1049 goto start;
1050 /*NOTREACHED*/
1051 break;
1052
1053 case BLK_PAGEIN:
1054 /* pagein operation must not use getblk */
1055 panic("getblk: pagein for incore busy buffer");
1056 splx(s);
1057 /*NOTREACHED*/
1058 break;
1059
1060 case BLK_PAGEOUT:
1061 /* pageout operation must not use getblk */
1062 panic("getblk: pageout for incore busy buffer");
1063 splx(s);
1064 /*NOTREACHED*/
1065 break;
1066
1067 default:
1068 panic("getblk: %d unknown operation 1", operation);
1069 /*NOTREACHED*/
1070 break;
1071 }
1072 } else {
1073 /* not busy */
1074 SET(bp->b_flags, (B_BUSY | B_CACHE));
1075 bremfree(bp);
1076 bufstats.bufs_incore++;
1077 splx(s);
1078
1079 allocbuf(bp, size);
1080 if (ISSET(bp->b_flags, B_PAGELIST))
1081 panic("pagelist buffer is not busy");
1082
1083 switch (operation) {
1084 case BLK_READ:
1085 case BLK_WRITE:
1086 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
1087 kret = ubc_create_upl(vp,
1088 ubc_blktooff(vp, bp->b_lblkno),
1089 bp->b_bufsize,
1090 &upl,
1091 &pl,
1092 UPL_PRECIOUS);
1093 if (kret != KERN_SUCCESS)
1094 panic("Failed to get pagelists");
1095
1096 SET(bp->b_flags, B_PAGELIST);
1097 bp->b_pagelist = upl;
1098
1099 if (!upl_valid_page(pl, 0)) {
1100 if (vp->v_tag != VT_NFS)
1101 panic("getblk: incore buffer without valid page");
1102 CLR(bp->b_flags, B_CACHE);
1103 }
1104
1105 if (upl_dirty_page(pl, 0))
1106 SET(bp->b_flags, B_WASDIRTY);
1107 else
1108 CLR(bp->b_flags, B_WASDIRTY);
1109
1110 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1111 if (kret != KERN_SUCCESS)
1112 panic("getblk: ubc_upl_map() failed with (%d)",
1113 kret);
1114 if (bp->b_data == 0)
1115 panic("ubc_upl_map mapped 0");
1116 }
1117 break;
1118
1119 case BLK_META:
1120 /*
1121 * VM is not involved in I/O for the meta-data;
1122 * the buffer already has valid data.
1123 */
1124 if(bp->b_data == 0)
1125 panic("bp->b_data null incore buf=%x", bp);
1126 break;
1127
1128 case BLK_PAGEIN:
1129 case BLK_PAGEOUT:
1130 panic("getblk: paging operation 1");
1131 break;
1132
1133 default:
1134 panic("getblk: %d unknown operation 2", operation);
1135 /*NOTREACHED*/
1136 break;
1137 }
1138 }
1139 } else { /* not incore() */
1140 int queue = BQ_EMPTY; /* Start with no preference */
1141 splx(s);
1142
1143 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
1144 !(UBCINFOEXISTS(vp))) {
1145 operation = BLK_META;
1146 }
1147 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
1148 goto start;
1149 if (incore(vp, blkno)) {
1150 SET(bp->b_flags, B_INVAL);
1151 binshash(bp, &invalhash);
1152 brelse(bp);
1153 goto start;
1154 }
1155 /*
1156 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
1157 * CALLED! BE CAREFUL.
1158 */
1159
1160 /*
1161 * If this is meta-data, the queue may have been set to another
1162 * type, so reset it and mark the buffer B_META so that when the
1163 * buffer is released it will go to the META queue.
1164 * Also, if the vnode is not VREG, then it is META.
1165 */
1166 if (operation == BLK_META) {
1167 SET(bp->b_flags, B_META);
1168 queue = BQ_META;
1169 }
1170
1171 bp->b_blkno = bp->b_lblkno = blkno;
1172 bp->b_vp = vp;
1173
1174 /*
1175 * Insert in the hash so that incore() can find it
1176 */
1177 binshash(bp, BUFHASH(vp, blkno));
1178
1179 s = splbio();
1180 bgetvp(vp, bp);
1181 splx(s);
1182
1183 allocbuf(bp, size);
1184
1185 switch (operation) {
1186 case BLK_META:
1187 /* buffer data is invalid */
1188
1189 if(bp->b_data == 0)
1190 panic("bp->b_data is null %x",bp);
1191
1192 bufstats.bufs_miss++;
1193
1194 /* wakeup the buffer */
1195 CLR(bp->b_flags, B_WANTED);
1196 wakeup(bp);
1197 break;
1198
1199 case BLK_READ:
1200 case BLK_WRITE:
1201
1202 if (ISSET(bp->b_flags, B_PAGELIST))
1203 panic("B_PAGELIST in bp=%x",bp);
1204
1205 kret = ubc_create_upl(vp,
1206 ubc_blktooff(vp, blkno),
1207 bp->b_bufsize,
1208 &upl,
1209 &pl,
1210 UPL_PRECIOUS);
1211 if (kret != KERN_SUCCESS)
1212 panic("Failed to get pagelists");
1213
1214 #ifdef UBC_DEBUG
1215 upl_ubc_alias_set(upl, bp, 4);
1216 #endif /* UBC_DEBUG */
1217 bp->b_pagelist = upl;
1218
1219 SET(bp->b_flags, B_PAGELIST);
1220
1221 if (upl_valid_page(pl, 0)) {
1222 SET(bp->b_flags, B_CACHE | B_DONE);
1223 bufstats.bufs_vmhits++;
1224
1225 pagedirty = upl_dirty_page(pl, 0);
1226
1227 if (pagedirty)
1228 SET(bp->b_flags, B_WASDIRTY);
1229
1230 if (vp->v_tag == VT_NFS) {
1231 off_t f_offset;
1232 int valid_size;
1233
1234 bp->b_validoff = 0;
1235 bp->b_dirtyoff = 0;
1236
1237 f_offset = ubc_blktooff(vp, blkno);
1238
1239 if (f_offset > vp->v_ubcinfo->ui_size) {
1240 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1241 bp->b_validend = 0;
1242 bp->b_dirtyend = 0;
1243 } else {
1244 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1245 bp->b_validend = valid_size;
1246
1247 if (pagedirty)
1248 bp->b_dirtyend = valid_size;
1249 else
1250 bp->b_dirtyend = 0;
1251
1252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1253 bp->b_validend, bp->b_dirtyend,
1254 (int)vp->v_ubcinfo->ui_size, 0, 0);
1255 }
1256 } else {
1257 bp->b_validoff = 0;
1258 bp->b_dirtyoff = 0;
1259
1260 if (pagedirty) {
1261 /* page is dirty */
1262 bp->b_validend = bp->b_bcount;
1263 bp->b_dirtyend = bp->b_bcount;
1264 } else {
1265 /* page is clean */
1266 bp->b_validend = bp->b_bcount;
1267 bp->b_dirtyend = 0;
1268 }
1269 }
1270 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
1271 if(error) {
1272 panic("getblk: VOP_BMAP failed");
1273 /*NOTREACHED*/
1274 /*
1275 * XXX: We probably should invalidate the VM Page
1276 */
1277 bp->b_error = error;
1278 SET(bp->b_flags, (B_ERROR | B_INVAL));
1279 /* undo B_DONE that was set before upl_commit() */
1280 CLR(bp->b_flags, B_DONE);
1281 brelse(bp);
1282 return (0);
1283 }
1284 } else {
1285 bufstats.bufs_miss++;
1286 }
1287 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1288 if (kret != KERN_SUCCESS) {
1289 panic("getblk: ubc_upl_map() "
1290 "failed with (%d)", kret);
1291 }
1292 if (bp->b_data == 0)
1293 panic("kernel_upl_map mapped 0");
1294
1295 break;
1296
1297 case BLK_PAGEIN:
1298 case BLK_PAGEOUT:
1299 panic("getblk: paging operation 2");
1300 break;
1301 default:
1302 panic("getblk: %d unknown operation 3", operation);
1303 /*NOTREACHED*/
1304 break;
1305 }
1306 }
1307
1308 if (bp->b_data == NULL)
1309 panic("getblk: bp->b_addr is null");
1310
1311 if (bp->b_bufsize & 0xfff) {
1312 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1313 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1314 }
1315
1316 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1317 (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);
1318
1319 return (bp);
1320 }
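
/*
 * Illustrative caller sketch (added comment, not from the original file;
 * names are hypothetical): getblk() is used when the caller will overwrite
 * the whole block and therefore does not need to read it first.
 *
 *	bp = getblk(vp, blkno, size, 0, 0, BLK_WRITE);
 *	bzero(bp->b_data, size);
 *	... fill in bp->b_data ...
 *	error = bwrite(bp);	// or bdwrite(bp)/bawrite(bp) as appropriate
 */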
1321
1322 /*
1323 * Get an empty, disassociated buffer of given size.
1324 */
1325 struct buf *
1326 geteblk(size)
1327 int size;
1328 {
1329 struct buf *bp;
1330 int queue = BQ_EMPTY;
1331
1332 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1333 ;
1334 SET(bp->b_flags, (B_META|B_INVAL));
1335
1336 #if DIAGNOSTIC
1337 assert(queue == BQ_EMPTY);
1338 #endif /* DIAGNOSTIC */
1339 /* XXX need to implement logic to deal with other queues */
1340
1341 binshash(bp, &invalhash);
1342 allocbuf(bp, size);
1343 bufstats.bufs_eblk++;
1344
1345 return (bp);
1346 }
1347
1348 /*
1349 * Zones for the meta data buffers
1350 */
1351
1352 #define MINMETA 512
1353 #define MAXMETA 4096
1354
1355 struct meta_zone_entry {
1356 zone_t mz_zone;
1357 vm_size_t mz_size;
1358 vm_size_t mz_max;
1359 char *mz_name;
1360 };
1361
1362 struct meta_zone_entry meta_zones[] = {
1363 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1364 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1365 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1366 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1367 {NULL, 0, 0, "" } /* End */
1368 };
1369
1370 /*
1371 * Initialize the meta data zones
1372 */
1373 static void
1374 bufzoneinit(void)
1375 {
1376 int i;
1377
1378 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1379 meta_zones[i].mz_zone =
1380 zinit(meta_zones[i].mz_size,
1381 meta_zones[i].mz_max,
1382 PAGE_SIZE,
1383 meta_zones[i].mz_name);
1384 }
1385 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1386 }
1387
1388 static __inline__ zone_t
1389 getbufzone(size_t size)
1390 {
1391 int i;
1392
1393 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
1394 panic("getbufzone: incorrect size = %d", size);
1395
1396 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1397 if (meta_zones[i].mz_size >= size)
1398 break;
1399 }
1400
1401 return (meta_zones[i].mz_zone);
1402 }
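
/*
 * Illustrative note (added comment, not from the original file): the loop
 * above picks the first zone whose element size is >= the request, so a
 * 1536-byte meta-data buffer (a multiple of MINMETA) is carved out of the
 * "buf.2048" zone, e.g.
 *
 *	zone_t z = getbufzone(1536);	// returns meta_zones[2].mz_zone
 */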
1403
1404 /*
1405 * With UBC, there is no need to expand / shrink the file data
1406 * buffer. The VM uses the same pages, hence no waste.
1407 * All the file data buffers can have one size.
1408 * In fact expand / shrink would be an expensive operation.
1409 *
1410 * Only exception to this is meta-data buffers. Most of the
1411 * meta data operations are smaller than PAGE_SIZE. Having the
1412 * meta-data buffers grow and shrink as needed, optimizes use
1413 * of the kernel wired memory.
1414 */
1415
1416 int
1417 allocbuf(bp, size)
1418 struct buf *bp;
1419 int size;
1420 {
1421 vm_size_t desired_size;
1422
1423 desired_size = roundup(size, CLBYTES);
1424
1425 if(desired_size < PAGE_SIZE)
1426 desired_size = PAGE_SIZE;
1427 if (desired_size > MAXBSIZE)
1428 panic("allocbuf: buffer larger than MAXBSIZE requested");
1429
1430 if (ISSET(bp->b_flags, B_META)) {
1431 kern_return_t kret;
1432 zone_t zprev, z;
1433 size_t nsize = roundup(size, MINMETA);
1434
1435 if (bp->b_data) {
1436 vm_offset_t elem = (vm_offset_t)bp->b_data;
1437
1438 if (ISSET(bp->b_flags, B_ZALLOC))
1439 if (bp->b_bufsize <= MAXMETA) {
1440 if (bp->b_bufsize < nsize) {
1441 /* reallocate to a bigger size */
1442 desired_size = nsize;
1443
1444 zprev = getbufzone(bp->b_bufsize);
1445 z = getbufzone(nsize);
1446 bp->b_data = (caddr_t)zalloc(z);
1447 if(bp->b_data == 0)
1448 panic("allocbuf: zalloc() returned NULL");
1449 bcopy(elem, bp->b_data, bp->b_bufsize);
1450 zfree(zprev, elem);
1451 } else {
1452 desired_size = bp->b_bufsize;
1453 }
1454 } else
1455 panic("allocbuf: B_ZALLOC set incorrectly");
1456 else
1457 if (bp->b_bufsize < desired_size) {
1458 /* reallocate to a bigger size */
1459 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1460 if (kret != KERN_SUCCESS)
1461 panic("allocbuf: kmem_alloc() returned %d", kret);
1462 if(bp->b_data == 0)
1463 panic("allocbuf: null b_data");
1464 bcopy(elem, bp->b_data, bp->b_bufsize);
1465 kmem_free(kernel_map, elem, bp->b_bufsize);
1466 } else {
1467 desired_size = bp->b_bufsize;
1468 }
1469 } else {
1470 /* new allocation */
1471 if (nsize <= MAXMETA) {
1472 desired_size = nsize;
1473 z = getbufzone(nsize);
1474 bp->b_data = (caddr_t)zalloc(z);
1475 if(bp->b_data == 0)
1476 panic("allocbuf: zalloc() returned NULL 2");
1477 SET(bp->b_flags, B_ZALLOC);
1478 } else {
1479 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1480 if (kret != KERN_SUCCESS)
1481 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1482 if(bp->b_data == 0)
1483 panic("allocbuf: null b_data 2");
1484 }
1485 }
1486 }
1487
1488 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1489 panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
1490
1491 bp->b_bufsize = desired_size;
1492 bp->b_bcount = size;
1493 return (0);
1494 }
1495
1496 /*
1497 * Get a new buffer from one of the free lists.
1498 *
1499 * A request for a queue is passed in. The queue from which the buffer was
1500 * taken is returned. Out-of-range queue requests get BQ_EMPTY. A request for
1501 * BQUEUES means no preference. Use heuristics in that case.
1502 * The heuristics are as follows:
1503 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1504 * If none are available, block until one is made available.
1505 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps
1506 * and pick the most stale buffer.
1507 * If the found buffer was marked delayed write, start the async write
1508 * and restart the search.
1509 * Initialize the fields and disassociate the buffer from the vnode.
1510 * Remove the buffer from the hash. Return the buffer and the queue
1511 * on which it was found.
1512 */
1513
1514 static struct buf *
1515 getnewbuf(slpflag, slptimeo, queue)
1516 int slpflag, slptimeo;
1517 int *queue;
1518 {
1519 register struct buf *bp;
1520 register struct buf *lru_bp;
1521 register struct buf *age_bp;
1522 register struct buf *meta_bp;
1523 register int age_time, lru_time, bp_time, meta_time;
1524 int s;
1525 int req = *queue; /* save it for restarts */
1526
1527 start:
1528 s = splbio();
1529
1530 /* invalid request gets empty queue */
1531 if ((*queue > BQUEUES) || (*queue < 0)
1532 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1533 *queue = BQ_EMPTY;
1534
1535 /* (*queue == BQUEUES) means no preference */
1536 if (*queue != BQUEUES) {
1537 /* Try for the requested queue first */
1538 bp = bufqueues[*queue].tqh_first;
1539 if (bp)
1540 goto found;
1541 }
1542
1543 /* Unable to use requested queue */
1544 age_bp = bufqueues[BQ_AGE].tqh_first;
1545 lru_bp = bufqueues[BQ_LRU].tqh_first;
1546 meta_bp = bufqueues[BQ_META].tqh_first;
1547
1548 if (!age_bp && !lru_bp && !meta_bp) {
1549 /*
1550 * Unavailable on AGE or LRU or META queues
1551 * Try the empty list first
1552 */
1553 bp = bufqueues[BQ_EMPTY].tqh_first;
1554 if (bp) {
1555 *queue = BQ_EMPTY;
1556 goto found;
1557 }
1558
1559 /* Create a new temporary buffer header */
1560 bp = (struct buf *)zalloc(buf_hdr_zone);
1561
1562 if (bp) {
1563 bufhdrinit(bp);
1564 BLISTNONE(bp);
1565 binshash(bp, &invalhash);
1566 SET(bp->b_flags, B_HDRALLOC);
1567 *queue = BQ_EMPTY;
1568 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1569 buf_hdr_count++;
1570 goto found;
1571 }
1572
1573 /* Log this error condition */
1574 printf("getnewbuf: No useful buffers");
1575
1576 /* wait for a free buffer of any kind */
1577 needbuffer = 1;
1578 bufstats.bufs_sleeps++;
1579 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1580 splx(s);
1581 return (0);
1582 }
1583
1584 /* Buffer available either on AGE or LRU or META */
1585 bp = NULL;
1586 *queue = -1;
1587
1588 /* Buffer available either on AGE or LRU */
1589 if (!age_bp) {
1590 bp = lru_bp;
1591 *queue = BQ_LRU;
1592 } else if (!lru_bp) {
1593 bp = age_bp;
1594 *queue = BQ_AGE;
1595 } else { /* buffer available on both AGE and LRU */
1596 age_time = time.tv_sec - age_bp->b_timestamp;
1597 lru_time = time.tv_sec - lru_bp->b_timestamp;
1598 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1599 bp = age_bp;
1600 *queue = BQ_AGE;
1601 /*
1602 * we should probably re-timestamp everything in the
1603 * queues at this point with the current time
1604 */
1605 } else {
1606 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1607 bp = lru_bp;
1608 *queue = BQ_LRU;
1609 } else {
1610 bp = age_bp;
1611 *queue = BQ_AGE;
1612 }
1613 }
1614 }
1615
1616 if (!bp) { /* Neither on AGE nor on LRU */
1617 bp = meta_bp;
1618 *queue = BQ_META;
1619 } else if (meta_bp) {
1620 bp_time = time.tv_sec - bp->b_timestamp;
1621 meta_time = time.tv_sec - meta_bp->b_timestamp;
1622
1623 if (!(bp_time < 0) && !(meta_time < 0)) {
1624 /* time not set backwards */
1625 int bp_is_stale;
1626 bp_is_stale = (*queue == BQ_LRU) ?
1627 lru_is_stale : age_is_stale;
1628
1629 if ((meta_time >= meta_is_stale) &&
1630 (bp_time < bp_is_stale)) {
1631 bp = meta_bp;
1632 *queue = BQ_META;
1633 }
1634 }
1635 }
1636
1637 if (bp == NULL)
1638 panic("getnewbuf: null bp");
1639
1640 found:
1641 if (ISSET(bp->b_flags, B_LOCKED)) {
1642 panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
1643 }
1644
1645 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1646 panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
1647
1648 if(ISSET(bp->b_flags, B_BUSY))
1649 panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
1650
1651 /* Clean it */
1652 if (bcleanbuf(bp)) {
1653 /* bawrite() issued, buffer not ready */
1654 splx(s);
1655 *queue = req;
1656 goto start;
1657 }
1658 splx(s);
1659 return (bp);
1660 }
1661
1662 #include <mach/mach_types.h>
1663 #include <mach/memory_object_types.h>
1664 #include <kern/sched_prim.h>
1665
1666 /*
1667 * Clean a buffer.
1668 * Returns 0 if the buffer is ready to use;
1669 * returns 1 if the buffer was a delayed write and has been queued
1670 * for cleaning (i.e. the buffer is not ready).
1671 */
1672 static int
1673 bcleanbuf(struct buf *bp)
1674 {
1675 int s;
1676 struct ucred *cred;
1677 int hdralloc = 0;
1678
1679 s = splbio();
1680
1681 /* Remove from the queue */
1682 bremfree(bp);
1683
1684 /* Buffer is no longer on free lists. */
1685 SET(bp->b_flags, B_BUSY);
1686
1687 /* Check whether the buffer header was "allocated" */
1688 if (ISSET(bp->b_flags, B_HDRALLOC))
1689 hdralloc = 1;
1690
1691 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1692 panic("bcleanbuf: le_prev is deadbeef");
1693
1694 /*
1695 * If buffer was a delayed write, start the IO by queuing
1696 * it on the LAUNDRY queue, and return 1
1697 */
1698 if (ISSET(bp->b_flags, B_DELWRI)) {
1699 splx(s);
1700 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1701 blaundrycnt++;
1702 wakeup(&blaundrycnt);
1703 /* and give it a chance to run */
1704 (void)thread_block(THREAD_CONTINUE_NULL);
1705 return (1);
1706 }
1707
1708 if (bp->b_vp)
1709 brelvp(bp);
1710 bremhash(bp);
1711 BLISTNONE(bp);
1712
1713 splx(s);
1714
1715 if (ISSET(bp->b_flags, B_META)) {
1716 vm_offset_t elem = (vm_offset_t)bp->b_data;
1717 if (elem == 0)
1718 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1719
1720 if (ISSET(bp->b_flags, B_ZALLOC)) {
1721 if (bp->b_bufsize <= MAXMETA) {
1722 zone_t z;
1723
1724 z = getbufzone(bp->b_bufsize);
1725 bp->b_data = (caddr_t)0xdeadbeef;
1726 zfree(z, elem);
1727 CLR(bp->b_flags, B_ZALLOC);
1728 } else
1729 panic("bcleanbuf: B_ZALLOC set incorrectly");
1730 } else {
1731 bp->b_data = (caddr_t)0xdeadbeef;
1732 kmem_free(kernel_map, elem, bp->b_bufsize);
1733 }
1734 }
1735
1736 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1737
1738 /* disassociate us from our vnode, if we had one... */
1739 s = splbio();
1740
1741 /* clear out various other fields */
1742 bp->b_bufsize = 0;
1743 bp->b_data = 0;
1744 bp->b_flags = B_BUSY;
1745 if (hdralloc)
1746 SET(bp->b_flags, B_HDRALLOC);
1747 bp->b_dev = NODEV;
1748 bp->b_blkno = bp->b_lblkno = 0;
1749 bp->b_iodone = 0;
1750 bp->b_error = 0;
1751 bp->b_resid = 0;
1752 bp->b_bcount = 0;
1753 bp->b_dirtyoff = bp->b_dirtyend = 0;
1754 bp->b_validoff = bp->b_validend = 0;
1755
1756 /* nuke any credentials we were holding */
1757 cred = bp->b_rcred;
1758 if (cred != NOCRED) {
1759 bp->b_rcred = NOCRED;
1760 crfree(cred);
1761 }
1762 cred = bp->b_wcred;
1763 if (cred != NOCRED) {
1764 bp->b_wcred = NOCRED;
1765 crfree(cred);
1766 }
1767 splx(s);
1768 return (0);
1769 }
1770
1771
1772 /*
1773 * Wait for operations on the buffer to complete.
1774 * When they do, extract and return the I/O's error value.
1775 */
1776 int
1777 biowait(bp)
1778 struct buf *bp;
1779 {
1780 int s;
1781
1782 s = splbio();
1783 while (!ISSET(bp->b_flags, B_DONE))
1784 tsleep(bp, PRIBIO + 1, "biowait", 0);
1785 splx(s);
1786
1787 /* check for interruption of I/O (e.g. via NFS), then errors. */
1788 if (ISSET(bp->b_flags, B_EINTR)) {
1789 CLR(bp->b_flags, B_EINTR);
1790 return (EINTR);
1791 } else if (ISSET(bp->b_flags, B_ERROR))
1792 return (bp->b_error ? bp->b_error : EIO);
1793 else
1794 return (0);
1795 }
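
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * the getblk()/VOP_STRATEGY()/biowait() sequence used by bio_doread()
 * above, shown in caller form (names are hypothetical).
 *
 *	bp = getblk(vp, blkno, size, 0, 0, BLK_READ);
 *	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 *		SET(bp->b_flags, B_READ);
 *		VOP_STRATEGY(bp);
 *	}
 *	error = biowait(bp);	// sleeps until biodone() sets B_DONE
 */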
1796
1797 /*
1798 * Mark I/O complete on a buffer.
1799 *
1800 * If a callback has been requested, e.g. the pageout
1801 * daemon, do so. Otherwise, awaken waiting processes.
1802 *
1803 * [ Leffler, et al., says on p.247:
1804 * "This routine wakes up the blocked process, frees the buffer
1805 * for an asynchronous write, or, for a request by the pagedaemon
1806 * process, invokes a procedure specified in the buffer structure" ]
1807 *
1808 * In real life, the pagedaemon (or other system processes) wants
1809 * to do async stuff too, and doesn't want the buffer brelse()'d.
1810 * (for swap pager, that puts swap buffers on the free lists (!!!),
1811 * for the vn device, that puts malloc'd buffers on the free lists!)
1812 */
1813 void
1814 biodone(bp)
1815 struct buf *bp;
1816 {
1817 boolean_t funnel_state;
1818 struct vnode *vp;
1819
1820 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1821
1822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1823 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1824
1825 if (ISSET(bp->b_flags, B_DONE))
1826 panic("biodone already");
1827 SET(bp->b_flags, B_DONE); /* note that it's done */
1828 /*
1829 * I/O was done, so don't believe
1830 * the DIRTY state from VM anymore
1831 */
1832 CLR(bp->b_flags, B_WASDIRTY);
1833
1834 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1835 vwakeup(bp); /* wake up reader */
1836
1837 if (kdebug_enable) {
1838 int code = DKIO_DONE;
1839
1840 if (bp->b_flags & B_READ)
1841 code |= DKIO_READ;
1842 if (bp->b_flags & B_ASYNC)
1843 code |= DKIO_ASYNC;
1844
1845 if (bp->b_flags & B_META)
1846 code |= DKIO_META;
1847 else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
1848 code |= DKIO_PAGING;
1849
1850 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
1851 bp, bp->b_vp, bp->b_resid, bp->b_error, 0);
1852 }
1853
1854 /* Wakeup the throttled write operations as needed */
1855 vp = bp->b_vp;
1856 if (vp
1857 && (vp->v_flag & VTHROTTLED)
1858 && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
1859 vp->v_flag &= ~VTHROTTLED;
1860 wakeup((caddr_t)&vp->v_numoutput);
1861 }
1862
1863 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1864 void (*iodone_func)(struct buf *) = bp->b_iodone;
1865
1866 CLR(bp->b_flags, B_CALL); /* but note callout done */
1867 bp->b_iodone = NULL;
1868
1869 if (iodone_func == NULL) {
1870 panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
1871 } else {
1872 (*iodone_func)(bp);
1873 }
1874 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1875 brelse(bp);
1876 else { /* or just wakeup the buffer */
1877 CLR(bp->b_flags, B_WANTED);
1878 wakeup(bp);
1879 }
1880
1881 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1882 (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
1883
1884 thread_funnel_set(kernel_flock, funnel_state);
1885 }
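
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * an async I/O with a completion callback ("my_iodone" is hypothetical).
 * Because B_CALL is set, biodone() above invokes the callback instead of
 * doing the wakeup()/brelse(); the callback is then responsible for the
 * buffer.
 *
 *	bp->b_iodone = my_iodone;
 *	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
 *	VOP_STRATEGY(bp);
 */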
1886
1887 /*
1888 * Return a count of buffers on the "locked" queue.
1889 */
1890 int
1891 count_lock_queue()
1892 {
1893 register struct buf *bp;
1894 register int n = 0;
1895
1896 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1897 bp = bp->b_freelist.tqe_next)
1898 n++;
1899 return (n);
1900 }
1901
1902 /*
1903 * Return a count of 'busy' buffers. Used at the time of shutdown.
1904 */
1905 int
1906 count_busy_buffers()
1907 {
1908 register struct buf *bp;
1909 register int nbusy = 0;
1910
1911 for (bp = &buf[nbuf]; --bp >= buf; )
1912 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1913 nbusy++;
1914 return (nbusy);
1915 }
1916
1917 #if DIAGNOSTIC
1918 /*
1919 * Print out statistics on the current allocation of the buffer pool.
1920 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1921 * in vfs_syscalls.c using sysctl.
1922 */
1923 void
1924 vfs_bufstats()
1925 {
1926 int s, i, j, count;
1927 register struct buf *bp;
1928 register struct bqueues *dp;
1929 int counts[MAXBSIZE/CLBYTES+1];
1930 static char *bname[BQUEUES] =
1931 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1932
1933 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1934 count = 0;
1935 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1936 counts[j] = 0;
1937 s = splbio();
1938 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1939 counts[bp->b_bufsize/CLBYTES]++;
1940 count++;
1941 }
1942 splx(s);
1943 printf("%s: total-%d", bname[i], count);
1944 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1945 if (counts[j] != 0)
1946 printf(", %d-%d", j * CLBYTES, counts[j]);
1947 printf("\n");
1948 }
1949 }
1950 #endif /* DIAGNOSTIC */
1951
1952 #define NRESERVEDIOBUFS 64
1953
1954 __private_extern__ struct buf *
1955 alloc_io_buf(vp, priv)
1956 struct vnode *vp;
1957 int priv;
1958 {
1959 register struct buf *bp;
1960 int s;
1961
1962 s = splbio();
1963
1964 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1965 need_iobuffer = 1;
1966 bufstats.bufs_iobufsleeps++;
1967 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1968 }
1969
1970 while ((bp = iobufqueue.tqh_first) == NULL) {
1971 need_iobuffer = 1;
1972 bufstats.bufs_iobufsleeps++;
1973 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1974 }
1975
1976 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1977 bp->b_timestamp = 0;
1978
1979 /* clear out various fields */
1980 bp->b_flags = B_BUSY;
1981 bp->b_blkno = bp->b_lblkno = 0;
1982
1983 bp->b_iodone = 0;
1984 bp->b_error = 0;
1985 bp->b_resid = 0;
1986 bp->b_bcount = 0;
1987 bp->b_bufsize = 0;
1988 bp->b_vp = vp;
1989
1990 if (vp->v_type == VBLK || vp->v_type == VCHR)
1991 bp->b_dev = vp->v_rdev;
1992 else
1993 bp->b_dev = NODEV;
1994 bufstats.bufs_iobufinuse++;
1995 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1996 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1997 splx(s);
1998
1999 return (bp);
2000 }
2001
2002 __private_extern__ void
2003 free_io_buf(bp)
2004 struct buf *bp;
2005 {
2006 int s;
2007
2008 s = splbio();
2009 /* put buffer back on the head of the iobufqueue */
2010 bp->b_vp = NULL;
2011 bp->b_flags = B_INVAL;
2012
2013 binsheadfree(bp, &iobufqueue, -1);
2014
2015 /* Wake up any processes waiting for any buffer to become free. */
2016 if (need_iobuffer) {
2017 need_iobuffer = 0;
2018 wakeup(&need_iobuffer);
2019 }
2020 bufstats.bufs_iobufinuse--;
2021 splx(s);
2022 }
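
/*
 * Illustrative caller sketch (added comment, not from the original file):
 * how cluster I/O code typically uses these private buffer headers
 * (field values and the completion routine are hypothetical).
 *
 *	bp = alloc_io_buf(vp, 0);
 *	bp->b_blkno = blkno;
 *	bp->b_bcount = bp->b_bufsize = io_size;
 *	bp->b_data = (caddr_t)io_addr;
 *	bp->b_iodone = my_cluster_iodone;
 *	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
 *	VOP_STRATEGY(bp);
 *	...
 *	free_io_buf(bp);	// from the completion path, when done
 */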
2023
2024 /* disabled for now */
2025
2026 /* XXX move this to a separate file */
2027 /*
2028 * Dynamic Scaling of the Buffer Queues
2029 */
2030
2031 typedef long long blsize_t;
2032
2033 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
2034 /* Global tunable limits */
2035 blsize_t nbufh; /* number of buffer headers */
2036 blsize_t nbuflow; /* minimum number of buffer headers required */
2037 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
2038 blsize_t nbuftarget; /* preferred number of buffer headers */
2039
2040 /*
2041 * assertions:
2042 *
2043 * 1. 0 < nbuflow <= nbufh <= nbufhigh
2044 * 2. nbufhigh <= MAXNBUF
2045 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
2046 * 4. nbufh can not be set by sysctl().
2047 */
2048
2049 /* Per queue tunable limits */
2050
2051 struct bufqlim {
2052 blsize_t bl_nlow; /* minimum number of buffer headers required */
2053 blsize_t bl_num; /* number of buffer headers on the queue */
2054 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
2055 blsize_t bl_target; /* preferred number of buffer headers */
2056 long bl_stale; /* Seconds after which a buffer is considered stale */
2057 } bufqlim[BQUEUES];
2058
2059 /*
2060 * assertions:
2061 *
2062 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
2063 * 2. bl_nlhigh <= MAXNBUF
2064 * 3. bufqlim[BQ_META].bl_nlow != 0
2065 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
2066 * file system IO operations)
2067 * 5. bl_num can not be set by sysctl().
2068 * 6. bl_nlhigh <= nbufhigh
2069 */
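
/*
 * A minimal sketch (hypothetical, kept out of the build) of a
 * DIAGNOSTIC-style check for the per queue assertions above.
 */
#if 0
static void
check_bufqlim(void)
{
	int i;

	for (i = 0; i < BQUEUES; i++) {
		if (!(0 <= bufqlim[i].bl_nlow &&
		    bufqlim[i].bl_nlow <= bufqlim[i].bl_num &&
		    bufqlim[i].bl_num <= bufqlim[i].bl_nlhigh))
			panic("bufqlim: queue %d counts out of order", i);
		if (bufqlim[i].bl_nlhigh > MAXNBUF)
			panic("bufqlim: queue %d bl_nlhigh > MAXNBUF", i);
		if (bufqlim[i].bl_nlhigh > nbufhigh)
			panic("bufqlim: queue %d bl_nlhigh > nbufhigh", i);
	}
	if (bufqlim[BQ_META].bl_nlow == 0)
		panic("bufqlim: BQ_META must reserve buffer headers");
}
#endif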
2070
2071 /*
2072 * Rationale:
2073 * ----------
2074  * Defining blsize_t as long would permit 2^31 buffer headers per queue,
2075  * which is enough to describe (2^31 * PAGE_SIZE) bytes of memory per
2076  * queue.
2077  *
2078  * These limits are exported by means of sysctl().
2079  * blsize_t was nevertheless defined as a 64 bit quantity, so that it
2080  * will not have to change as long as the kernel address space does
2081  * not exceed 64 bits.
2082  *
2083  * The low and high limits are initialized at compile time; boot
2084  * arguments can be used to override them, but sysctl() cannot change
2085  * them.  sysctl() can read all of the values but can set only the
2086  * target.  num is the current level.
2087  *
2088  * The advantages of having a "bufqscan" thread do the balancing are:
2089  * it keeps enough buffers on BQ_EMPTY (getnewbuf() by default selects
2090  * a buffer from BQ_EMPTY and performs best when one is found there),
2091  * and it minimizes the chance that getnewbuf() has to start I/O
2092  * itself, which is also a performance win.
2093  *
2094  * It localizes the complex logic (balancing as well as time aging)
2095  * in balancebufq().
2096  *
2097  * It simplifies getnewbuf() by eliminating the time aging code from it.
2098 */
2099
2100 /*
2101 * Algorithm:
2102 * -----------
2103  * The goal of the dynamic scaling of the buffer queues is to keep
2104  * the size of each queue close to its bl_target.  Buffers on a queue
2105  * are time aged.
2106  *
2107  * A dedicated thread is responsible for "balancing" the buffer
2108  * cache queues.
2109  *
2110  * The scan order is: AGE, LRU, META, EMPTY.
2111 */
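
/*
 * For example, with that scan order one pass of the balancing loop in
 * bufqscan_thread() below evaluates
 *	balancebufq(BQ_AGE), balancebufq(BQ_LRU),
 *	balancebufq(BQ_META), balancebufq(BQ_EMPTY)
 * and stops once nextbufq() returns 0; BQ_LOCKED and BQ_LAUNDRY are
 * never balanced.
 */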
2112
2113 long bufqscanwait = 0;
2114
2115 static void bufqscan_thread();
2116 static int balancebufq(int q);
2117 static int btrimempty(int n);
2118 static __inline__ int initbufqscan(void);
2119 static __inline__ int nextbufq(int q);
2120 static void buqlimprt(int all);
2121
2122 static void
2123 bufq_balance_thread_init()
2124 {
2125
2126 if (bufqscanwait++ == 0) {
2127
2128 /* Initialize globals */
2129 MAXNBUF = (mem_size / PAGE_SIZE);
2130 nbufh = nbuf;
2131 nbuflow = min(nbufh, 100);
2132 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
2133 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
2134 nbuftarget = max(nbuflow, nbuftarget);
2135 nbuftarget = min(nbufhigh, nbuftarget);
2136
2137 /*
2138 * Initialize the bufqlim
2139 */
2140
2141 /* LOCKED queue */
2142 bufqlim[BQ_LOCKED].bl_nlow = 0;
2143 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
2144 bufqlim[BQ_LOCKED].bl_target = 0;
2145 bufqlim[BQ_LOCKED].bl_stale = 30;
2146
2147 /* LRU queue */
2148 bufqlim[BQ_LRU].bl_nlow = 0;
2149 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
2150 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
2151 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
2152
2153 /* AGE queue */
2154 bufqlim[BQ_AGE].bl_nlow = 0;
2155 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
2156 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
2157 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
2158
2159 /* EMPTY queue */
2160 bufqlim[BQ_EMPTY].bl_nlow = 0;
2161 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
2162 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
2163 bufqlim[BQ_EMPTY].bl_stale = 600000;
2164
2165 /* META queue */
2166 bufqlim[BQ_META].bl_nlow = 0;
2167 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
2168 bufqlim[BQ_META].bl_target = nbuftarget/4;
2169 bufqlim[BQ_META].bl_stale = META_IS_STALE;
2170
2171 /* LAUNDRY queue */
2172 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
2173 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
2174 bufqlim[BQ_LAUNDRY].bl_target = 0;
2175 bufqlim[BQ_LAUNDRY].bl_stale = 30;
2176
2177 buqlimprt(1);
2178 }
2179
2180 /* create worker thread */
2181 kernel_thread(kernel_task, bufqscan_thread);
2182 }
2183
2184 /* The workloop for the buffer balancing thread */
2185 static void
2186 bufqscan_thread()
2187 {
2188 boolean_t funnel_state;
2189 int moretodo = 0;
2190
2191 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2192
2193 for(;;) {
2194 do {
2195 int q; /* buffer queue to process */
2196
2197 q = initbufqscan();
2198 for (; q; ) {
2199 moretodo |= balancebufq(q);
2200 q = nextbufq(q);
2201 }
2202 } while (moretodo);
2203
2204 #if DIAGNOSTIC
2205 vfs_bufstats();
2206 buqlimprt(0);
2207 #endif
2208 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
2209 moretodo = 0;
2210 }
2211
2212 (void) thread_funnel_set(kernel_flock, FALSE);
2213 }
2214
2215 /* Seed for the buffer queue balancing */
2216 static __inline__ int
2217 initbufqscan()
2218 {
2219 /* Start with AGE queue */
2220 return (BQ_AGE);
2221 }
2222
2223 /* Pick next buffer queue to balance */
2224 static __inline__ int
2225 nextbufq(int q)
2226 {
2227 int i, order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2228 
2229 for (i = 0; order[i] != 0 && order[i] != q; i++)
2230 continue;	/* find q in the documented scan order */
2231 return (order[i] ? order[i + 1] : 0);	/* 0 terminates the scan */
2232 }
2233
2234 /* function to balance the buffer queues */
2235 static int
2236 balancebufq(int q)
2237 {
2238 int moretodo = 0;
2239 int s = splbio();
2240 int n;
2241
2242 /* reject invalid q */
2243 if ((q < 0) || (q >= BQUEUES))
2244 goto out;
2245
2246 /* LOCKED or LAUNDRY queue MUST not be balanced */
2247 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2248 goto out;
2249
2250 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2251
2252 /* If the queue has fewer buffers than its target, there is nothing more to do */
2253 if (n < 0)
2254 goto out;
2255
2256 if ( n > 8 ) {
2257 /* Balance only a small amount (12.5%) at a time */
2258 n >>= 3;
2259 }
2260
2261 /* EMPTY queue needs special handling */
2262 if (q == BQ_EMPTY) {
2263 moretodo |= btrimempty(n);
2264 goto out;
2265 }
2266
2267 for (; n > 0; n--) {
2268 struct buf *bp = bufqueues[q].tqh_first;
2269 if (!bp)
2270 break;
2271
2272 /* check if it's stale */
2273 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2274 if (bcleanbuf(bp)) {
2275 /* bawrite() issued, bp not ready */
2276 moretodo = 1;
2277 } else {
2278 /* release the cleaned buffer to BQ_EMPTY */
2279 SET(bp->b_flags, B_INVAL);
2280 brelse(bp);
2281 }
2282 } else
2283 break;
2284 }
2285
2286 out:
2287 splx(s);
2288 return (moretodo);
2289 }
2290
2291 static int
2292 btrimempty(int n)
2293 {
2294 /*
2295  * When struct bufs are allocated dynamically, this would reclaim
2296  * up to 'n' of them from the empty queue.
2297 */
2298
2299 return (0);
2300 }
2301
2302 static __inline__ void
2303 bufqinc(int q)
2304 {
2305 if ((q < 0) || (q >= BQUEUES))
2306 return;
2307
2308 bufqlim[q].bl_num++;
2309 return;
2310 }
2311
2312 static __inline__ void
2313 bufqdec(int q)
2314 {
2315 if ((q < 0) || (q >= BQUEUES))
2316 return;
2317
2318 bufqlim[q].bl_num--;
2319 return;
2320 }
2321
2322 static void
2323 buqlimprt(int all)
2324 {
2325 int i;
2326 static char *bname[BQUEUES] =
2327 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2328
2329 if (all)
2330 for (i = 0; i < BQUEUES; i++) {
2331 printf("%s : ", bname[i]);
2332 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2333 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2334 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2335 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2336 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2337 }
2338 else
2339 for (i = 0; i < BQUEUES; i++) {
2340 printf("%s : ", bname[i]);
2341 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2342 }
2343 }
2344
2345 /*
2346  * When getnewbuf() calls bcleanbuf() on the same thread,
2347  * there is a potential for stack overrun and deadlocks.
2348  * So we always hand the work off to a worker thread for completion.
2349 */
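
/*
 * Sketch of the producer side of this handoff (illustrative only; the
 * actual enqueueing code lives elsewhere in this file).  The cleaning
 * path is assumed to park the dirty buffer on BQ_LAUNDRY and bump
 * blaundrycnt, which bcleanbuf_thread() below consumes; an explicit
 * wakeup is not strictly required since the thread also wakes up on a
 * 60 second timeout:
 *
 *	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
 *	blaundrycnt++;
 *	wakeup(&blaundrycnt);
 */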
2350
2351 static void
2352 bcleanbuf_thread_init()
2353 {
2354 static void bcleanbuf_thread();
2355
2356 /* create worker thread */
2357 kernel_thread(kernel_task, bcleanbuf_thread);
2358 }
2359
2360 static void
2361 bcleanbuf_thread()
2362 {
2363 boolean_t funnel_state;
2364 struct buf *bp;
2365 int error = 0;
2366 int loopcnt = 0;
2367
2368 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2369
2370 doit:
2371 while (blaundrycnt == 0)
2372 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2373 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2374 /* Remove from the queue */
2375 bremfree(bp);
2376 blaundrycnt--;
2377 /* do the IO */
2378 error = bawrite_internal(bp, 0);
2379 if (error) {
2380 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
2381 blaundrycnt++;
2382 if (loopcnt > 10) {
2383 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
2384 loopcnt = 0;
2385 } else {
2386 (void)thread_block(THREAD_CONTINUE_NULL);
2387 loopcnt++;
2388 }
2389 }
2390 /* start again */
2391 goto doit;
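	/* NOTREACHED -- the loop above never exits */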
2392
2393 (void) thread_funnel_set(kernel_flock, funnel_state);
2394 }
2395
2396
2397 static int
2398 bp_cmp(void *a, void *b)
2399 {
2400 struct buf *bp_a = *(struct buf **)a,
2401 *bp_b = *(struct buf **)b;
2402 daddr_t res;
2403
2404 // don't have to worry about negative block
2405 // numbers so this is ok to do.
2406 //
2407 res = (bp_a->b_blkno - bp_b->b_blkno);
2408
2409 return (int)res;
2410 }
2411
2412 #define NFLUSH 32
2413
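/*
 * bflushq() gathers up to NFLUSH dirty, un-busied buffers belonging to
 * the given mount point from the chosen queue, sorts each batch by
 * starting block number with bp_cmp() so the asynchronous writes are
 * issued in ascending block order, and returns the number of writes
 * started.
 */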
2414 int
2415 bflushq(int whichq, struct mount *mp)
2416 {
2417 struct buf *bp, *next;
2418 int i, buf_count, s;
2419 int counter=0, total_writes=0;
2420 static struct buf *flush_table[NFLUSH];
2421
2422 if (whichq < 0 || whichq >= BQUEUES) {
2423 return (0);
2424 }
2425
2426
2427 restart:
2428 bp = TAILQ_FIRST(&bufqueues[whichq]);
2429 for(buf_count=0; bp; bp=next) {
2430 next = bp->b_freelist.tqe_next;
2431
2432 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
2433 continue;
2434 }
2435
2436 if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
2437 if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
2438 panic("bflushq: bp @ 0x%x is locked!\n", bp);
2439 }
2440
2441 bremfree(bp);
2442 bp->b_flags |= B_BUSY;
2443 flush_table[buf_count] = bp;
2444 buf_count++;
2445 total_writes++;
2446
2447 if (buf_count >= NFLUSH) {
2448 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2449
2450 for(i=0; i < buf_count; i++) {
2451 bawrite(flush_table[i]);
2452 }
2453
2454 goto restart;
2455 }
2456 }
2457 }
2458
2459 if (buf_count > 0) {
2460 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
2461 for(i=0; i < buf_count; i++) {
2462 bawrite(flush_table[i]);
2463 }
2464 }
2465
2466 return total_writes;
2467 }