1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*-
24 * Copyright (c) 1994 Christopher G. Demetriou
25 * Copyright (c) 1982, 1986, 1989, 1993
26 * The Regents of the University of California. All rights reserved.
27 * (c) UNIX System Laboratories, Inc.
28 * All or some portions of this file are derived from material licensed
29 * to the University of California by American Telephone and Telegraph
30 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
31 * the permission of UNIX System Laboratories, Inc.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * The NEXTSTEP Software License Agreement specifies the terms
62 * and conditions for redistribution.
63 *
64 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
65 */
66
67 /*
68 * Some references:
69 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
70 * Leffler, et al.: The Design and Implementation of the 4.3BSD
71 * UNIX Operating System (Addison-Wesley, 1989)
72 */
73 #define ZALLOC_METADATA 1
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/proc.h>
78 #include <sys/buf.h>
79 #include <sys/vnode.h>
80 #include <sys/mount.h>
81 #include <sys/trace.h>
82 #include <sys/malloc.h>
83 #include <sys/resourcevar.h>
84 #include <miscfs/specfs/specdev.h>
85 #include <sys/ubc.h>
86 #include <vm/vm_pageout.h>
87 #if DIAGNOSTIC
88 #include <kern/assert.h>
89 #endif /* DIAGNOSTIC */
90 #include <kern/task.h>
91 #include <kern/zalloc.h>
92
93 #include <sys/kdebug.h>
94
95 extern void bufqinc(int q);
96 extern void bufqdec(int q);
97 extern void bufq_balance_thread_init();
98
99 extern void reassignbuf(struct buf *, struct vnode *);
100 static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
101
102 extern int niobuf; /* The number of IO buffer headers for cluster IO */
103 int blaundrycnt;
104
105 #if TRACE
106 struct proc *traceproc;
107 int tracewhich, tracebuf[TRCSIZ];
108 u_int tracex;
109 char traceflags[TR_NFLAGS];
110 #endif /* TRACE */
111
112 /*
113 * Definitions for the buffer hash lists.
114 */
115 #define BUFHASH(dvp, lbn) \
116 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
117 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
118 u_long bufhash;
119
120 /* Definitions for the buffer stats. */
121 struct bufstats bufstats;
122
123 /*
124 * Insq/Remq for the buffer hash lists.
125 */
126 #if 0
127 #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
128 #define bremhash(bp) LIST_REMOVE(bp, b_hash)
129 #endif /* 0 */
130
131
132 TAILQ_HEAD(ioqueue, buf) iobufqueue;
133 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
134 int needbuffer;
135 int need_iobuffer;
136
137 /*
138 * Insq/Remq for the buffer free lists.
139 */
140 #define binsheadfree(bp, dp, whichq) do { \
141 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
142 bufqinc((whichq)); \
143 (bp)->b_whichq = whichq; \
144 (bp)->b_timestamp = time.tv_sec; \
145 } while (0)
146
147 #define binstailfree(bp, dp, whichq) do { \
148 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
149 bufqinc((whichq)); \
150 (bp)->b_whichq = whichq; \
151 (bp)->b_timestamp = time.tv_sec; \
152 } while (0)
153
154 #define BHASHENTCHECK(bp) \
155 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
156 panic("%x: b_hash.le_prev is not deadbeef", (bp));
157
158 #define BLISTNONE(bp) \
159 (bp)->b_hash.le_next = (struct buf *)0; \
160 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
161
162 simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */
163
164 /*
165 * Time in seconds before a buffer on a list is
166 * considered as a stale buffer
167 */
168 #define LRU_IS_STALE 120 /* default value for the LRU */
169 #define AGE_IS_STALE 60 /* default value for the AGE */
170 #define META_IS_STALE 180 /* default value for the BQ_META */
171
172 int lru_is_stale = LRU_IS_STALE;
173 int age_is_stale = AGE_IS_STALE;
174 int meta_is_stale = META_IS_STALE;
175
176 #if 1
177 void
178 blistenterhead(struct bufhashhdr * head, struct buf * bp)
179 {
180 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
181 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
182 (head)->lh_first = bp;
183 bp->b_hash.le_prev = &(head)->lh_first;
184 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
185 panic("blistenterhead: le_prev is deadbeef");
186
187 }
188 #endif
189
190 #if 1
191 void
192 binshash(struct buf *bp, struct bufhashhdr *dp)
193 {
194 int s;
195
196 struct buf *nbp;
197
198 simple_lock(&bufhashlist_slock);
199 #if 0
200 if(incore(bp->b_vp, bp->b_lblkno)) {
201 panic("adding to queue already existing element");
202 }
203 #endif /* 0 */
204 BHASHENTCHECK(bp);
205
206 nbp = dp->lh_first;
207 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
208 if(nbp == bp)
209 panic("buf already in hashlist");
210 }
211
212 #if 0
213 LIST_INSERT_HEAD(dp, bp, b_hash);
214 #else
215 blistenterhead(dp, bp);
216 #endif
217 simple_unlock(&bufhashlist_slock);
218 }
219
220 void
221 bremhash(struct buf *bp)
222 {
223 int s;
224
225 simple_lock(&bufhashlist_slock);
226 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
227 panic("bremhash le_prev is deadbeef");
228 if (bp->b_hash.le_next == bp)
229 panic("bremhash: next points to self");
230
231 if (bp->b_hash.le_next != NULL)
232 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
233 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
234 simple_unlock(&bufhashlist_slock);
235 }
236
237 #endif /* 1 */
238
239
240 /*
241 * Remove a buffer from the free list it's on
242 */
243 void
244 bremfree(bp)
245 struct buf *bp;
246 {
247 struct bqueues *dp = NULL;
248 int whichq = -1;
249
250 /*
251 * We only calculate the head of the freelist when removing
252 * the last element of the list as that is the only time that
253 * it is needed (e.g. to reset the tail pointer).
254 *
255 * NB: This makes an assumption about how tailq's are implemented.
256 */
257 if (bp->b_freelist.tqe_next == NULL) {
258 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
259 if (dp->tqh_last == &bp->b_freelist.tqe_next)
260 break;
261 if (dp == &bufqueues[BQUEUES])
262 panic("bremfree: lost tail");
263 }
264 TAILQ_REMOVE(dp, bp, b_freelist);
265 whichq = bp->b_whichq;
266 bufqdec(whichq);
267 bp->b_whichq = -1;
268 bp->b_timestamp = 0;
269 }
270
271 static __inline__ void
272 bufhdrinit(struct buf *bp)
273 {
274 bzero((char *)bp, sizeof *bp);
275 bp->b_dev = NODEV;
276 bp->b_rcred = NOCRED;
277 bp->b_wcred = NOCRED;
278 bp->b_vnbufs.le_next = NOLIST;
279 bp->b_flags = B_INVAL;
280
281 return;
282 }
283
284 /*
285 * Initialize buffers and hash links for buffers.
286 */
287 void
288 bufinit()
289 {
290 register struct buf *bp;
291 register struct bqueues *dp;
292 register int i;
293 int metabuf;
294 long whichq;
295 static void bufzoneinit();
296 static void bcleanbuf_thread_init();
297
298 /* Initialize the buffer queues ('freelists') and the hash table */
299 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
300 TAILQ_INIT(dp);
301 bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
302
303 simple_lock_init(&bufhashlist_slock );
304
305 metabuf = nbuf/8; /* reserved for meta buf */
306
307 /* Initialize the buffer headers */
308 for (i = 0; i < nbuf; i++) {
309 bp = &buf[i];
310 bufhdrinit(bp);
311
312 /*
313 * Put the first 'metabuf' buffer headers on the meta-data list
314 * and the rest of the buffer headers on the empty list.
315 */
316 if (metabuf-- > 0)
317 whichq = BQ_META;
318 else
319 whichq = BQ_EMPTY;
320
321 BLISTNONE(bp);
322 dp = &bufqueues[whichq];
323 binsheadfree(bp, dp, whichq);
324 binshash(bp, &invalhash);
325 }
326
327 for (; i < nbuf + niobuf; i++) {
328 bp = &buf[i];
329 bufhdrinit(bp);
330 binsheadfree(bp, &iobufqueue, -1);
331 }
332
333 printf("using %d buffer headers and %d cluster IO buffer headers\n",
334 nbuf, niobuf);
335
336 /* Set up zones used by the buffer cache */
337 bufzoneinit();
338
339 /* start the bcleanbuf() thread */
340 bcleanbuf_thread_init();
341
342 #if 0 /* notyet */
343 /* create a thread to do dynamic buffer queue balancing */
344 bufq_balance_thread_init();
345 #endif /* XXX */
346 }
347
348 /* __inline */
349 struct buf *
350 bio_doread(vp, blkno, size, cred, async, queuetype)
351 struct vnode *vp;
352 daddr_t blkno;
353 int size;
354 struct ucred *cred;
355 int async;
356 int queuetype;
357 {
358 register struct buf *bp;
359 struct proc *p = current_proc();
360
361 bp = getblk(vp, blkno, size, 0, 0, queuetype);
362
363 /*
364 * If buffer does not have data valid, start a read.
365 * Note that if buffer is B_INVAL, getblk() won't return it.
366 * Therefore, it's valid if its I/O has completed or been delayed.
367 */
368 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
369 /* Start I/O for the buffer (keeping credentials). */
370 SET(bp->b_flags, B_READ | async);
371 if (cred != NOCRED && bp->b_rcred == NOCRED) {
372 /*
373 * NFS has embedded ucred.
374 * Can not crhold() here as that causes zone corruption
375 */
376 bp->b_rcred = crdup(cred);
377 }
378 VOP_STRATEGY(bp);
379
380 trace(TR_BREADMISS, pack(vp, size), blkno);
381
382 /* Pay for the read. */
383 if (p && p->p_stats)
384 p->p_stats->p_ru.ru_inblock++; /* XXX */
385 } else if (async) {
386 brelse(bp);
387 }
388
389 trace(TR_BREADHIT, pack(vp, size), blkno);
390
391 return (bp);
392 }
393 /*
394 * Read a disk block.
395 * This algorithm is described in Bach (p.54).
396 */
397 int
398 bread(vp, blkno, size, cred, bpp)
399 struct vnode *vp;
400 daddr_t blkno;
401 int size;
402 struct ucred *cred;
403 struct buf **bpp;
404 {
405 register struct buf *bp;
406
407 /* Get buffer for block. */
408 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
409
410 /* Wait for the read to complete, and return result. */
411 return (biowait(bp));
412 }
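
/*
 * A minimal usage sketch (compiled out): the bread()/brelse() cycle a file
 * system client might use to examine one logical block.  The vnode 'vp',
 * block number 'lbn', and block size 'fs_bsize' are hypothetical parameters.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static int
example_read_block(struct vnode *vp, daddr_t lbn, int fs_bsize)
{
	struct buf *bp;
	int error;

	/* bread() returns a busy buffer whose data is valid (or an error) */
	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
	if (error) {
		brelse(bp);		/* the buffer is returned even on error */
		return (error);
	}
	/* ... inspect bp->b_data here ... */
	brelse(bp);			/* done; back onto a free list */
	return (0);
}
#endif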
413
414 /*
415 * Read a disk block. [bread() for meta-data]
416 * This algorithm is described in Bach (p.54).
417 */
418 int
419 meta_bread(vp, blkno, size, cred, bpp)
420 struct vnode *vp;
421 daddr_t blkno;
422 int size;
423 struct ucred *cred;
424 struct buf **bpp;
425 {
426 register struct buf *bp;
427
428 /* Get buffer for block. */
429 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
430
431 /* Wait for the read to complete, and return result. */
432 return (biowait(bp));
433 }
434
435 /*
436 * Read-ahead multiple disk blocks. The first is sync, the rest async.
437 * Trivial modification to the breada algorithm presented in Bach (p.55).
438 */
439 int
440 breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
441 struct vnode *vp;
442 daddr_t blkno; int size;
443 daddr_t rablks[]; int rasizes[];
444 int nrablks;
445 struct ucred *cred;
446 struct buf **bpp;
447 {
448 register struct buf *bp;
449 int i;
450
451 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
452
453 /*
454 * For each of the read-ahead blocks, start a read, if necessary.
455 */
456 for (i = 0; i < nrablks; i++) {
457 /* If it's in the cache, just go on to next one. */
458 if (incore(vp, rablks[i]))
459 continue;
460
461 /* Get a buffer for the read-ahead block */
462 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
463 }
464
465 /* Wait for the original (synchronous) read to complete and return its status. */
466 return (biowait(bp));
467 }
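
/*
 * A minimal usage sketch (compiled out) of the read-ahead interface above:
 * the first block is read synchronously, the rest asynchronously.  'vp',
 * 'lbn', and 'fs_bsize' are hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static int
example_readahead(struct vnode *vp, daddr_t lbn, int fs_bsize)
{
	struct buf *bp;
	daddr_t rablks[2];
	int rasizes[2];
	int error;

	rablks[0] = lbn + 1;	rasizes[0] = fs_bsize;
	rablks[1] = lbn + 2;	rasizes[1] = fs_bsize;

	/* waits only for 'lbn'; the two read-ahead blocks are started async */
	error = breadn(vp, lbn, fs_bsize, rablks, rasizes, 2, NOCRED, &bp);
	brelse(bp);
	return (error);
}
#endif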
468
469 /*
470 * Read with single-block read-ahead. Defined in Bach (p.55), but
471 * implemented as a call to breadn().
472 * XXX for compatibility with old file systems.
473 */
474 int
475 breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
476 struct vnode *vp;
477 daddr_t blkno; int size;
478 daddr_t rablkno; int rabsize;
479 struct ucred *cred;
480 struct buf **bpp;
481 {
482
483 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
484 }
485
486 /*
487 * Block write. Described in Bach (p.56)
488 */
489 int
490 bwrite(bp)
491 struct buf *bp;
492 {
493 int rv, sync, wasdelayed;
494 struct proc *p = current_proc();
495 upl_t upl;
496 upl_page_info_t *pl;
497 void * object;
498 kern_return_t kret;
499 struct vnode *vp = bp->b_vp;
500
501 /* Remember buffer type, to switch on it later. */
502 sync = !ISSET(bp->b_flags, B_ASYNC);
503 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
504 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
505
506 if (!sync) {
507 /*
508 * If not synchronous, pay for the I/O operation and make
509 * sure the buf is on the correct vnode queue. We have
510 * to do this now, because if we don't, the vnode may not
511 * be properly notified that its I/O has completed.
512 */
513 if (wasdelayed)
514 reassignbuf(bp, vp);
515 else
516 if (p && p->p_stats)
517 p->p_stats->p_ru.ru_oublock++; /* XXX */
518 }
519
520 trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
521
522 /* Initiate disk write. Make sure the appropriate party is charged. */
523 SET(bp->b_flags, B_WRITEINPROG);
524 vp->v_numoutput++;
525
526 VOP_STRATEGY(bp);
527
528 if (sync) {
529 /*
530 * If I/O was synchronous, wait for it to complete.
531 */
532 rv = biowait(bp);
533
534 /*
535 * Pay for the I/O operation, if it's not been paid for, and
536 * make sure it's on the correct vnode queue. (Async operations
537 * were paid for above.)
538 */
539 if (wasdelayed)
540 reassignbuf(bp, vp);
541 else
542 if (p && p->p_stats)
543 p->p_stats->p_ru.ru_oublock++; /* XXX */
544
545 /* Release the buffer. */
546 brelse(bp);
547
548 return (rv);
549 } else {
550 return (0);
551 }
552 }
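
/*
 * A minimal usage sketch (compiled out): a synchronous read-modify-write of a
 * meta-data block, the pattern bwrite() is intended for.  'vp', 'lbn',
 * 'fs_bsize', and the byte being patched are hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static int
example_sync_update(struct vnode *vp, daddr_t lbn, int fs_bsize)
{
	struct buf *bp;
	int error;

	error = meta_bread(vp, lbn, fs_bsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	((char *)bp->b_data)[0] = 0;	/* hypothetical modification */

	/* starts the I/O and, since B_ASYNC is clear, biowait()s for it */
	return (bwrite(bp));
}
#endif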
553
554 int
555 vn_bwrite(ap)
556 struct vop_bwrite_args *ap;
557 {
558 return (bwrite(ap->a_bp));
559 }
560
561 /*
562 * Delayed write.
563 *
564 * The buffer is marked dirty, but is not queued for I/O.
565 * This routine should be used when the buffer is expected
566 * to be modified again soon, typically a small write that
567 * partially fills a buffer.
568 *
569 * NB: magnetic tapes cannot be delayed; they must be
570 * written in the order that the writes are requested.
571 *
572 * Described in Leffler, et al. (pp. 208-213).
573 */
574 void
575 bdwrite(bp)
576 struct buf *bp;
577 {
578 struct proc *p = current_proc();
579 kern_return_t kret;
580 upl_t upl;
581 upl_page_info_t *pl;
582
583 /*
584 * If the block hasn't been seen before:
585 * (1) Mark it as having been seen,
586 * (2) charge for the write,
587 * (3) make sure it's on its vnode's correct block list.
588 */
589 if (!ISSET(bp->b_flags, B_DELWRI)) {
590 SET(bp->b_flags, B_DELWRI);
591 if (p && p->p_stats)
592 p->p_stats->p_ru.ru_oublock++; /* XXX */
593
594 reassignbuf(bp, bp->b_vp);
595 }
596
597
598 /* If this is a tape block, write the block now. */
599 if (ISSET(bp->b_flags, B_TAPE)) {
600 /* bwrite(bp); */
601 VOP_BWRITE(bp);
602 return;
603 }
604
605 /* Otherwise, the "write" is done, so mark and release the buffer. */
606 SET(bp->b_flags, B_DONE);
607 brelse(bp);
608 }
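
/*
 * A minimal usage sketch (compiled out) of the delayed-write pattern described
 * above: a small update that will likely be followed by more updates to the
 * same block.  'vp', 'lbn', 'fs_bsize', 'off', and 'val' are hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static int
example_delayed_update(struct vnode *vp, daddr_t lbn, int fs_bsize, int off, char val)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	((char *)bp->b_data)[off] = val;	/* partial update of the block */

	/*
	 * Mark the buffer dirty and release it; it stays in the cache with
	 * B_DELWRI set and is written out later.
	 */
	bdwrite(bp);
	return (0);
}
#endif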
609
610 /*
611 * Asynchronous block write; just an asynchronous bwrite().
612 */
613 void
614 bawrite(bp)
615 struct buf *bp;
616 {
617
618 SET(bp->b_flags, B_ASYNC);
619 VOP_BWRITE(bp);
620 }
621
622 /*
623 * Release a buffer on to the free lists.
624 * Described in Bach (p. 46).
625 */
626 void
627 brelse(bp)
628 struct buf *bp;
629 {
630 struct bqueues *bufq;
631 int s;
632 long whichq;
633
634 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
635 bp->b_lblkno * PAGE_SIZE, bp, bp->b_data, bp->b_flags, 0);
636
637 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
638
639 /* IO is done. Cleanup the UPL state */
640 if (!ISSET(bp->b_flags, B_META)
641 && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
642 kern_return_t kret;
643 upl_t upl;
644 int upl_flags;
645
646 if ( !ISSET(bp->b_flags, B_PAGELIST)) {
647 if ( !ISSET(bp->b_flags, B_INVAL)) {
648 kret = ubc_create_upl(bp->b_vp,
649 ubc_blktooff(bp->b_vp, bp->b_lblkno),
650 bp->b_bufsize,
651 &upl,
652 NULL,
653 UPL_PRECIOUS);
654 if (kret != KERN_SUCCESS)
655 panic("brelse: Failed to get pagelists");
656 #ifdef UBC_DEBUG
657 upl_ubc_alias_set(upl, bp, 5);
658 #endif /* UBC_DEBUG */
659 } else
660 upl = (upl_t) 0;
661 } else {
662 upl = bp->b_pagelist;
663 kret = ubc_upl_unmap(upl);
664
665 if (kret != KERN_SUCCESS)
666 panic("kernel_upl_unmap failed");
667 bp->b_data = 0;
668 }
669 if (upl) {
670 if (bp->b_flags & (B_ERROR | B_INVAL)) {
671 if (bp->b_flags & (B_READ | B_INVAL))
672 upl_flags = UPL_ABORT_DUMP_PAGES;
673 else
674 upl_flags = 0;
675 ubc_upl_abort(upl, upl_flags);
676 } else {
677 if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
678 upl_flags = UPL_COMMIT_SET_DIRTY ;
679 else
680 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
681 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
682 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
683 }
684 s = splbio();
685 CLR(bp->b_flags, B_PAGELIST);
686 bp->b_pagelist = 0;
687 splx(s);
688 }
689 } else {
690 if(ISSET(bp->b_flags, B_PAGELIST))
691 panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
692 }
693
694 /* Wake up any processes waiting for any buffer to become free. */
695 if (needbuffer) {
696 needbuffer = 0;
697 wakeup(&needbuffer);
698 }
699
700 /* Wake up any processes waiting for _this_ buffer to become free. */
701 if (ISSET(bp->b_flags, B_WANTED)) {
702 CLR(bp->b_flags, B_WANTED);
703 wakeup(bp);
704 }
705
706 /* Block disk interrupts. */
707 s = splbio();
708
709 /*
710 * Determine which queue the buffer should be on, then put it there.
711 */
712
713 /* If it's locked, don't report an error; try again later. */
714 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
715 CLR(bp->b_flags, B_ERROR);
716
717 /* If it's not cacheable, or an error, mark it invalid. */
718 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
719 SET(bp->b_flags, B_INVAL);
720
721 if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
722 /*
723 * If it's invalid or empty, dissociate it from its vnode
724 * and put on the head of the appropriate queue.
725 */
726 if (bp->b_vp)
727 brelvp(bp);
728 CLR(bp->b_flags, B_DELWRI);
729 if (bp->b_bufsize <= 0)
730 whichq = BQ_EMPTY; /* no data */
731 else
732 whichq = BQ_AGE; /* invalid data */
733
734 bufq = &bufqueues[whichq];
735 binsheadfree(bp, bufq, whichq);
736 } else {
737 /*
738 * It has valid data. Put it on the end of the appropriate
739 * queue, so that it'll stick around for as long as possible.
740 */
741 if (ISSET(bp->b_flags, B_LOCKED))
742 whichq = BQ_LOCKED; /* locked in core */
743 else if (ISSET(bp->b_flags, B_META))
744 whichq = BQ_META; /* meta-data */
745 else if (ISSET(bp->b_flags, B_AGE))
746 whichq = BQ_AGE; /* stale but valid data */
747 else
748 whichq = BQ_LRU; /* valid data */
749
750 bufq = &bufqueues[whichq];
751 binstailfree(bp, bufq, whichq);
752 }
753
754 /* Unlock the buffer. */
755 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));
756
757 /* Allow disk interrupts. */
758 splx(s);
759
760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
761 bp, bp->b_data, bp->b_flags, 0, 0);
762 }
763
764 /*
765 * Determine if a block is in the cache.
766 * Just look on what would be its hash chain. If it's there, return
767 * a pointer to it, unless it's marked invalid. If it's marked invalid,
768 * we normally don't return the buffer, unless the caller explicitly
769 * wants us to.
770 */
771 struct buf *
772 incore(vp, blkno)
773 struct vnode *vp;
774 daddr_t blkno;
775 {
776 struct buf *bp;
777 int bufseen = 0;
778
779 bp = BUFHASH(vp, blkno)->lh_first;
780
781 /* Search hash chain */
782 for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
783 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
784 !ISSET(bp->b_flags, B_INVAL))
785 return (bp);
786 if(bufseen >= nbuf)
787 panic("walked more than nbuf in incore");
788
789 }
790
791 return (0);
792 }
793
794 /* XXX FIXME -- Update the comment to reflect the UBC changes -- */
795 /*
796 * Get a block of requested size that is associated with
797 * a given vnode and block offset. If it is found in the
798 * block cache, mark it as having been found, make it busy
799 * and return it. Otherwise, return an empty block of the
800 * correct size. It is up to the caller to ensure that the
801 * cached blocks are of the correct size.
802 */
803 struct buf *
804 getblk(vp, blkno, size, slpflag, slptimeo, operation)
805 register struct vnode *vp;
806 daddr_t blkno;
807 int size, slpflag, slptimeo, operation;
808 {
809 struct buf *bp;
810 int s, err;
811 upl_t upl;
812 upl_page_info_t *pl;
813 kern_return_t kret;
814 int error=0;
815 int pagedirty = 0;
816
817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
818 blkno * PAGE_SIZE, size, operation, 0, 0);
819 start:
820
821 s = splbio();
822 if (bp = incore(vp, blkno)) {
823 /* Found in the Buffer Cache */
824 if (ISSET(bp->b_flags, B_BUSY)) {
825 /* but is busy */
826 switch (operation) {
827 case BLK_READ:
828 case BLK_WRITE:
829 case BLK_META:
830 SET(bp->b_flags, B_WANTED);
831 bufstats.bufs_busyincore++;
832 err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
833 slptimeo);
834 splx(s);
835 /*
836 * Callers who call with PCATCH or timeout are
837 * willing to deal with the NULL pointer
838 */
839 if (err && ((slpflag & PCATCH) ||
840 ((err == EWOULDBLOCK) && slptimeo)))
841 return (NULL);
842 goto start;
843 /*NOTREACHED*/
844 break;
845
846 case BLK_PAGEIN:
847 /* pagein operation must not use getblk */
848 panic("getblk: pagein for incore busy buffer");
849 splx(s);
850 /*NOTREACHED*/
851 break;
852
853 case BLK_PAGEOUT:
854 /* pageout operation must not use getblk */
855 panic("getblk: pageout for incore busy buffer");
856 splx(s);
857 /*NOTREACHED*/
858 break;
859
860 default:
861 panic("getblk: %d unknown operation 1", operation);
862 /*NOTREACHED*/
863 break;
864 }
865 } else {
866 /* not busy */
867 SET(bp->b_flags, (B_BUSY | B_CACHE));
868 bremfree(bp);
869 bufstats.bufs_incore++;
870 splx(s);
871
872 allocbuf(bp, size);
873 if (ISSET(bp->b_flags, B_PAGELIST))
874 panic("pagelist buffer is not busy");
875
876 switch (operation) {
877 case BLK_READ:
878 case BLK_WRITE:
879 if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
880 kret = ubc_create_upl(vp,
881 ubc_blktooff(vp, bp->b_lblkno),
882 bp->b_bufsize,
883 &upl,
884 &pl,
885 UPL_PRECIOUS);
886 if (kret != KERN_SUCCESS)
887 panic("Failed to get pagelists");
888
889 SET(bp->b_flags, B_PAGELIST);
890 bp->b_pagelist = upl;
891
892 if ( !upl_valid_page(pl, 0))
893 panic("getblk: incore buffer without valid page");
894
895 if (upl_dirty_page(pl, 0))
896 SET(bp->b_flags, B_WASDIRTY);
897 else
898 CLR(bp->b_flags, B_WASDIRTY);
899
900 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
901 if (kret != KERN_SUCCESS) {
902 panic("getblk: ubc_upl_map() failed with (%d)",
903 kret);
904 }
905 if (bp->b_data == 0) panic("ubc_upl_map mapped 0");
906 }
907 break;
908
909 case BLK_META:
910 /*
911 * VM is not involved in IO for meta-data buffers;
912 * the buffer already has valid data.
913 */
914 if(bp->b_data == 0)
915 panic("bp->b_data null incore buf=%x", bp);
916 break;
917
918 case BLK_PAGEIN:
919 case BLK_PAGEOUT:
920 panic("getblk: paging operation 1");
921 break;
922
923 default:
924 panic("getblk: %d unknown operation 2", operation);
925 /*NOTREACHED*/
926 break;
927 }
928 }
929 } else { /* not incore() */
930 int queue = BQ_EMPTY; /* Start with no preference */
931 splx(s);
932
933 if ((operation == BLK_META) || (UBCINVALID(vp)) ||
934 !(UBCINFOEXISTS(vp))) {
935 operation = BLK_META;
936 }
937 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
938 goto start;
939 if (incore(vp, blkno)) {
940 SET(bp->b_flags, B_INVAL);
941 binshash(bp, &invalhash);
942 brelse(bp);
943 goto start;
944 }
945
946 /*
947 * If it is meta, the queue may be set to another
948 * type, so reset it and mark the buffer B_META
949 * so that when the buffer is released it will go to the META queue.
950 * Also, if the vnode is not VREG, then it is META.
951 */
952 if (operation == BLK_META) {
953 SET(bp->b_flags, B_META);
954 queue = BQ_META;
955 }
956 /*
957 * Insert in the hash so that incore() can find it
958 */
959 binshash(bp, BUFHASH(vp, blkno));
960
961 allocbuf(bp, size);
962
963 switch (operation) {
964 case BLK_META:
965 /* buffer data is invalid */
966
967 #if !ZALLOC_METADATA
968 if (bp->b_data)
969 panic("bp->b_data is not null; %x",bp);
970 kret = kmem_alloc(kernel_map,
971 &bp->b_data, bp->b_bufsize);
972 if (kret != KERN_SUCCESS)
973 panic("getblk: kmem_alloc() returned %d", kret);
974 #endif /* ZALLOC_METADATA */
975
976 if(bp->b_data == 0)
977 panic("bp->b_data is null %x",bp);
978
979 bp->b_blkno = bp->b_lblkno = blkno;
980 s = splbio();
981 bgetvp(vp, bp);
982 bufstats.bufs_miss++;
983 splx(s);
984 if (bp->b_data == 0)
985 panic("b_data is 0: 2");
986
987 /* wakeup the buffer */
988 CLR(bp->b_flags, B_WANTED);
989 wakeup(bp);
990 break;
991
992 case BLK_READ:
993 case BLK_WRITE:
994
995 if (ISSET(bp->b_flags, B_PAGELIST))
996 panic("B_PAGELIST in bp=%x",bp);
997
998 kret = ubc_create_upl(vp,
999 ubc_blktooff(vp, blkno),
1000 bp->b_bufsize,
1001 &upl,
1002 &pl,
1003 UPL_PRECIOUS);
1004 if (kret != KERN_SUCCESS)
1005 panic("Failed to get pagelists");
1006
1007 #ifdef UBC_DEBUG
1008 upl_ubc_alias_set(upl, bp, 4);
1009 #endif /* UBC_DEBUG */
1010 bp->b_blkno = bp->b_lblkno = blkno;
1011 bp->b_pagelist = upl;
1012
1013 SET(bp->b_flags, B_PAGELIST);
1014
1015 if (upl_valid_page(pl, 0)) {
1016 SET(bp->b_flags, B_CACHE | B_DONE);
1017 bufstats.bufs_vmhits++;
1018
1019 pagedirty = upl_dirty_page(pl, 0);
1020
1021 if (pagedirty)
1022 SET(bp->b_flags, B_WASDIRTY);
1023
1024 if (vp->v_tag == VT_NFS) {
1025 off_t f_offset;
1026 int valid_size;
1027
1028 bp->b_validoff = 0;
1029 bp->b_dirtyoff = 0;
1030
1031 f_offset = ubc_blktooff(vp, blkno);
1032
1033 if (f_offset > vp->v_ubcinfo->ui_size) {
1034 CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
1035 bp->b_validend = 0;
1036 bp->b_dirtyend = 0;
1037 } else {
1038 valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
1039 bp->b_validend = valid_size;
1040
1041 if (pagedirty)
1042 bp->b_dirtyend = valid_size;
1043 else
1044 bp->b_dirtyend = 0;
1045
1046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
1047 bp->b_validend, bp->b_dirtyend,
1048 (int)vp->v_ubcinfo->ui_size, 0, 0);
1049 }
1050 } else {
1051 bp->b_validoff = 0;
1052 bp->b_dirtyoff = 0;
1053
1054 if (pagedirty) {
1055 /* page is dirty */
1056 bp->b_validend = bp->b_bcount;
1057 bp->b_dirtyend = bp->b_bcount;
1058 } else {
1059 /* page is clean */
1060 bp->b_validend = bp->b_bcount;
1061 bp->b_dirtyend = 0;
1062 }
1063 }
1064 if (error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL)) {
1065 panic("VOP_BMAP failed in getblk");
1066 /*NOTREACHED*/
1067 /*
1068 * XXX: We probably should invalidate the VM Page
1069 */
1070 bp->b_error = error;
1071 SET(bp->b_flags, (B_ERROR | B_INVAL));
1072 /* undo B_DONE that was set before upl_commit() */
1073 CLR(bp->b_flags, B_DONE);
1074 brelse(bp);
1075 return (0);
1076 }
1077 } else {
1078 bufstats.bufs_miss++;
1079 }
1080 kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
1081 if (kret != KERN_SUCCESS) {
1082 panic("getblk: ubc_upl_map() "
1083 "failed with (%d)", kret);
1084 }
1085 if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
1086
1087 s = splbio();
1088 bgetvp(vp, bp);
1089 splx(s);
1090
1091 break;
1092
1093 case BLK_PAGEIN:
1094 case BLK_PAGEOUT:
1095 panic("getblk: paging operation 2");
1096 break;
1097 default:
1098 panic("getblk: %d unknown operation 3", operation);
1099 /*NOTREACHED*/
1100 break;
1101 }
1102 }
1103
1104 if (bp->b_data == NULL)
1105 panic("getblk: bp->b_data is null");
1106
1107 if (bp->b_bufsize & 0xfff) {
1108 #if ZALLOC_METADATA
1109 if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
1110 #endif /* ZALLOC_METADATA */
1111 panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
1112 }
1113
1114 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
1115 bp, bp->b_data, bp->b_flags, 3, 0);
1116
1117 return (bp);
1118 }
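
/*
 * A minimal usage sketch (compiled out): calling getblk() directly for a block
 * whose previous contents do not matter (e.g. one just allocated), so no read
 * is needed.  'vp', 'lbn', and 'fs_bsize' are hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static void
example_new_block(struct vnode *vp, daddr_t lbn, int fs_bsize)
{
	struct buf *bp;

	/* returns a busy buffer; no read I/O has been issued */
	bp = getblk(vp, lbn, fs_bsize, 0, 0, BLK_WRITE);

	bzero(bp->b_data, fs_bsize);	/* initialize instead of reading */
	/* ... fill in the new block ... */

	bdwrite(bp);		/* or bwrite(bp) for a synchronous write */
}
#endif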
1119
1120 /*
1121 * Get an empty, disassociated buffer of given size.
1122 */
1123 struct buf *
1124 geteblk(size)
1125 int size;
1126 {
1127 struct buf *bp;
1128 int queue = BQ_EMPTY;
1129 #if !ZALLOC_METADATA
1130 kern_return_t kret;
1131 vm_size_t desired_size = roundup(size, CLBYTES);
1132
1133 if (desired_size > MAXBSIZE)
1134 panic("geteblk: buffer larger than MAXBSIZE requested");
1135 #endif /* ZALLOC_METADATA */
1136
1137 while ((bp = getnewbuf(0, 0, &queue)) == 0)
1138 ;
1139 #if ZALLOC_METADATA
1140 SET(bp->b_flags, (B_META|B_INVAL));
1141 #else
1142 SET(bp->b_flags, B_INVAL);
1143 #endif /* ZALLOC_METADATA */
1144
1145 #if DIAGNOSTIC
1146 assert(queue == BQ_EMPTY);
1147 #endif /* DIAGNOSTIC */
1148 /* XXX need to implement logic to deal with other queues */
1149
1150 #if !ZALLOC_METADATA
1151 /* Empty buffer - allocate pages */
1152 kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
1153 if (kret != KERN_SUCCESS)
1154 panic("geteblk: kmem_alloc_aligned returned %d", kret);
1155 #endif /* ZALLOC_METADATA */
1156
1157 binshash(bp, &invalhash);
1158 allocbuf(bp, size);
1159 bufstats.bufs_eblk++;
1160
1161 return (bp);
1162 }
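
/*
 * A minimal usage sketch (compiled out): geteblk() hands back a busy buffer
 * that is not associated with any vnode, usable as scratch space.  The buffer
 * is already marked B_INVAL, so brelse() will not cache its contents.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static void
example_scratch_buffer(void)
{
	struct buf *bp;

	bp = geteblk(4096);
	/* ... use bp->b_data as temporary storage ... */
	brelse(bp);
}
#endif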
1163
1164 #if ZALLOC_METADATA
1165 /*
1166 * Zones for the meta data buffers
1167 */
1168
1169 #define MINMETA 512
1170 #define MAXMETA 4096
1171
1172 struct meta_zone_entry {
1173 zone_t mz_zone;
1174 vm_size_t mz_size;
1175 vm_size_t mz_max;
1176 char *mz_name;
1177 };
1178
1179 struct meta_zone_entry meta_zones[] = {
1180 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
1181 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
1182 {NULL, (MINMETA * 3), 16 * (MINMETA * 3), "buf.1536" },
1183 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
1184 {NULL, (MINMETA * 5), 16 * (MINMETA * 5), "buf.2560" },
1185 {NULL, (MINMETA * 6), 16 * (MINMETA * 6), "buf.3072" },
1186 {NULL, (MINMETA * 7), 16 * (MINMETA * 7), "buf.3584" },
1187 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
1188 {NULL, 0, 0, "" } /* End */
1189 };
1190 #endif /* ZALLOC_METADATA */
1191
1192 zone_t buf_hdr_zone;
1193 int buf_hdr_count;
1194
1195 /*
1196 * Initialize the meta data zones
1197 */
1198 static void
1199 bufzoneinit(void)
1200 {
1201 #if ZALLOC_METADATA
1202 int i;
1203
1204 for (i = 0; meta_zones[i].mz_size != 0; i++) {
1205 meta_zones[i].mz_zone =
1206 zinit(meta_zones[i].mz_size,
1207 meta_zones[i].mz_max,
1208 PAGE_SIZE,
1209 meta_zones[i].mz_name);
1210 }
1211 #endif /* ZALLOC_METADATA */
1212 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
1213 }
1214
1215 #if ZALLOC_METADATA
1216 static zone_t
1217 getbufzone(size_t size)
1218 {
1219 int i;
1220
1221 if (size % 512)
1222 panic("getbufzone: incorrect size = %d", size);
1223
1224 i = (size / 512) - 1;
1225 return (meta_zones[i].mz_zone);
1226 }
1227 #endif /* ZALLOC_METADATA */
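
/*
 * A worked example (compiled out) of the size-to-zone mapping above: sizes are
 * already rounded to a MINMETA multiple by the callers, and getbufzone()
 * simply indexes the table by (size / 512) - 1.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static void
example_zone_lookup(void)
{
	/* 1536-byte request: (1536 / 512) - 1 == 2 -> meta_zones[2], "buf.1536" */
	zone_t z1 = getbufzone(1536);

	/* MAXMETA request:   (4096 / 512) - 1 == 7 -> meta_zones[7], "buf.4096" */
	zone_t z2 = getbufzone(4096);

	(void)z1; (void)z2;
}
#endif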
1228
1229 /*
1230 * With UBC, there is no need to expand / shrink the file data
1231 * buffer. The VM uses the same pages, hence no waste.
1232 * All the file data buffers can have one size.
1233 * In fact expand / shrink would be an expensive operation.
1234 *
1235 * The only exception to this is meta-data buffers. Most of the
1236 * meta-data operations are smaller than PAGE_SIZE. Having the
1237 * meta-data buffers grow and shrink as needed optimizes use
1238 * of the kernel wired memory.
1239 */
1240
1241 int
1242 allocbuf(bp, size)
1243 struct buf *bp;
1244 int size;
1245 {
1246 vm_size_t desired_size;
1247
1248 desired_size = roundup(size, CLBYTES);
1249
1250 if(desired_size < PAGE_SIZE)
1251 desired_size = PAGE_SIZE;
1252 if (desired_size > MAXBSIZE)
1253 panic("allocbuf: buffer larger than MAXBSIZE requested");
1254
1255 #if ZALLOC_METADATA
1256 if (ISSET(bp->b_flags, B_META)) {
1257 kern_return_t kret;
1258 zone_t zprev, z;
1259 size_t nsize = roundup(size, MINMETA);
1260
1261 if (bp->b_data) {
1262 vm_offset_t elem = (vm_offset_t)bp->b_data;
1263
1264 if (ISSET(bp->b_flags, B_ZALLOC))
1265 if (bp->b_bufsize <= MAXMETA) {
1266 if (bp->b_bufsize < nsize) {
1267 /* reallocate to a bigger size */
1268 desired_size = nsize;
1269
1270 zprev = getbufzone(bp->b_bufsize);
1271 z = getbufzone(nsize);
1272 bp->b_data = (caddr_t)zalloc(z);
1273 if(bp->b_data == 0)
1274 panic("allocbuf: zalloc() returned NULL");
1275 bcopy(elem, bp->b_data, bp->b_bufsize);
1276 zfree(zprev, elem);
1277 } else {
1278 desired_size = bp->b_bufsize;
1279 }
1280 } else
1281 panic("allocbuf: B_ZALLOC set incorrectly");
1282 else
1283 if (bp->b_bufsize < desired_size) {
1284 /* reallocate to a bigger size */
1285 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1286 if (kret != KERN_SUCCESS)
1287 panic("allocbuf: kmem_alloc() returned %d", kret);
1288 if(bp->b_data == 0)
1289 panic("allocbuf: null b_data");
1290 bcopy(elem, bp->b_data, bp->b_bufsize);
1291 kmem_free(kernel_map, elem, bp->b_bufsize);
1292 } else {
1293 desired_size = bp->b_bufsize;
1294 }
1295 } else {
1296 /* new allocation */
1297 if (nsize <= MAXMETA) {
1298 desired_size = nsize;
1299 z = getbufzone(nsize);
1300 bp->b_data = (caddr_t)zalloc(z);
1301 if(bp->b_data == 0)
1302 panic("allocbuf: zalloc() returned NULL 2");
1303 SET(bp->b_flags, B_ZALLOC);
1304 } else {
1305 kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
1306 if (kret != KERN_SUCCESS)
1307 panic("allocbuf: kmem_alloc() 2 returned %d", kret);
1308 if(bp->b_data == 0)
1309 panic("allocbuf: null b_data 2");
1310 }
1311 }
1312 }
1313
1314 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
1315 panic("allocbuf: bp->b_data is NULL");
1316 #endif /* ZALLOC_METADATA */
1317
1318 bp->b_bufsize = desired_size;
1319 bp->b_bcount = size;
1320 }
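
/*
 * A minimal sketch (compiled out) of the resizing policy described above:
 * only B_META buffers are ever reallocated; file data buffers keep one
 * page-rounded size.  The 2048-byte target size is hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static void
example_grow_meta_buffer(struct buf *bp)
{
	/*
	 * For a B_ZALLOC meta buffer of, say, 512 bytes, this copies the data
	 * into an element of the "buf.2048" zone and frees the "buf.512"
	 * element.  For a non-meta buffer it only updates b_bufsize/b_bcount.
	 */
	allocbuf(bp, 2048);
}
#endif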
1321
1322 /*
1323 * Get a new buffer from one of the free lists.
1324 *
1325 * A request for a queue is passed in. The queue from which the buffer was
1326 * taken is returned. Out-of-range queue requests get BQ_EMPTY. A request for
1327 * BQUEUES means no preference; use heuristics in that case.
1328 * The heuristic is as follows:
1329 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
1330 * If none is available, block until one is made available.
1331 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps
1332 * and pick the most stale buffer.
1333 * If the buffer found was marked for delayed write, start the async write
1334 * and restart the search.
1335 * Initialize the fields and disassociate the buffer from the vnode.
1336 * Remove the buffer from the hash. Return the buffer and the queue
1337 * on which it was found.
1338 */
1339
1340 static struct buf *
1341 getnewbuf(slpflag, slptimeo, queue)
1342 int slpflag, slptimeo;
1343 int *queue;
1344 {
1345 register struct buf *bp;
1346 register struct buf *lru_bp;
1347 register struct buf *age_bp;
1348 register struct buf *meta_bp;
1349 register int age_time, lru_time, bp_time, meta_time;
1350 int s;
1351 struct ucred *cred;
1352 int req = *queue; /* save it for restarts */
1353
1354 start:
1355 s = splbio();
1356
1357 /* invalid request gets empty queue */
1358 if ((*queue > BQUEUES) || (*queue < 0)
1359 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
1360 *queue = BQ_EMPTY;
1361
1362 /* (*queue == BQUEUES) means no preference */
1363 if (*queue != BQUEUES) {
1364 /* Try for the requested queue first */
1365 bp = bufqueues[*queue].tqh_first;
1366 if (bp)
1367 goto found;
1368 }
1369
1370 /* Unable to use requested queue */
1371 age_bp = bufqueues[BQ_AGE].tqh_first;
1372 lru_bp = bufqueues[BQ_LRU].tqh_first;
1373 meta_bp = bufqueues[BQ_META].tqh_first;
1374
1375 if (!age_bp && !lru_bp && !meta_bp) { /* Unavailable on AGE, LRU, or META */
1376 /* Try the empty list first */
1377 bp = bufqueues[BQ_EMPTY].tqh_first;
1378 if (bp) {
1379 *queue = BQ_EMPTY;
1380 goto found;
1381 }
1382
1383 /* Create a new temporary buffer header */
1384 bp = (struct buf *)zalloc(buf_hdr_zone);
1385
1386 if (bp) {
1387 bufhdrinit(bp);
1388 BLISTNONE(bp);
1389 binshash(bp, &invalhash);
1390 SET(bp->b_flags, B_HDRALLOC);
1391 *queue = BQ_EMPTY;
1392 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
1393 buf_hdr_count++;
1394 goto found;
1395 }
1396
1397 /* Log this error condition */
1398 printf("getnewbuf: No useful buffers\n");
1399
1400 /* wait for a free buffer of any kind */
1401 needbuffer = 1;
1402 bufstats.bufs_sleeps++;
1403 tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
1404 splx(s);
1405 return (0);
1406 }
1407
1408 /* Buffer available either on AGE or LRU or META */
1409 bp = NULL;
1410 *queue = -1;
1411
1412 /* Buffer available either on AGE or LRU */
1413 if (!age_bp) {
1414 bp = lru_bp;
1415 *queue = BQ_LRU;
1416 } else if (!lru_bp) {
1417 bp = age_bp;
1418 *queue = BQ_AGE;
1419 } else { /* buffer available on both AGE and LRU */
1420 age_time = time.tv_sec - age_bp->b_timestamp;
1421 lru_time = time.tv_sec - lru_bp->b_timestamp;
1422 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
1423 bp = age_bp;
1424 *queue = BQ_AGE;
1425 /*
1426 * we should probably re-timestamp everything in the
1427 * queues at this point with the current time
1428 */
1429 } else {
1430 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
1431 bp = lru_bp;
1432 *queue = BQ_LRU;
1433 } else {
1434 bp = age_bp;
1435 *queue = BQ_AGE;
1436 }
1437 }
1438 }
1439
1440 if (!bp) { /* Neither on AGE nor on LRU */
1441 bp = meta_bp;
1442 *queue = BQ_META;
1443 } else if (meta_bp) {
1444 bp_time = time.tv_sec - bp->b_timestamp;
1445 meta_time = time.tv_sec - meta_bp->b_timestamp;
1446
1447 if (!(bp_time < 0) && !(meta_time < 0)) {
1448 /* time not set backwards */
1449 int bp_is_stale;
1450 bp_is_stale = (*queue == BQ_LRU) ?
1451 lru_is_stale : age_is_stale;
1452
1453 if ((meta_time >= meta_is_stale) &&
1454 (bp_time < bp_is_stale)) {
1455 bp = meta_bp;
1456 *queue = BQ_META;
1457 }
1458 }
1459 }
1460
1461 if (bp == NULL)
1462 panic("getnewbuf: null bp");
1463
1464 found:
1465 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1466 panic("getnewbuf: le_prev is deadbeef");
1467
1468 if(ISSET(bp->b_flags, B_BUSY))
1469 panic("getnewbuf reusing BUSY buf");
1470
1471 /* Clean it */
1472 if (bcleanbuf(bp)) {
1473 /* bawrite() issued, buffer not ready */
1474 splx(s);
1475 *queue = req;
1476 goto start;
1477 }
1478 splx(s);
1479 return (bp);
1480 }
1481 #include <mach/mach_types.h>
1482 #include <mach/memory_object_types.h>
1483
1484 /*
1485 * Clean a buffer.
1486 * Returns 0 if the buffer is ready to use;
1487 * returns 1 if a bawrite() was issued to indicate
1488 * that the buffer is not ready.
1489 */
1490 int
1491 bcleanbuf(struct buf *bp)
1492 {
1493 int s;
1494 struct ucred *cred;
1495
1496 s = splbio();
1497
1498 /* Remove from the queue */
1499 bremfree(bp);
1500
1501 /* Buffer is no longer on free lists. */
1502 SET(bp->b_flags, B_BUSY);
1503
1504 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
1505 panic("bcleanbuf: le_prev is deadbeef");
1506
1507 /*
1508 * If buffer was a delayed write, start the IO by queuing
1509 * it on the LAUNDRY queue, and return 1
1510 */
1511 if (ISSET(bp->b_flags, B_DELWRI)) {
1512 splx(s);
1513 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
1514 blaundrycnt++;
1515 wakeup(&blaundrycnt);
1516 return (1);
1517 }
1518
1519 if (bp->b_vp)
1520 brelvp(bp);
1521 bremhash(bp);
1522 BLISTNONE(bp);
1523
1524 splx(s);
1525
1526 if (ISSET(bp->b_flags, B_META)) {
1527 #if ZALLOC_METADATA
1528 vm_offset_t elem = (vm_offset_t)bp->b_data;
1529 if (elem == 0)
1530 panic("bcleanbuf: NULL bp->b_data B_META buffer");
1531
1532 if (ISSET(bp->b_flags, B_ZALLOC)) {
1533 if (bp->b_bufsize <= MAXMETA) {
1534 zone_t z;
1535
1536 z = getbufzone(bp->b_bufsize);
1537 bp->b_data = (caddr_t)0xdeadbeef;
1538 zfree(z, elem);
1539 CLR(bp->b_flags, B_ZALLOC);
1540 } else
1541 panic("bcleanbuf: B_ZALLOC set incorrectly");
1542 } else {
1543 bp->b_data = (caddr_t)0xdeadbeef;
1544 kmem_free(kernel_map, elem, bp->b_bufsize);
1545 }
1546 #else
1547 if (bp->b_data == 0)
1548 panic("bcleanbuf: bp->b_data == NULL for B_META buffer");
1549
1550 kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
1551 #endif /* ZALLOC_METADATA */
1552 }
1553
1554 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
1555
1556 /* disassociate us from our vnode, if we had one... */
1557 s = splbio();
1558
1559 /* clear out various other fields */
1560 bp->b_bufsize = 0;
1561 bp->b_data = 0;
1562 bp->b_flags = B_BUSY;
1563 bp->b_dev = NODEV;
1564 bp->b_blkno = bp->b_lblkno = 0;
1565 bp->b_iodone = 0;
1566 bp->b_error = 0;
1567 bp->b_resid = 0;
1568 bp->b_bcount = 0;
1569 bp->b_dirtyoff = bp->b_dirtyend = 0;
1570 bp->b_validoff = bp->b_validend = 0;
1571
1572 /* nuke any credentials we were holding */
1573 cred = bp->b_rcred;
1574 if (cred != NOCRED) {
1575 bp->b_rcred = NOCRED;
1576 crfree(cred);
1577 }
1578 cred = bp->b_wcred;
1579 if (cred != NOCRED) {
1580 bp->b_wcred = NOCRED;
1581 crfree(cred);
1582 }
1583 splx(s);
1584 return (0);
1585 }
1586
1587
1588 /*
1589 * Wait for operations on the buffer to complete.
1590 * When they do, extract and return the I/O's error value.
1591 */
1592 int
1593 biowait(bp)
1594 struct buf *bp;
1595 {
1596 upl_t upl;
1597 upl_page_info_t *pl;
1598 int s;
1599 kern_return_t kret;
1600
1601 s = splbio();
1602 while (!ISSET(bp->b_flags, B_DONE))
1603 tsleep(bp, PRIBIO + 1, "biowait", 0);
1604 splx(s);
1605
1606 /* check for interruption of I/O (e.g. via NFS), then errors. */
1607 if (ISSET(bp->b_flags, B_EINTR)) {
1608 CLR(bp->b_flags, B_EINTR);
1609 return (EINTR);
1610 } else if (ISSET(bp->b_flags, B_ERROR))
1611 return (bp->b_error ? bp->b_error : EIO);
1612 else
1613 return (0);
1614 }
1615
1616 /*
1617 * Mark I/O complete on a buffer.
1618 *
1619 * If a callback has been requested, e.g. the pageout
1620 * daemon, do so. Otherwise, awaken waiting processes.
1621 *
1622 * [ Leffler, et al., says on p.247:
1623 * "This routine wakes up the blocked process, frees the buffer
1624 * for an asynchronous write, or, for a request by the pagedaemon
1625 * process, invokes a procedure specified in the buffer structure" ]
1626 *
1627 * In real life, the pagedaemon (or other system processes) wants
1628 * to do async stuff too, and doesn't want the buffer brelse()'d.
1629 * (for swap pager, that puts swap buffers on the free lists (!!!),
1630 * for the vn device, that puts malloc'd buffers on the free lists!)
1631 */
1632 void
1633 biodone(bp)
1634 struct buf *bp;
1635 {
1636 boolean_t funnel_state;
1637 int s;
1638
1639 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1640
1641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
1642 bp, bp->b_data, bp->b_flags, 0, 0);
1643
1644 if (ISSET(bp->b_flags, B_DONE))
1645 panic("biodone already");
1646 SET(bp->b_flags, B_DONE); /* note that it's done */
1647 /*
1648 * I/O was done, so don't believe
1649 * the DIRTY state from VM anymore
1650 */
1651 CLR(bp->b_flags, B_WASDIRTY);
1652
1653 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
1654 vwakeup(bp); /* wake up reader */
1655
1656 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
1657 CLR(bp->b_flags, B_CALL); /* but note callout done */
1658 (*bp->b_iodone)(bp);
1659 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
1660 brelse(bp);
1661 else { /* or just wakeup the buffer */
1662 CLR(bp->b_flags, B_WANTED);
1663 wakeup(bp);
1664 }
1665
1666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
1667 bp, bp->b_data, bp->b_flags, 0, 0);
1668
1669 thread_funnel_set(kernel_flock, funnel_state);
1670 }
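
/*
 * A minimal sketch (compiled out) of the B_CALL path described above: a caller
 * that wants a completion callback instead of sleeping in biowait().  The
 * function 'example_iodone' is hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static void
example_iodone(struct buf *bp)
{
	/* runs from biodone(); the buffer is not brelse()'d on our behalf */
	if (ISSET(bp->b_flags, B_ERROR))
		printf("example_iodone: error %d\n", bp->b_error);
	brelse(bp);
}

static void
example_async_with_callback(struct buf *bp)
{
	bp->b_iodone = example_iodone;
	SET(bp->b_flags, B_CALL | B_ASYNC);
	VOP_STRATEGY(bp);	/* biodone() will invoke example_iodone() */
}
#endif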
1671
1672 /*
1673 * Return a count of buffers on the "locked" queue.
1674 */
1675 int
1676 count_lock_queue()
1677 {
1678 register struct buf *bp;
1679 register int n = 0;
1680
1681 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
1682 bp = bp->b_freelist.tqe_next)
1683 n++;
1684 return (n);
1685 }
1686
1687 /*
1688 * Return a count of 'busy' buffers. Used at the time of shutdown.
1689 */
1690 int
1691 count_busy_buffers()
1692 {
1693 register struct buf *bp;
1694 register int nbusy = 0;
1695
1696 for (bp = &buf[nbuf]; --bp >= buf; )
1697 if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
1698 nbusy++;
1699 return (nbusy);
1700 }
1701
1702 #if 1 /*DIAGNOSTIC */
1703 /*
1704 * Print out statistics on the current allocation of the buffer pool.
1705 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1706 * in vfs_syscalls.c using sysctl.
1707 */
1708 void
1709 vfs_bufstats()
1710 {
1711 int s, i, j, count;
1712 register struct buf *bp;
1713 register struct bqueues *dp;
1714 int counts[MAXBSIZE/CLBYTES+1];
1715 static char *bname[BQUEUES] =
1716 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
1717
1718 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1719 count = 0;
1720 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1721 counts[j] = 0;
1722 s = splbio();
1723 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1724 counts[bp->b_bufsize/CLBYTES]++;
1725 count++;
1726 }
1727 splx(s);
1728 printf("%s: total-%d", bname[i], count);
1729 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
1730 if (counts[j] != 0)
1731 printf(", %d-%d", j * CLBYTES, counts[j]);
1732 printf("\n");
1733 }
1734 }
1735 #endif /* DIAGNOSTIC */
1736
1737 #define NRESERVEDIOBUFS 16
1738
1739 struct buf *
1740 alloc_io_buf(vp, priv)
1741 struct vnode *vp;
1742 int priv;
1743 {
1744 register struct buf *bp;
1745 int s;
1746
1747 s = splbio();
1748
1749 while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
1750 need_iobuffer = 1;
1751 bufstats.bufs_iobufsleeps++;
1752 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
1753 }
1754
1755 while ((bp = iobufqueue.tqh_first) == NULL) {
1756 need_iobuffer = 1;
1757 bufstats.bufs_iobufsleeps++;
1758 (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
1759 }
1760
1761 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
1762 bp->b_timestamp = 0;
1763
1764 /* clear out various fields */
1765 bp->b_flags = B_BUSY;
1766 bp->b_blkno = bp->b_lblkno = 0;
1767 bp->b_iodone = 0;
1768 bp->b_error = 0;
1769 bp->b_resid = 0;
1770 bp->b_bcount = 0;
1771 bp->b_bufsize = 0;
1772 bp->b_vp = vp;
1773
1774 if (vp->v_type == VBLK || vp->v_type == VCHR)
1775 bp->b_dev = vp->v_rdev;
1776 else
1777 bp->b_dev = NODEV;
1778 bufstats.bufs_iobufinuse++;
1779 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
1780 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
1781 splx(s);
1782
1783 return (bp);
1784 }
1785
1786 void
1787 free_io_buf(bp)
1788 struct buf *bp;
1789 {
1790 int s;
1791
1792 s = splbio();
1793 /* put buffer back on the head of the iobufqueue */
1794 bp->b_vp = NULL;
1795 bp->b_flags = B_INVAL;
1796
1797 binsheadfree(bp, &iobufqueue, -1);
1798
1799 /* Wake up any processes waiting for any buffer to become free. */
1800 if (need_iobuffer) {
1801 need_iobuffer = 0;
1802 wakeup(&need_iobuffer);
1803 }
1804 bufstats.bufs_iobufinuse--;
1805 splx(s);
1806 }
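
/*
 * A minimal sketch (compiled out) of the alloc_io_buf()/free_io_buf() pairing:
 * these headers come from the separate iobufqueue pool reserved for cluster
 * I/O and never land on the regular free lists.  'vp' is hypothetical.
 */
#if 0	/* illustrative sketch only -- not part of the build */
static void
example_cluster_io(struct vnode *vp)
{
	struct buf *bp;

	bp = alloc_io_buf(vp, 0);	/* may sleep until an I/O header is free */

	/* ... set b_blkno, b_bcount, b_data, b_flags and issue the I/O ... */

	free_io_buf(bp);		/* return the header to iobufqueue */
}
#endif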
1807
1808
1809 /* not hooked up yet */
1810
1811 /* XXX move this to a separate file */
1812 /*
1813 * Dynamic Scaling of the Buffer Queues
1814 */
1815
1816 typedef long long blsize_t;
1817
1818 blsize_t MAXNBUF; /* initialize to (mem_size / PAGE_SIZE) */
1819 /* Global tunable limits */
1820 blsize_t nbufh; /* number of buffer headers */
1821 blsize_t nbuflow; /* minimum number of buffer headers required */
1822 blsize_t nbufhigh; /* maximum number of buffer headers allowed */
1823 blsize_t nbuftarget; /* preferred number of buffer headers */
1824
1825 /*
1826 * assertions:
1827 *
1828 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1829 * 2. nbufhigh <= MAXNBUF
1830 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1831 * 4. nbufh can not be set by sysctl().
1832 */
1833
1834 /* Per queue tunable limits */
1835
1836 struct bufqlim {
1837 blsize_t bl_nlow; /* minimum number of buffer headers required */
1838 blsize_t bl_num; /* number of buffer headers on the queue */
1839 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
1840 blsize_t bl_target; /* preferred number of buffer headers */
1841 long bl_stale; /* Seconds after which a buffer is considered stale */
1842 } bufqlim[BQUEUES];
1843
1844 /*
1845 * assertions:
1846 *
1847 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
1848 * 2. bl_nlhigh <= MAXNBUF
1849 * 3. bufqlim[BQ_META].bl_nlow != 0
1850 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1851 * file system IO operations)
1852 * 5. bl_num can not be set by sysctl().
1853 * 6. bl_nlhigh <= nbufhigh
1854 */
1855
1856 /*
1857 * Rationale:
1858 * ----------
1859 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
1860 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
1861 *
1862 * These limits are exported by means of sysctl().
1863 * It was decided to define blsize_t as a 64-bit quantity instead.
1864 * This makes sure that we will not be required to change it
1865 * as long as we do not exceed a 64-bit address space for the kernel.
1866 *
1867 * The low and high limit parameters are initialized at compile time,
1868 * and boot arguments can be used to override them. sysctl()
1869 * will not change them; it can get all the values
1870 * but can set only the target. num is the current level.
1871 *
1872 * Advantages of having a "bufqscan" thread doing the balancing are:
1873 * Keep enough bufs on BQ_EMPTY.
1874 * getnewbuf() by default will always select a buffer from BQ_EMPTY;
1875 * getnewbuf() performs best if a buffer was found there.
1876 * This also minimizes the possibility of starting IO
1877 * from getnewbuf(). That's a performance win, too.
1878 *
1879 * Localize complex logic [balancing as well as time aging]
1880 * to balancebufq().
1881 *
1882 * Simplify getnewbuf() logic by elimination of time aging code.
1883 */
1884
1885 /*
1886 * Algorithm:
1887 * -----------
1888 * The goal of the dynamic scaling of the buffer queues is to keep
1889 * the size of the LRU close to bl_target. Buffers on a queue would
1890 * be time aged.
1891 *
1892 * There would be a thread which will be responsible for "balancing"
1893 * the buffer cache queues.
1894 *
1895 * The scan order would be: AGE, LRU, META, EMPTY.
1896 */
1897
1898 long bufqscanwait = 0;
1899
1900 extern void bufqscan_thread();
1901 extern int balancebufq(int q);
1902 extern int btrimempty(int n);
1903 extern int initbufqscan(void);
1904 extern int nextbufq(int q);
1905 extern void buqlimprt(int all);
1906
1907 void
1908 bufq_balance_thread_init()
1909 {
1910
1911 if (bufqscanwait++ == 0) {
1912 int i;
1913
1914 /* Initialize globals */
1915 MAXNBUF = (mem_size / PAGE_SIZE);
1916 nbufh = nbuf;
1917 nbuflow = min(nbufh, 100);
1918 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
1919 nbuftarget = (mem_size >> 5) / PAGE_SIZE;
1920 nbuftarget = max(nbuflow, nbuftarget);
1921 nbuftarget = min(nbufhigh, nbuftarget);
1922
1923 /*
1924 * Initialize the bufqlim
1925 */
1926
1927 /* LOCKED queue */
1928 bufqlim[BQ_LOCKED].bl_nlow = 0;
1929 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
1930 bufqlim[BQ_LOCKED].bl_target = 0;
1931 bufqlim[BQ_LOCKED].bl_stale = 30;
1932
1933 /* LRU queue */
1934 bufqlim[BQ_LRU].bl_nlow = 0;
1935 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
1936 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
1937 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
1938
1939 /* AGE queue */
1940 bufqlim[BQ_AGE].bl_nlow = 0;
1941 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
1942 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
1943 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
1944
1945 /* EMPTY queue */
1946 bufqlim[BQ_EMPTY].bl_nlow = 0;
1947 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
1948 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
1949 bufqlim[BQ_EMPTY].bl_stale = 600000;
1950
1951 /* META queue */
1952 bufqlim[BQ_META].bl_nlow = 0;
1953 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
1954 bufqlim[BQ_META].bl_target = nbuftarget/4;
1955 bufqlim[BQ_META].bl_stale = META_IS_STALE;
1956
1957 /* LAUNDRY queue */
1958 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
1959 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
1960 bufqlim[BQ_LAUNDRY].bl_target = 0;
1961 bufqlim[BQ_LAUNDRY].bl_stale = 30;
1962
1963 buqlimprt(1);
1964 }
1965
1966 /* create worker thread */
1967 kernel_thread(kernel_task, bufqscan_thread);
1968 }
1969
1970 /* The workloop for the buffer balancing thread */
1971 void
1972 bufqscan_thread()
1973 {
1974 boolean_t funnel_state;
1975 int moretodo = 0;
1976
1977 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1978
1979 for(;;) {
1980 do {
1981 int q; /* buffer queue to process */
1982
1983 for (q = initbufqscan(); q; ) {
1984 moretodo |= balancebufq(q);
1985 q = nextbufq(q);
1986 }
1987 } while (moretodo);
1988
1989 #if 1 || DIAGNOSTIC
1990 vfs_bufstats();
1991 buqlimprt(0);
1992 #endif
1993 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
1994 moretodo = 0;
1995 }
1996
1997 (void) thread_funnel_set(kernel_flock, FALSE);
1998 }
1999
2000 /* Seed for the buffer queue balancing */
2001 int
2002 initbufqscan()
2003 {
2004 /* Start with AGE queue */
2005 return (BQ_AGE);
2006 }
2007
2008 /* Pick next buffer queue to balance */
2009 int
2010 nextbufq(int q)
2011 {
2012 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
2013
2014 q++;
2015 q %= (sizeof(order) / sizeof(order[0])); /* element count, not byte count */
2016 return (order[q]);
2017 }
2018
2019 /* function to balance the buffer queues */
2020 int
2021 balancebufq(int q)
2022 {
2023 int moretodo = 0;
2024 int s = splbio();
2025 int n;
2026
2027 /* reject invalid q */
2028 if ((q < 0) || (q >= BQUEUES))
2029 goto out;
2030
2031 /* LOCKED or LAUNDRY queue MUST not be balanced */
2032 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
2033 goto out;
2034
2035 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
2036
2037 /* If queue has less than target nothing more to do */
2038 if (n < 0)
2039 goto out;
2040
2041 if ( n > 8 ) {
2042 /* Balance only a small amount (12.5%) at a time */
2043 n >>= 3;
2044 }
2045
2046 /* EMPTY queue needs special handling */
2047 if (q == BQ_EMPTY) {
2048 moretodo |= btrimempty(n);
2049 goto out;
2050 }
2051
2052 for (; n > 0; n--) {
2053 struct buf *bp = bufqueues[q].tqh_first;
2054 if (!bp)
2055 break;
2056
2057 /* check if it's stale */
2058 if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
2059 if (bcleanbuf(bp)) {
2060 /* bawrite() issued, bp not ready */
2061 moretodo = 1;
2062 } else {
2063 /* release the cleaned buffer to BQ_EMPTY */
2064 SET(bp->b_flags, B_INVAL);
2065 brelse(bp);
2066 }
2067 } else
2068 break;
2069 }
2070
2071 out:
2072 splx(s);
2073 return (moretodo);
2074 }
2075
2076 int
2077 btrimempty(int n)
2078 {
2079 /*
2080 * When struct bufs are allocated dynamically, this would
2081 * reclaim up to 'n' struct bufs from the empty queue.
2082 */
2083
2084 return (0);
2085 }
2086
2087 void
2088 bufqinc(int q)
2089 {
2090 if ((q < 0) || (q >= BQUEUES))
2091 return;
2092
2093 bufqlim[q].bl_num++;
2094 return;
2095 }
2096
2097 void
2098 bufqdec(int q)
2099 {
2100 if ((q < 0) || (q >= BQUEUES))
2101 return;
2102
2103 bufqlim[q].bl_num--;
2104 return;
2105 }
2106
2107 void
2108 buqlimprt(int all)
2109 {
2110 int i;
2111 static char *bname[BQUEUES] =
2112 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
2113
2114 if (all)
2115 for (i = 0; i < BQUEUES; i++) {
2116 printf("%s : ", bname[i]);
2117 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
2118 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2119 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
2120 printf("target = %ld, ", (long)bufqlim[i].bl_target);
2121 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
2122 }
2123 else
2124 for (i = 0; i < BQUEUES; i++) {
2125 printf("%s : ", bname[i]);
2126 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
2127 }
2128 }
2129
2130 /*
2131 * If bcleanbuf(), called from getnewbuf(), issued the delayed write itself
2132 * on the same thread, there would be a potential for stack overrun and
2133 * deadlocks. So we always hand off the work to a worker thread for completion.
2134 */
2135
2136 static void
2137 bcleanbuf_thread_init()
2138 {
2139 static void bcleanbuf_thread();
2140
2141 /* create worker thread */
2142 kernel_thread(kernel_task, bcleanbuf_thread);
2143 }
2144
2145 static void
2146 bcleanbuf_thread()
2147 {
2148 boolean_t funnel_state;
2149 struct buf *bp;
2150
2151 funnel_state = thread_funnel_set(kernel_flock, TRUE);
2152
2153 doit:
2154 while (blaundrycnt == 0)
2155 (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
2156 bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
2157 /* Remove from the queue */
2158 bremfree(bp);
2159 blaundrycnt--;
2160 /* do the IO */
2161 bawrite(bp);
2162 /* start again */
2163 goto doit;
2164
2165 (void) thread_funnel_set(kernel_flock, funnel_state);
2166 }
2167