/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The NEXTSTEP Software License Agreement specifies the terms
 * and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */
/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison Wesley, 1989)
 */
#define	ZALLOC_METADATA	1

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <vm/vm_pageout.h>
#if DIAGNOSTIC
#include <kern/assert.h>
#endif /* DIAGNOSTIC */
#include <kern/task.h>
#include <kern/zalloc.h>

#include <sys/kdebug.h>
extern void bufqinc(int q);
extern void bufqdec(int q);
extern void bufq_balance_thread_init();

extern void reassignbuf(struct buf *, struct vnode *);
static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);

extern int niobuf;	/* The number of IO buffer headers for cluster IO */

struct proc *traceproc;
int tracewhich, tracebuf[TRCSIZ];
char traceflags[TR_NFLAGS];
/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
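/*
 * Illustrative sketch (not part of the original file): BUFHASH folds the
 * vnode pointer and logical block number into an index into bufhashtbl[],
 * so a lookup only has to walk one short chain.  This mirrors what
 * incore() does later in this file; the function name here is hypothetical.
 */
#if 0
static struct buf *
example_hash_lookup(struct vnode *vp, daddr_t lbn)
{
	struct buf *bp;

	/* walk the single chain selected by BUFHASH(vp, lbn) */
	for (bp = BUFHASH(vp, lbn)->lh_first; bp; bp = bp->b_hash.le_next)
		if (bp->b_lblkno == lbn && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	return (NULL);
}
#endif /* 0 -- example only */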
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;

/* Definitions for the buffer stats. */
struct bufstats bufstats;

/*
 * Insq/Remq for the buffer hash lists.
 */
#if 0
#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)
#endif /* 0 */
TAILQ_HEAD(ioqueue, buf) iobufqueue;
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];

int needbuffer;
int need_iobuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp, whichq)	do { \
		TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
		bufqinc((whichq)); \
		(bp)->b_whichq = whichq; \
		(bp)->b_timestamp = time.tv_sec; \
	} while (0)

#define	binstailfree(bp, dp, whichq)	do { \
		TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
		bufqinc((whichq)); \
		(bp)->b_whichq = whichq; \
		(bp)->b_timestamp = time.tv_sec; \
	} while (0)
#define BHASHENTCHECK(bp)	\
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)	\
		panic("%x: b_hash.le_prev is deadb", (bp));

#define BLISTNONE(bp)	\
	(bp)->b_hash.le_next = (struct buf *)0;	\
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;

simple_lock_data_t	bufhashlist_slock;	/* lock on buffer hash list */
/*
 * Time in seconds before a buffer on a list is
 * considered as a stale buffer
 */
#define	LRU_IS_STALE	120	/* default value for the LRU */
#define	AGE_IS_STALE	60	/* default value for the AGE */
#define	META_IS_STALE	180	/* default value for the BQ_META */

int	lru_is_stale = LRU_IS_STALE;
int	age_is_stale = AGE_IS_STALE;
int	meta_is_stale = META_IS_STALE;
void
blistenterhead(struct bufhashhdr * head, struct buf * bp)
{
	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
	(head)->lh_first = bp;
	bp->b_hash.le_prev = &(head)->lh_first;
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("blistenterhead: le_prev is deadbeef");
}
void
binshash(struct buf *bp, struct bufhashhdr *dp)
{
	struct buf *nbp;

	simple_lock(&bufhashlist_slock);

	if (incore(bp->b_vp, bp->b_lblkno)) {
		panic("adding to queue already existing element");
	}

	/* make sure the buffer is not already on this hash chain */
	nbp = dp->lh_first;
	for (; nbp != NULL; nbp = nbp->b_hash.le_next) {
		if (nbp == bp)
			panic("buf already in hashlist");
	}

#if 0
	LIST_INSERT_HEAD(dp, bp, b_hash);
#else
	blistenterhead(dp, bp);
#endif
	simple_unlock(&bufhashlist_slock);
}
void
bremhash(struct buf *bp)
{
	simple_lock(&bufhashlist_slock);

	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bremhash le_prev is deadbeef");
	if (bp->b_hash.le_next == bp)
		panic("bremhash: next points to self");

	if (bp->b_hash.le_next != NULL)
		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
	*bp->b_hash.le_prev = (bp)->b_hash.le_next;

	simple_unlock(&bufhashlist_slock);
}
/*
 * Remove a buffer from the free list it's on
 */
void
bremfree(struct buf *bp)
{
	struct bqueues *dp = NULL;
	int whichq = -1;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
	whichq = bp->b_whichq;
	bufqdec(whichq);
	bp->b_whichq = -1;
	bp->b_timestamp = 0;
}
/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	register struct bqueues *dp;
	register int i;
	int metabuf;
	long whichq;
#if ZALLOC_METADATA
	static void bufzoneinit();
#endif /* ZALLOC_METADATA */

	/* Initialize the buffer queues ('freelists') and the hash table */
	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);

	simple_lock_init(&bufhashlist_slock);

	metabuf = nbuf/8;	/* reserved for meta buf */

	/* Initialize the buffer headers */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_flags = B_INVAL;
		/*
		 * metabuf buffer headers on the meta-data list and
		 * rest of the buffer headers on the empty list
		 */
		if (--metabuf > 0)
			whichq = BQ_META;
		else
			whichq = BQ_EMPTY;

		dp = &bufqueues[whichq];
		binsheadfree(bp, dp, whichq);
		binshash(bp, &invalhash);
	}

	for (; i < nbuf + niobuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_flags = B_INVAL;
		binsheadfree(bp, &iobufqueue, -1);
	}

	printf("using %d buffer headers and %d cluster IO buffer headers\n",
		nbuf, niobuf);

#if ZALLOC_METADATA
	/* Set up zones for meta-data */
	bufzoneinit();
#endif /* ZALLOC_METADATA */

	/* create a thread to do dynamic buffer queue balancing */
	bufq_balance_thread_init();
}
/* Common read path for bread(), meta_bread() and breadn(). */
static struct buf *
bio_doread(vp, blkno, size, cred, async, queuetype)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	int async;
	int queuetype;
{
	register struct buf *bp;
	struct proc *p = current_proc();

	bp = getblk(vp, blkno, size, 0, 0, queuetype);

	/*
	 * If buffer does not have data valid, start a read.
	 * Note that if buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		/* Start I/O for the buffer (keeping credentials). */

		SET(bp->b_flags, B_READ | async);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);

		trace(TR_BREADMISS, pack(vp, size), blkno);

		/* Pay for the read. */
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_inblock++;		/* XXX */
	}

	trace(TR_BREADHIT, pack(vp, size), blkno);

	return (bp);
}
/*
 * Read a disk block.
 * This algorithm described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
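/*
 * Illustrative sketch (not part of the original file): a typical caller
 * reads a block with bread(), inspects the data, and releases the buffer
 * with brelse().  The function name and parameters are hypothetical.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t lbn, int bsize, struct ucred *cred)
{
	struct buf *bp;
	int error;

	/* bread() returns with the buffer busy; data is valid on success */
	if ((error = bread(vp, lbn, bsize, cred, &bp)))
		return (error);

	/* ... examine bp->b_data here ... */

	brelse(bp);	/* give the buffer back to the cache */
	return (0);
}
#endif /* 0 -- example only */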
/*
 * Read a disk block. [bread() for meta-data]
 * This algorithm described in Bach (p.54).
 */
int
meta_bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, BLK_READ);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}
/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}
/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, sync, wasdelayed;
	struct proc *p = current_proc();
	struct vnode *vp = bp->b_vp;

	/* Remember buffer type, to switch on it later. */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	if (!sync) {
		/*
		 * If not synchronous, pay for the I/O operation and make
		 * sure the buf is on the correct vnode queue.  We have
		 * to do this now, because if we don't, the vnode may not
		 * be properly notified that its I/O has completed.
		 */
		if (wasdelayed)
			reassignbuf(bp, vp);
		else if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;		/* XXX */
	}

	trace(TR_BWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);

	/* Initiate disk write.  Make sure the appropriate party is charged. */
	SET(bp->b_flags, B_WRITEINPROG);
	vp->v_numoutput++;

	VOP_STRATEGY(bp);

	if (sync) {
		/*
		 * If I/O was synchronous, wait for it to complete.
		 */
		rv = biowait(bp);

		/*
		 * Pay for the I/O operation, if it's not been paid for, and
		 * make sure it's on the correct vnode queue.  (async operations
		 * were paid for above.)
		 */
		if (wasdelayed)
			reassignbuf(bp, vp);
		else if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;		/* XXX */

		/* Release the buffer. */
		brelse(bp);

		return (rv);
	} else {
		return (0);
	}
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(bp)
	struct buf *bp;
{
	struct proc *p = current_proc();

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;		/* XXX */
		reassignbuf(bp, bp->b_vp);
	}

	/* If this is a tape block, write the block now. */
	if (ISSET(bp->b_flags, B_TAPE)) {
		VOP_BWRITE(bp);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->b_flags, B_DONE);
	brelse(bp);
}
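/*
 * Illustrative sketch (not part of the original file): the delayed-write
 * path is typically used when a caller dirties only part of a block and
 * expects to touch it again soon.  The function name and parameters are
 * hypothetical.
 */
#if 0
static int
example_partial_update(struct vnode *vp, daddr_t lbn, int bsize,
		int off, caddr_t src, int len, struct ucred *cred)
{
	struct buf *bp;
	int error;

	if ((error = bread(vp, lbn, bsize, cred, &bp)))
		return (error);

	bcopy(src, bp->b_data + off, len);	/* modify part of the block */

	bdwrite(bp);	/* mark dirty and release; the write happens later */
	return (0);
}
#endif /* 0 -- example only */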
/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(bp)
	struct buf *bp;
{
	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
}
/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;
	long whichq;
	upl_t upl;
	upl_page_info_t *pl;
	kern_return_t kret;
	int upl_flags;
	void *object;
	off_t file_offset;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
		bp->b_lblkno * PAGE_SIZE, bp, bp->b_data, bp->b_flags, 0);

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);

	/* IO is done. Cleanup the UPL state */
	if (!ISSET(bp->b_flags, B_META)
	    && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
		if ( !ISSET(bp->b_flags, B_PAGELIST)) {
			if ( !ISSET(bp->b_flags, B_INVAL)) {
				object = ubc_getobject(bp->b_vp, UBC_NOREACTIVATE);
				if (object == (void *)NULL)
					panic("vmobject for vp is null");
				if (bp->b_bufsize & 0xfff)
					panic("list request is with less than 4k");

				file_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

				kret = vm_fault_list_request(object,
					(vm_object_offset_t)file_offset, bp->b_bufsize,
					(UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS
					 | UPL_SET_INTERNAL));
				if (kret != KERN_SUCCESS)
					panic("brelse: Failed to get pagelists");
#ifdef UBC_DEBUG
				upl_ubc_alias_set(upl, bp, 5);
#endif /* UBC_DEBUG */
			} else
				upl = (upl_t) 0;
		} else {
			upl = bp->b_pagelist;
			kret = kernel_upl_unmap(kernel_map, upl);

			if (kret != KERN_SUCCESS)
				panic("kernel_upl_unmap failed");
		}

		if (upl) {
			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

			if (bp->b_flags & (B_ERROR | B_INVAL)) {
				if (bp->b_flags & (B_READ | B_INVAL))
					upl_flags = UPL_ABORT_DUMP_PAGES;
				else
					upl_flags = 0;
				kernel_upl_abort(upl, upl_flags);
			} else {
				if (ISSET(bp->b_flags, (B_DELWRI | B_WASDIRTY)))
					upl_flags = UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
				kernel_upl_commit_range(upl, 0, bp->b_bufsize,
					upl_flags
					| UPL_COMMIT_INACTIVATE,
					pl, MAX_UPL_TRANSFER);
			}

			CLR(bp->b_flags, B_PAGELIST);
		}
	} else {
		if (ISSET(bp->b_flags, B_PAGELIST))
			panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
	}

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	/* Block disk interrupts. */
	s = splbio();

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put on the head of the appropriate queue.
		 */
		if (bp->b_vp)
			brelvp(bp);
		CLR(bp->b_flags, B_DELWRI);
		if (bp->b_bufsize <= 0)
			whichq = BQ_EMPTY;	/* no data */
		else
			whichq = BQ_AGE;	/* invalid data */

		bufq = &bufqueues[whichq];
		binsheadfree(bp, bufq, whichq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			whichq = BQ_LOCKED;	/* locked in core */
		else if (ISSET(bp->b_flags, B_META))
			whichq = BQ_META;	/* meta-data */
		else if (ISSET(bp->b_flags, B_AGE))
			whichq = BQ_AGE;	/* stale but valid data */
		else
			whichq = BQ_LRU;	/* valid data */

		bufq = &bufqueues[whichq];
		binstailfree(bp, bufq, whichq);
	}

	/* Unlock the buffer. */
	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));

	/* Allow disk interrupts. */
	splx(s);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
		bp, bp->b_data, bp->b_flags, 0, 0);
}
/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;
	int bufseen = 0;

	bp = BUFHASH(vp, blkno)->lh_first;

	/* Search hash chain */
	for (; bp != NULL; bp = bp->b_hash.le_next, bufseen++) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
		if (bufseen >= nbuf)
			panic("walked more than nbuf in incore");
	}

	return (0);
}

/* XXX FIXME -- Update the comment to reflect the UBC changes -- */
/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to insure that the
 * cached blocks be of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo, operation)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo, operation;
{
	struct buf *bp;
	int s, err;
	upl_t upl;
	upl_page_info_t *pl;
	void *object;
	void *pager;
	kern_return_t kret;
	off_t file_offset;
	int error = 0;
	int pagedirty = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
		blkno * PAGE_SIZE, size, operation, 0, 0);
start:

	s = splbio();
	if (bp = incore(vp, blkno)) {
		/* Found in the Buffer Cache */
		if (ISSET(bp->b_flags, B_BUSY)) {
			/* but is busy */
			switch (operation) {
			case BLK_READ:
			case BLK_WRITE:
			case BLK_META:
				SET(bp->b_flags, B_WANTED);
				bufstats.bufs_busyincore++;
				err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
						slptimeo);
				splx(s);
				/*
				 * Callers who call with PCATCH or timeout are
				 * willing to deal with the NULL pointer
				 */
				if (err && ((slpflag & PCATCH) ||
				    ((err == EWOULDBLOCK) && slptimeo)))
					return (NULL);
				goto start;
				/* NOTREACHED */
				break;

			case BLK_PAGEIN:
				/* pagein operation must not use getblk */
				panic("getblk: pagein for incore busy buffer");
				break;

			case BLK_PAGEOUT:
				/* pageout operation must not use getblk */
				panic("getblk: pageout for incore busy buffer");
				break;

			default:
				panic("getblk: %d unknown operation 1", operation);
			}
		} else {
			/* buffer in core and not busy */
			SET(bp->b_flags, (B_BUSY | B_CACHE));
			bremfree(bp);
			bufstats.bufs_incore++;
			splx(s);

			if (ISSET(bp->b_flags, B_PAGELIST))
				panic("pagelist buffer is not busy");

			switch (operation) {
			case BLK_READ:
			case BLK_WRITE:
				if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {

					if (bp->b_bufsize & 0xfff)
						panic("list request is with less than 4k");

					object = ubc_getobject(vp, UBC_NOREACTIVATE);
					if (object == (void *)NULL)
						panic("vmobject for vp is null");

					file_offset = ubc_blktooff(vp, bp->b_lblkno);

					kret = vm_fault_list_request(object,
						(vm_object_offset_t)file_offset, bp->b_bufsize,
						(UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_PRECIOUS | UPL_SET_INTERNAL));

					if (kret != KERN_SUCCESS)
						panic("Failed to get pagelists");

					SET(bp->b_flags, B_PAGELIST);
					bp->b_pagelist = upl;

					pl = UPL_GET_INTERNAL_PAGE_LIST(upl);

					if ( !upl_valid_page(pl, 0))
						panic("getblk: incore buffer without valid page");

					if (upl_dirty_page(pl, 0))
						SET(bp->b_flags, B_WASDIRTY);
					else
						CLR(bp->b_flags, B_WASDIRTY);

					kret = kernel_upl_map(kernel_map, upl, (vm_address_t *)&(bp->b_data));
					if (kret != KERN_SUCCESS) {
						panic("getblk: kernel_upl_map() "
							"failed with (%d)", kret);
					}
					if (bp->b_data == 0) panic("kernel_upl_map mapped 0");
				}
				break;

			case BLK_META:
				/*
				 * VM is not involved in IO for the meta data
				 * buffer already has valid data
				 */
				if (bp->b_data == 0)
					panic("bp->b_data null incore buf=%x", bp);
				break;

			case BLK_PAGEIN:
			case BLK_PAGEOUT:
				panic("getblk: paging operation 1");
				break;

			default:
				panic("getblk: %d unknown operation 2", operation);
			}
		}
	} else { /* not incore() */
		int queue = BQ_EMPTY; /* Start with no preference */
		splx(s);

		if ((operation == BLK_META) || (UBCINVALID(vp)) ||
		    !(UBCINFOEXISTS(vp))) {
			operation = BLK_META;
		}
		if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
			goto start;
		/*
		 * if it is meta, the queue may be set to other
		 * type so reset as well as mark it to be B_META
		 * so that when the buffer is released it will go to the META queue.
		 * Also, if the vnode is not VREG, then it is META
		 */
		if (operation == BLK_META) {
			SET(bp->b_flags, B_META);
			queue = BQ_META;
		}

		allocbuf(bp, size);

		switch (operation) {
		case BLK_META:
			/* buffer data is invalid */

			/*
			 * Insert in the hash so that incore() can find it
			 */
			binshash(bp, BUFHASH(vp, blkno));
#if !ZALLOC_METADATA
			if (bp->b_data)
				panic("bp->b_data is not nul; %x", bp);
			kret = kmem_alloc(kernel_map,
				&bp->b_data, bp->b_bufsize);
			if (kret != KERN_SUCCESS)
				panic("getblk: kmem_alloc() returned %d", kret);
#endif /* ZALLOC_METADATA */
			if (bp->b_data == 0)
				panic("bp->b_data is null %x", bp);

			bp->b_blkno = bp->b_lblkno = blkno;

			bufstats.bufs_miss++;

			if (bp->b_data == 0)
				panic("b_data is 0: 2");

			/* wakeup the buffer */
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
			break;
1010 binshash(bp
, BUFHASH(vp
, blkno
));
1011 pager
= ubc_getpager(vp
);
1012 file_offset
= ubc_blktooff(vp
, blkno
);
1014 object
= ubc_getobject(vp
, UBC_NOREACTIVATE
);
1015 if (object
== (void *)NULL
)
1016 panic("vmobject for vp is null");
1017 if (bp
->b_bufsize
& 0xfff)
1018 panic("list request is with less than 4k");
1020 if (ISSET(bp
->b_flags
, B_PAGELIST
))
1021 panic("B_PAGELIST in bp=%x",bp
);
1023 kret
= vm_fault_list_request(object
,
1024 (vm_object_offset_t
)file_offset
, bp
->b_bufsize
,
1026 (UPL_NO_SYNC
| UPL_CLEAN_IN_PLACE
| UPL_PRECIOUS
| UPL_SET_INTERNAL
));
1028 if (kret
!= KERN_SUCCESS
)
1029 panic("Failed to get pagelists");
1032 upl_ubc_alias_set(upl
, bp
, 4);
1033 #endif /* UBC_DEBUG */
1034 bp
->b_blkno
= bp
->b_lblkno
= blkno
;
1035 bp
->b_pagelist
= upl
;
1037 SET(bp
->b_flags
, B_PAGELIST
);
1038 pl
= UPL_GET_INTERNAL_PAGE_LIST(upl
);
1040 if (upl_valid_page(pl
, 0)) {
1041 SET(bp
->b_flags
, B_CACHE
| B_DONE
);
1042 bufstats
.bufs_vmhits
++;
1044 pagedirty
= upl_dirty_page(pl
, 0);
1047 SET(bp
->b_flags
, B_WASDIRTY
);
1049 if (vp
->v_tag
== VT_NFS
) {
1056 f_offset
= ubc_blktooff(vp
, blkno
);
1058 if (f_offset
> vp
->v_ubcinfo
->ui_size
) {
1059 CLR(bp
->b_flags
, (B_CACHE
|B_DONE
|B_WASDIRTY
));
1063 valid_size
= min(((unsigned int)(vp
->v_ubcinfo
->ui_size
- f_offset
)), PAGE_SIZE
);
1064 bp
->b_validend
= valid_size
;
1067 bp
->b_dirtyend
= valid_size
;
1071 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW
, 386)) | DBG_FUNC_NONE
,
1072 bp
->b_validend
, bp
->b_dirtyend
,
1073 (int)vp
->v_ubcinfo
->ui_size
, 0, 0);
1081 bp
->b_validend
= bp
->b_bcount
;
1082 bp
->b_dirtyend
= bp
->b_bcount
;
1085 bp
->b_validend
= bp
->b_bcount
;
1089 if (error
= VOP_BMAP(vp
, bp
->b_lblkno
, NULL
, &bp
->b_blkno
, NULL
)) {
1090 panic("VOP_BMAP failed in getblk");
1093 * XXX: We probably should invalidate the VM Page
1095 bp
->b_error
= error
;
1096 SET(bp
->b_flags
, (B_ERROR
| B_INVAL
));
1097 /* undo B_DONE that was set before upl_commit() */
1098 CLR(bp
->b_flags
, B_DONE
);
1103 bufstats
.bufs_miss
++;
1105 kret
= kernel_upl_map(kernel_map
, upl
, (vm_address_t
*)&(bp
->b_data
));
1106 if (kret
!= KERN_SUCCESS
) {
1107 panic("getblk: kernel_upl_map() "
1108 "failed with (%d)", kret
);
1110 if (bp
->b_data
== 0) panic("kernel_upl_map mapped 0");
		case BLK_PAGEIN:
		case BLK_PAGEOUT:
			panic("getblk: paging operation 2");
			break;

		default:
			panic("getblk: %d unknown operation 3", operation);
		}
	}

	if (bp->b_data == NULL)
		panic("getblk: bp->b_addr is null");

	if (bp->b_bufsize & 0xfff) {
#if ZALLOC_METADATA
		if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
#endif /* ZALLOC_METADATA */
			panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
		bp, bp->b_data, bp->b_flags, 3, 0);

	return (bp);
}
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;
	int queue = BQ_EMPTY;
#if !ZALLOC_METADATA
	kern_return_t kret;
	vm_size_t desired_size = roundup(size, CLBYTES);

	if (desired_size > MAXBSIZE)
		panic("geteblk: buffer larger than MAXBSIZE requested");
#endif /* ZALLOC_METADATA */

	while ((bp = getnewbuf(0, 0, &queue)) == 0)
		;
#if ZALLOC_METADATA
	SET(bp->b_flags, (B_META|B_INVAL));
#else
	SET(bp->b_flags, B_INVAL);
#endif /* ZALLOC_METADATA */

#if DIAGNOSTIC
	assert(queue == BQ_EMPTY);
#endif /* DIAGNOSTIC */
	/* XXX need to implement logic to deal with other queues */

#if !ZALLOC_METADATA
	/* Empty buffer - allocate pages */
	kret = kmem_alloc_aligned(kernel_map, &bp->b_data, desired_size);
	if (kret != KERN_SUCCESS)
		panic("geteblk: kmem_alloc_aligned returned %d", kret);
#endif /* ZALLOC_METADATA */

	binshash(bp, &invalhash);
	allocbuf(bp, size);
	bufstats.bufs_eblk++;

	return (bp);
}
#if ZALLOC_METADATA
/*
 * Zones for the meta data buffers
 */

#define MINMETA	512
#define MAXMETA	4096

struct meta_zone_entry {
	zone_t		mz_zone;
	vm_size_t	mz_size;
	vm_size_t	mz_max;
	char		*mz_name;
};

struct meta_zone_entry meta_zones[] = {
	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
	{NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
	{NULL, (MINMETA * 3), 16 * (MINMETA * 3), "buf.1536" },
	{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
	{NULL, (MINMETA * 5), 16 * (MINMETA * 5), "buf.2560" },
	{NULL, (MINMETA * 6), 16 * (MINMETA * 6), "buf.3072" },
	{NULL, (MINMETA * 7), 16 * (MINMETA * 7), "buf.3584" },
	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
	{NULL, 0, 0, "" } /* End */
};

/*
 * Initialize the meta data zones
 */
static void
bufzoneinit(void)
{
	int i;

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		meta_zones[i].mz_zone =
			zinit(meta_zones[i].mz_size,
				meta_zones[i].mz_max,
				PAGE_SIZE,
				meta_zones[i].mz_name);
	}
}

static zone_t
getbufzone(size_t size)
{
	int i;

	if (size % 512)
		panic("getbufzone: incorect size = %d", size);

	i = (size / 512) - 1;
	return (meta_zones[i].mz_zone);
}
#endif /* ZALLOC_METADATA */
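/*
 * Illustrative sketch (not part of the original file): how a meta-data
 * buffer size maps onto one of the zones above.  A request is rounded up
 * to a multiple of MINMETA (512 bytes) and then indexes meta_zones[]
 * directly.  The function name is hypothetical.
 */
#if 0
static void
example_zone_for_size(void)
{
	size_t request = 1300;				/* arbitrary meta-data size */
	size_t nsize = roundup(request, MINMETA);	/* -> 1536 */
	zone_t z = getbufzone(nsize);			/* meta_zones[(1536/512)-1], i.e. "buf.1536" */

	(void) z;
}
#endif /* 0 -- example only */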
/*
 * With UBC, there is no need to expand / shrink the file data
 * buffer. The VM uses the same pages, hence no waste.
 * All the file data buffers can have one size.
 * In fact expand / shrink would be an expensive operation.
 *
 * Only exception to this is meta-data buffers. Most of the
 * meta data operations are smaller than PAGE_SIZE. Having the
 * meta-data buffers grow and shrink as needed, optimizes use
 * of the kernel wired memory.
 */
int
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	vm_size_t desired_size;

	desired_size = roundup(size, CLBYTES);

	if (desired_size < PAGE_SIZE)
		desired_size = PAGE_SIZE;
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

#if ZALLOC_METADATA
	if (ISSET(bp->b_flags, B_META)) {
		kern_return_t kret;
		zone_t zprev, z;
		size_t nsize = roundup(size, MINMETA);

		if (bp->b_data) {
			vm_offset_t elem = (vm_offset_t)bp->b_data;

			if (ISSET(bp->b_flags, B_ZALLOC)) {
				if (bp->b_bufsize <= MAXMETA) {
					if (bp->b_bufsize < nsize) {
						/* reallocate to a bigger size */
						desired_size = nsize;

						zprev = getbufzone(bp->b_bufsize);
						z = getbufzone(nsize);
						bp->b_data = (caddr_t)zalloc(z);
						if (!bp->b_data)
							panic("allocbuf: zalloc() returned NULL");
						bcopy(elem, bp->b_data, bp->b_bufsize);
						zfree(zprev, elem);
					} else {
						desired_size = bp->b_bufsize;
					}
				} else
					panic("allocbuf: B_ZALLOC set incorrectly");
			} else if (bp->b_bufsize < desired_size) {
				/* reallocate to a bigger size */
				kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
				if (kret != KERN_SUCCESS)
					panic("allocbuf: kmem_alloc() returned %d", kret);
				if (!bp->b_data)
					panic("allocbuf: null b_data");
				bcopy(elem, bp->b_data, bp->b_bufsize);
				kmem_free(kernel_map, elem, bp->b_bufsize);
			} else {
				desired_size = bp->b_bufsize;
			}
		} else {
			/* new allocation */
			if (nsize <= MAXMETA) {
				desired_size = nsize;
				z = getbufzone(nsize);
				bp->b_data = (caddr_t)zalloc(z);
				if (!bp->b_data)
					panic("allocbuf: zalloc() returned NULL 2");
				SET(bp->b_flags, B_ZALLOC);
			} else {
				kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
				if (kret != KERN_SUCCESS)
					panic("allocbuf: kmem_alloc() 2 returned %d", kret);
				if (!bp->b_data)
					panic("allocbuf: null b_data 2");
			}
		}
	}

	if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
		panic("allocbuf: bp->b_data is NULL");
#endif /* ZALLOC_METADATA */

	bp->b_bufsize = desired_size;
	bp->b_bcount = size;
	return (0);
}
/*
 * Get a new buffer from one of the free lists.
 *
 * The request for a queue is passed in.  The queue from which the buffer
 * was taken is returned.  Out of range queue requests get BQ_EMPTY.
 * A request for BQUEUES means no preference; use heuristics in that case.
 * The heuristics are as follows:
 *	Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
 *	If none is available, block till one is made available.
 *	If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
 *	Pick the most stale buffer.
 *	If the found buffer was marked delayed write, start the async. write
 *	and restart the search.
 *	Initialize the fields and disassociate the buffer from the vnode.
 *	Remove the buffer from the hash.  Return the buffer and the queue
 *	on which it was found.
 */
static struct buf *
getnewbuf(slpflag, slptimeo, queue)
	int slpflag, slptimeo;
	int *queue;
{
	register struct buf *bp;
	register struct buf *lru_bp;
	register struct buf *age_bp;
	register struct buf *meta_bp;
	register int age_time, lru_time, bp_time, meta_time;
	int s;
	int req = *queue;	/* save it for restarts */

start:
	s = splbio();

	/* invalid request gets empty queue */
	if ((*queue > BQUEUES) || (*queue < 0))
		*queue = BQ_EMPTY;

	/* (*queue == BQUEUES) means no preference */
	if (*queue != BQUEUES) {
		/* Try for the requested queue first */
		bp = bufqueues[*queue].tqh_first;
		if (bp)
			goto found;
	}

	/* Unable to use requested queue */
	age_bp = bufqueues[BQ_AGE].tqh_first;
	lru_bp = bufqueues[BQ_LRU].tqh_first;
	meta_bp = bufqueues[BQ_META].tqh_first;

	if (!age_bp && !lru_bp && !meta_bp) {	/* Unavailable on AGE or LRU */
		/* Try the empty list first */
		bp = bufqueues[BQ_EMPTY].tqh_first;
		if (bp) {
			*queue = BQ_EMPTY;
			goto found;
		}
#if DIAGNOSTIC
		/* with UBC this is a fatal condition */
		panic("getnewbuf: No useful buffers");
#else
		/* Log this error condition */
		printf("getnewbuf: No useful buffers");
#endif /* DIAGNOSTIC */

		/* wait for a free buffer of any kind */
		needbuffer = 1;
		bufstats.bufs_sleeps++;
		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* Buffer available either on AGE or LRU or META */
	bp = NULL;
	*queue = -1;

	/* Buffer available either on AGE or LRU */
	if (!age_bp) {
		bp = lru_bp;
		*queue = BQ_LRU;
	} else if (!lru_bp) {
		bp = age_bp;
		*queue = BQ_AGE;
	} else {	/* buffer available on both AGE and LRU */
		age_time = time.tv_sec - age_bp->b_timestamp;
		lru_time = time.tv_sec - lru_bp->b_timestamp;
		if ((age_time < 0) || (lru_time < 0)) {	/* time set backwards */
			bp = age_bp;
			*queue = BQ_AGE;
			/*
			 * we should probably re-timestamp everything in the
			 * queues at this point with the current time
			 */
		} else {
			if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
				bp = lru_bp;
				*queue = BQ_LRU;
			} else {
				bp = age_bp;
				*queue = BQ_AGE;
			}
		}
	}

	if (!bp) {	/* Neither on AGE nor on LRU */
		bp = meta_bp;
		*queue = BQ_META;
	} else if (meta_bp) {
		bp_time = time.tv_sec - bp->b_timestamp;
		meta_time = time.tv_sec - meta_bp->b_timestamp;

		if (!(bp_time < 0) && !(meta_time < 0)) {
			/* time not set backwards */
			int bp_is_stale;
			bp_is_stale = (*queue == BQ_LRU) ?
					lru_is_stale : age_is_stale;

			if ((meta_time >= meta_is_stale) &&
					(bp_time < bp_is_stale)) {
				bp = meta_bp;
				*queue = BQ_META;
			}
		}
	}

	if (bp == NULL)
		panic("getnewbuf: null bp");

found:
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("getnewbuf: le_prev is deadbeef");

	if (ISSET(bp->b_flags, B_BUSY))
		panic("getnewbuf reusing BUSY buf");

	/* Clean it */
	if (bcleanbuf(bp)) {
		/* bawrite() issued, buffer not ready */
		splx(s);
		*queue = req;
		goto start;
	}
	splx(s);
	return (bp);
}
#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

/*
 * Clean a buffer.
 * Returns 0 if buffer is ready to use,
 * Returns 1 if issued a bawrite() to indicate
 * that the buffer is not ready.
 */
static int
bcleanbuf(struct buf *bp)
{
	int s;
	struct ucred *cred;

	s = splbio();

	/* Remove from the queue */
	bremfree(bp);

	/* Buffer is no longer on free lists. */
	SET(bp->b_flags, B_BUSY);

	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bcleanbuf: le_prev is deadbeef");

	/* If buffer was a delayed write, start it, and return 1 */
	if (ISSET(bp->b_flags, B_DELWRI)) {
		splx(s);
		bawrite(bp);
		return (1);
	}

	splx(s);

	if (ISSET(bp->b_flags, B_META)) {
#if ZALLOC_METADATA
		vm_offset_t elem = (vm_offset_t)bp->b_data;
		if (elem == 0)
			panic("bcleanbuf: NULL bp->b_data B_META buffer");

		if (ISSET(bp->b_flags, B_ZALLOC)) {
			if (bp->b_bufsize <= MAXMETA) {
				zone_t z;

				z = getbufzone(bp->b_bufsize);
				bp->b_data = (caddr_t)0xdeadbeef;
				zfree(z, elem);
				CLR(bp->b_flags, B_ZALLOC);
			} else
				panic("bcleanbuf: B_ZALLOC set incorrectly");
		} else {
			bp->b_data = (caddr_t)0xdeadbeef;
			kmem_free(kernel_map, elem, bp->b_bufsize);
		}
#else
		if (bp->b_data == 0)
			panic("bcleanbuf: bp->b_data == NULL for B_META buffer");

		kmem_free(kernel_map, bp->b_data, bp->b_bufsize);
#endif /* ZALLOC_METADATA */
	}

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);

	/* disassociate us from our vnode, if we had one... */
	if (bp->b_vp)
		brelvp(bp);

	/* clear out various other fields */
	bp->b_flags = B_BUSY;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;

	/* nuke any credentials we were holding */
	cred = bp->b_rcred;
	if (cred != NOCRED) {
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	cred = bp->b_wcred;
	if (cred != NOCRED) {
		bp->b_wcred = NOCRED;
		crfree(cred);
	}

	return (0);
}
/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
	struct buf *bp;
{
	upl_page_info_t *pl;
	int s;

	s = splbio();
	while (!ISSET(bp->b_flags, B_DONE))
		tsleep(bp, PRIBIO + 1, "biowait", 0);
	splx(s);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		return (EINTR);
	} else if (ISSET(bp->b_flags, B_ERROR))
		return (bp->b_error ? bp->b_error : EIO);
	else
		return (0);
}
/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(bp)
	struct buf *bp;
{
	boolean_t funnel_state;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
		bp, bp->b_data, bp->b_flags, 0, 0);

	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */
	/*
	 * I/O was done, so don't believe
	 * the DIRTY state from VM anymore
	 */
	CLR(bp->b_flags, B_WASDIRTY);

	if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
		vwakeup(bp);	/* wake up reader */

	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		(*bp->b_iodone)(bp);
	} else if (ISSET(bp->b_flags, B_ASYNC))	/* if async, release it */
		brelse(bp);
	else {					/* or just wakeup the buffer */
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
		bp, bp->b_data, bp->b_flags, 0, 0);

	thread_funnel_set(kernel_flock, funnel_state);
}
/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue()
{
	register struct buf *bp;
	register int n = 0;

	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
	     bp = bp->b_freelist.tqe_next)
		n++;
	return (n);
}
/*
 * Return a count of 'busy' buffers. Used at the time of shutdown.
 */
int
count_busy_buffers()
{
	register struct buf *bp;
	register int nbusy = 0;

	for (bp = &buf[nbuf]; --bp >= buf; )
		if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
			nbusy++;
	return (nbusy);
}
#if 1 /*DIAGNOSTIC */
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
	int i, j, count;
	register struct buf *bp;
	register struct bqueues *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */
struct buf *
alloc_io_buf(vp)
	struct vnode *vp;
{
	register struct buf *bp;
	int s;

	s = splbio();

	while ((bp = iobufqueue.tqh_first) == NULL) {
		need_iobuffer = 1;
		bufstats.bufs_iobufsleeps++;
		tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
	}
	TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
	bp->b_timestamp = 0;

	/* clear out various fields */
	bp->b_flags = B_BUSY;
	bp->b_blkno = bp->b_lblkno = 0;

	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	bufstats.bufs_iobufinuse++;
	if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
		bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
	splx(s);

	return (bp);
}
void
free_io_buf(bp)
	struct buf *bp;
{
	int s;

	s = splbio();
	/* put buffer back on the head of the iobufqueue */
	bp->b_flags = B_INVAL;

	binsheadfree(bp, &iobufqueue, -1);

	/* Wake up any processes waiting for any buffer to become free. */
	if (need_iobuffer) {
		need_iobuffer = 0;
		wakeup(&need_iobuffer);
	}
	bufstats.bufs_iobufinuse--;
	splx(s);
}
1801 /* XXX move this to a separate file */
1803 * Dynamic Scaling of the Buffer Queues
1806 typedef long long blsize_t
;
1808 blsize_t MAXNBUF
; /* initialize to (mem_size / PAGE_SIZE) */
1809 /* Global tunable limits */
1810 blsize_t nbufh
; /* number of buffer headers */
1811 blsize_t nbuflow
; /* minimum number of buffer headers required */
1812 blsize_t nbufhigh
; /* maximum number of buffer headers allowed */
1813 blsize_t nbuftarget
; /* preferred number of buffer headers */
1818 * 1. 0 < nbuflow <= nbufh <= nbufhigh
1819 * 2. nbufhigh <= MAXNBUF
1820 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
1821 * 4. nbufh can not be set by sysctl().
1824 /* Per queue tunable limits */
1827 blsize_t bl_nlow
; /* minimum number of buffer headers required */
1828 blsize_t bl_num
; /* number of buffer headers on the queue */
1829 blsize_t bl_nlhigh
; /* maximum number of buffer headers allowed */
1830 blsize_t bl_target
; /* preferred number of buffer headers */
1831 long bl_stale
; /* Seconds after which a buffer is considered stale */
1837 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
1838 * 2. bl_nlhigh <= MAXNBUF
1839 * 3. bufqlim[BQ_META].bl_nlow != 0
1840 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
1841 * file system IO operations)
1842 * 5. bl_num can not be set by sysctl().
1843 * 6. bl_nhigh <= nbufhigh
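/*
 * Illustrative sketch (not part of the original file): a debug-style check
 * of the per-queue invariants listed above, written against the bufqlim[]
 * array declared just before this comment.  The function name is
 * hypothetical; nothing in this file calls it.
 */
#if 0
static void
example_check_bufqlim(void)
{
	int q;

	for (q = 0; q < BQUEUES; q++) {
		/* 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh */
		if (bufqlim[q].bl_nlow < 0 ||
		    bufqlim[q].bl_nlow > bufqlim[q].bl_num ||
		    bufqlim[q].bl_num > bufqlim[q].bl_nlhigh)
			panic("bufqlim: queue %d violates bl_nlow <= bl_num <= bl_nlhigh", q);
		/* 2. bl_nlhigh <= MAXNBUF */
		if (bufqlim[q].bl_nlhigh > MAXNBUF)
			panic("bufqlim: queue %d bl_nlhigh exceeds MAXNBUF", q);
	}
	/* 3. the META queue must keep a non-zero reserve */
	if (bufqlim[BQ_META].bl_nlow == 0)
		panic("bufqlim: BQ_META bl_nlow must be non-zero");
}
#endif /* 0 -- example only */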
/*
 * Defining blsize_t as a long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * The low and high water parameters are initialized at compile time,
 * and boot arguments can be used to override them. sysctl()
 * would not change the value. sysctl() can get all the values
 * but can set only the target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 *	Keep enough bufs on BQ_EMPTY.
 *	getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *	getnewbuf() performs best if a buffer was found there.
 *	Also this minimizes the possibility of starting IO
 *	from getnewbuf(). That's a performance win, too.
 *
 *	Localize complex logic [balancing as well as time aging]
 *	in one place.
 *
 *	Simplify getnewbuf() logic by elimination of time aging code.
 */

/*
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be:  AGE, LRU, META, EMPTY.
 */

long bufqscanwait = 0;

extern void bufqscan_thread();
extern int balancebufq(int q);
extern int btrimempty(int n);
extern int initbufqscan(void);
extern int nextbufq(int q);
extern void buqlimprt(int all);
/*
 * Initialize the buffer queue limits and start the
 * dynamic buffer queue balancing thread.
 */
void
bufq_balance_thread_init()
{

	if (bufqscanwait++ == 0) {

		/* Initialize globals */
		MAXNBUF = (mem_size / PAGE_SIZE);
		nbufh = nbuf;
		nbuflow = min(nbufh, 100);
		nbufhigh = min(MAXNBUF, max(nbufh, 2048));
		nbuftarget = (mem_size >> 5) / PAGE_SIZE;
		nbuftarget = max(nbuflow, nbuftarget);
		nbuftarget = min(nbufhigh, nbuftarget);

		/*
		 * Initialize the bufqlim
		 */

		/* LOCKED queue */
		bufqlim[BQ_LOCKED].bl_nlow = 0;
		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
		bufqlim[BQ_LOCKED].bl_target = 0;
		bufqlim[BQ_LOCKED].bl_stale = 30;

		/* LRU queue */
		bufqlim[BQ_LRU].bl_nlow = 0;
		bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_LRU].bl_target = nbuftarget/4;
		bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;

		/* AGE queue */
		bufqlim[BQ_AGE].bl_nlow = 0;
		bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_AGE].bl_target = nbuftarget/4;
		bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;

		/* EMPTY queue */
		bufqlim[BQ_EMPTY].bl_nlow = 0;
		bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
		bufqlim[BQ_EMPTY].bl_stale = 600000;

		/* META queue */
		bufqlim[BQ_META].bl_nlow = 0;
		bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_META].bl_target = nbuftarget/4;
		bufqlim[BQ_META].bl_stale = META_IS_STALE;
	}

	/* create worker thread */
	kernel_thread(kernel_task, bufqscan_thread);
}
1958 boolean_t funnel_state
;
1961 funnel_state
= thread_funnel_set(kernel_flock
, TRUE
);
1965 int q
; /* buffer queue to process */
1967 for (q
= initbufqscan(); q
; ) {
1968 moretodo
|= balancebufq(q
);
1977 (void)tsleep((void *)&bufqscanwait
, PRIBIO
, "bufqscanwait", 60 * hz
);
1981 (void) thread_funnel_set(kernel_flock
, FALSE
);
1984 /* Seed for the buffer queue balancing */
1988 /* Start with AGE queue */
1992 /* Pick next buffer queue to balance */
1996 int order
[] = { BQ_AGE
, BQ_LRU
, BQ_META
, BQ_EMPTY
, 0 };
/* function to balance the buffer queues */
int
balancebufq(int q)
{
	int moretodo = 0;
	int s = splbio();
	int n;

	/* reject invalid q */
	if ((q < 0) || (q >= BQUEUES))
		goto out;

	/* LOCKED queue MUST not be balanced */
	if (q == BQ_LOCKED)
		goto out;

	n = (bufqlim[q].bl_num - bufqlim[q].bl_target);

	/* If queue has less than target nothing more to do */
	if (n < 0)
		goto out;

	if (n > 8) {
		/* Balance only a small amount (12.5%) at a time */
		n >>= 3;
	}

	/* EMPTY queue needs special handling */
	if (q == BQ_EMPTY) {
		moretodo |= btrimempty(n);
		goto out;
	}

	for (; n > 0; n--) {
		struct buf *bp = bufqueues[q].tqh_first;
		if (!bp)
			break;

		/* check if it's stale */
		if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) {
			if (bcleanbuf(bp)) {
				/* bawrite() issued, bp not ready */
				moretodo = 1;
			} else {
				/* release the cleaned buffer to BQ_EMPTY */
				SET(bp->b_flags, B_INVAL);
				brelse(bp);
			}
		} else
			break;
	}

out:
	splx(s);
	return (moretodo);
}
/*
 * When struct buf are allocated dynamically, this would
 * reclaim up to 'n' struct buf from the empty queue.
 * Nothing to trim while buffer headers are statically allocated.
 */
int
btrimempty(int n)
{
	return (0);
}

void
bufqinc(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num++;
	return;
}

void
bufqdec(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num--;
	return;
}
void
buqlimprt(int all)
{
	int i;
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META" };

	if (all)
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("min = %d, ", (long)bufqlim[i].bl_nlow);
			printf("cur = %d, ", (long)bufqlim[i].bl_num);
			printf("max = %d, ", (long)bufqlim[i].bl_nlhigh);
			printf("target = %d, ", (long)bufqlim[i].bl_target);
			printf("stale after %d seconds\n", bufqlim[i].bl_stale);
		}
	else
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("cur = %d, ", (long)bufqlim[i].bl_num);
		}
}