/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	The NEXTSTEP Software License Agreement specifies the terms
 *	and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */
/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc.h>
#include <vm/vm_pageout.h>
#if DIAGNOSTIC
#include <kern/assert.h>
#endif /* DIAGNOSTIC */
#include <kern/task.h>
#include <kern/zalloc.h>

#include <sys/kdebug.h>
#include <machine/spl.h>
static __inline__ void bufqinc(int q);
static __inline__ void bufqdec(int q);

static int do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
        int *rasizes, int nrablks, struct ucred *cred, struct buf **bpp, int queuetype);
static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
static int bcleanbuf(struct buf *bp);
static int brecover_data(struct buf *bp);
extern void vwakeup();

extern int niobuf;	/* The number of IO buffer headers for cluster IO */
int blaundrycnt = 0;	/* number of buffers on the LAUNDRY queue */
/* zone allocated buffer headers */
static zone_t buf_hdr_zone;
static int buf_hdr_count;

struct proc *traceproc;
int tracewhich, tracebuf[TRCSIZ];
char traceflags[TR_NFLAGS];
/*
 * Definitions for the buffer hash lists.
 */
#define BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;

/* Definitions for the buffer stats. */
struct bufstats bufstats;

/* Number of delayed write buffers */
int nbdwrite = 0;
/*
 * Insq/Remq for the buffer hash lists.
 */
#if 0
#define binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define bremhash(bp)		LIST_REMOVE(bp, b_hash)
#endif /* 0 */
TAILQ_HEAD(ioqueue, buf) iobufqueue;
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
static int needbuffer;
static int need_iobuffer;
/*
 * Insq/Remq for the buffer free lists.
 */
#define binsheadfree(bp, dp, whichq)	do { \
		TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
		(bp)->b_whichq = whichq; \
		(bp)->b_timestamp = time.tv_sec; \
	} while (0)

#define binstailfree(bp, dp, whichq)	do { \
		TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
		(bp)->b_whichq = whichq; \
		(bp)->b_timestamp = time.tv_sec; \
	} while (0)
#define BHASHENTCHECK(bp)	\
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)	\
		panic("%x: b_hash.le_prev is not deadbeef", (bp));

#define BLISTNONE(bp)	\
	(bp)->b_hash.le_next = (struct buf *)0;	\
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
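
/*
 * Illustrative sketch (hypothetical helper, not used by the code in this
 * file): the 0xdeadbeef value stored in b_hash.le_prev by BLISTNONE() acts
 * as a sentinel meaning "this buffer is not on any hash chain", and
 * BHASHENTCHECK() panics when the sentinel is missing.  The helper below
 * only restates that convention for illustration.
 */
static __inline__ int
buf_hash_sentinel_sketch(struct buf *bp)
{
	/* non-zero when bp is believed to be off every hash chain */
	return (bp->b_hash.le_prev == (struct buf **)0xdeadbeef);
}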
/*
 * Insq/Remq for the vnode usage lists.
 */
#define bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) {	\
	LIST_REMOVE(bp, b_vnbufs);	\
	(bp)->b_vnbufs.le_next = NOLIST;	\
}

simple_lock_data_t bufhashlist_slock;	/* lock on buffer hash list */

/* number of per vnode, "in flight" buffer writes */
#define BUFWRITE_THROTTLE 9
/*
 * Time in seconds before a buffer on a list is
 * considered as a stale buffer
 */
#define LRU_IS_STALE 120	/* default value for the LRU */
#define AGE_IS_STALE 60		/* default value for the AGE */
#define META_IS_STALE 180	/* default value for the BQ_META */

int lru_is_stale = LRU_IS_STALE;
int age_is_stale = AGE_IS_STALE;
int meta_is_stale = META_IS_STALE;
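
/*
 * Illustrative sketch (hypothetical helper, not used by the code in this
 * file): a buffer's age is measured by comparing the current time against
 * the b_timestamp recorded when the buffer was placed on a free list (see
 * binsheadfree()/binstailfree() above).  getnewbuf() and the queue
 * balancing code below apply this same test with lru_is_stale,
 * age_is_stale or meta_is_stale as the limit.
 */
static __inline__ int
buf_is_stale_sketch(struct buf *bp, int stale_after)
{
	return ((time.tv_sec - bp->b_timestamp) > stale_after);
}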
/* LIST_INSERT_HEAD() with assertions */
static __inline__ void
blistenterhead(struct bufhashhdr *head, struct buf *bp)
{
	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
	(head)->lh_first = bp;
	bp->b_hash.le_prev = &(head)->lh_first;
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("blistenterhead: le_prev is deadbeef");
}
static __inline__ void
binshash(struct buf *bp, struct bufhashhdr *dp)
{
	struct buf *nbp;
	struct buf *bad;

	simple_lock(&bufhashlist_slock);

	if ((bad = incore(bp->b_vp, bp->b_lblkno)))
		panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);

	nbp = dp->lh_first;
	for (; nbp != NULL; nbp = nbp->b_hash.le_next) {
		if (nbp == bp)
			panic("buf already in hashlist");
	}

	blistenterhead(dp, bp);
	simple_unlock(&bufhashlist_slock);
}
static __inline__ void
bremhash(struct buf *bp)
{
	simple_lock(&bufhashlist_slock);
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bremhash le_prev is deadbeef");
	if (bp->b_hash.le_next == bp)
		panic("bremhash: next points to self");

	if (bp->b_hash.le_next != NULL)
		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
	simple_unlock(&bufhashlist_slock);
}
/*
 * Remove a buffer from the free list it's on
 */
void
bremfree(bp)
	struct buf *bp;
{
	struct bqueues *dp = NULL;
	int whichq;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
	whichq = bp->b_whichq;
	bufqdec(whichq);
}
/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}
/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL vp");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	bp->b_vp = (struct vnode *) 0;
}
/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	register struct buflists *listheadp;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (ISSET(bp->b_flags, B_DELWRI))
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	bufinsvn(bp, listheadp);
}
static __inline__ void
bufhdrinit(struct buf *bp)
{
	bzero((char *)bp, sizeof *bp);

	bp->b_rcred = NOCRED;
	bp->b_wcred = NOCRED;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_flags = B_INVAL;
}
/*
 * Initialize buffers and hash links for buffers.
 */
__private_extern__ void
bufinit()
{
	register struct buf *bp;
	register struct bqueues *dp;
	register int i;
	int metabuf;
	long whichq;
	static void bufzoneinit();
	static void bcleanbuf_thread_init();

	/* Initialize the buffer queues ('freelists') and the hash table */
	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);

	simple_lock_init(&bufhashlist_slock);

	metabuf = nbuf/8;	/* reserved for meta buf */

	/* Initialize the buffer headers */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bufhdrinit(bp);

		/*
		 * metabuf buffer headers on the meta-data list and
		 * rest of the buffer headers on the empty list
		 */
		if (i < metabuf)
			whichq = BQ_META;
		else
			whichq = BQ_EMPTY;

		dp = &bufqueues[whichq];
		binsheadfree(bp, dp, whichq);
		binshash(bp, &invalhash);
	}

	for (; i < nbuf + niobuf; i++) {
		bp = &buf[i];
		bufhdrinit(bp);
		binsheadfree(bp, &iobufqueue, -1);
	}

	printf("using %d buffer headers and %d cluster IO buffer headers\n",
		nbuf, niobuf);

	/* Set up zones used by the buffer cache */
	bufzoneinit();

	/* start the bcleanbuf() thread */
	bcleanbuf_thread_init();

	{
		static void bufq_balance_thread_init();
		/* create a thread to do dynamic buffer queue balancing */
		bufq_balance_thread_init();
	}
}
static struct buf *
bio_doread(vp, blkno, size, cred, async, queuetype)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	int async;
	int queuetype;
{
	register struct buf *bp;
	struct proc *p = current_proc();

	bp = getblk(vp, blkno, size, 0, 0, queuetype);

	/*
	 * If buffer does not have data valid, start a read.
	 * Note that if buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		/* Start I/O for the buffer (keeping credentials). */

		SET(bp->b_flags, B_READ | async);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			/*
			 * NFS has embedded ucred.
			 * Can not crhold() here as that causes zone corruption
			 */
			bp->b_rcred = crdup(cred);
		}

		VOP_STRATEGY(bp);

		trace(TR_BREADMISS, pack(vp, size), blkno);

		/* Pay for the read. */
		p->p_stats->p_ru.ru_inblock++;		/* XXX */
	}

	trace(TR_BREADHIT, pack(vp, size), blkno);

	return (bp);
}
/*
 * Read a disk block.
 * This algorithm described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
/*
 * Read a disk block. [bread() for meta-data]
 * This algorithm described in Bach (p.54).
 */
int
meta_bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
}
/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * [breadn() for meta-data]
 */
int
meta_breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
}
/*
 * Perform the reads for breadn() and meta_breadn().
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
static int
do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes,
	int nrablks, struct ucred *cred, struct buf **bpp, int queuetype)
{
	register struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}
/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}
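
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * a file system reads a logical block through bread() and must brelse()
 * the buffer when it is done with the data.  The vnode, block number and
 * block size are stand-ins supplied by the caller.
 */
static __inline__ int
bread_usage_sketch(struct vnode *vp, daddr_t lbn, int blksize)
{
	struct buf *bp;
	int error;

	if ((error = bread(vp, lbn, blksize, NOCRED, &bp))) {
		brelse(bp);		/* the error path still owns the buffer */
		return (error);
	}
	/* ... examine bp->b_data here ... */
	brelse(bp);			/* give the buffer back to the cache */
	return (0);
}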
/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, sync, wasdelayed;
	struct proc *p = current_proc();
	struct vnode *vp = bp->b_vp;

	if (bp->b_data == 0) {
		if (brecover_data(bp) == 0)
			return (0);
	}
	/* Remember buffer type, to switch on it later. */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
	if (wasdelayed) {
		nbdwrite--;
		wakeup((caddr_t)&nbdwrite);
	}

	if (!sync) {
		/*
		 * If not synchronous, pay for the I/O operation and make
		 * sure the buf is on the correct vnode queue.  We have
		 * to do this now, because if we don't, the vnode may not
		 * be properly notified that its I/O has completed.
		 */
		if (wasdelayed)
			reassignbuf(bp, vp);
		else
			p->p_stats->p_ru.ru_oublock++;		/* XXX */
	}
	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);

	/* Initiate disk write.  Make sure the appropriate party is charged. */
	SET(bp->b_flags, B_WRITEINPROG);
	vp->v_numoutput++;

	VOP_STRATEGY(bp);

	if (sync) {
		/*
		 * If I/O was synchronous, wait for it to complete.
		 */
		rv = biowait(bp);

		/*
		 * Pay for the I/O operation, if it's not been paid for, and
		 * make sure it's on the correct vnode queue. (async operations
		 * were paid for above.)
		 */
		if (wasdelayed)
			reassignbuf(bp, vp);
		else
			p->p_stats->p_ru.ru_oublock++;		/* XXX */

		/* Release the buffer. */
		// XXXdbg - only if the unused bit is set
		if (!ISSET(bp->b_flags, B_NORELSE)) {
			brelse(bp);
		} else {
			CLR(bp->b_flags, B_NORELSE);
		}

		return (rv);
	} else {
		return (0);
	}
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 *
 * Note: With the ability to allocate additional buffer
 * headers, we can get in to the situation where "too" many
 * bdwrite()s can create a situation where the kernel can create
 * buffers faster than the disks can service. Doing a bawrite() in
 * cases where we have "too many" outstanding bdwrite()s avoids that.
 */
__private_extern__ int
bdwrite_internal(bp, return_error)
	struct buf *bp;
	int return_error;
{
	struct proc *p = current_proc();
	struct vnode *vp = bp->b_vp;

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		p->p_stats->p_ru.ru_oublock++;		/* XXX */
		nbdwrite++;
		reassignbuf(bp, vp);
	}

	/* If this is a tape block, write the block now. */
	if (ISSET(bp->b_flags, B_TAPE)) {
		VOP_BWRITE(bp);
		return (0);
	}

	/*
	 * If the vnode has "too many" write operations in progress
	 * wait for them to finish the IO
	 */
	while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
		vp->v_flag |= VTHROTTLED;
		(void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
	}

	/*
	 * If we have too many delayed write buffers,
	 * more than we can "safely" handle, just fall back to
	 * doing the async write
	 */
	if (nbdwrite < 0)
		panic("bdwrite: Negative nbdwrite");

	// can't do a bawrite() if the LOCKED bit is set because the
	// buffer is part of a transaction and can't go to disk until
	// the LOCKED bit is cleared.
	if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
		if (return_error)
			return (EAGAIN);
		else
			bawrite(bp);
		return (0);
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->b_flags, B_DONE);
	brelse(bp);
	return (0);
}

void
bdwrite(bp)
	struct buf *bp;
{
	(void) bdwrite_internal(bp, 0);
}
/*
 * Asynchronous block write; just an asynchronous bwrite().
 *
 * Note: With the ability to allocate additional buffer
 * headers, we can get in to the situation where "too" many
 * bawrite()s can create a situation where the kernel can create
 * buffers faster than the disks can service.
 * We limit the number of "in flight" writes a vnode can have to
 * avoid this.
 */
static int
bawrite_internal(bp, throttle)
	struct buf *bp;
	int throttle;
{
	struct vnode *vp = bp->b_vp;

	if (vp) {
		/*
		 * If the vnode has "too many" write operations in progress
		 * wait for them to finish the IO
		 */
		while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
			if (throttle) {
				vp->v_flag |= VTHROTTLED;
				(void)tsleep((caddr_t)&vp->v_numoutput,
					PRIBIO + 1, "bawrite", 0);
			} else
				return (EWOULDBLOCK);
		}
	}

	SET(bp->b_flags, B_ASYNC);

	return (VOP_BWRITE(bp));
}

void
bawrite(bp)
	struct buf *bp;
{
	(void) bawrite_internal(bp, 1);
}
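
/*
 * Illustrative sketch (hypothetical helper, not used by the code above):
 * the write paths above choose between a delayed write and an asynchronous
 * write.  bdwrite_internal() falls back to bawrite() once the number of
 * outstanding delayed writes climbs past three quarters of the buffer
 * headers, unless the buffer is LOCKED as part of a transaction.
 */
static __inline__ int
should_fall_back_to_bawrite_sketch(struct buf *bp)
{
	return (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf / 4) * 3));
}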
/*
 * bwillwrite:
 *
 * Called prior to the locking of any vnodes when we are expecting to
 * write.  We do not want to starve the buffer cache with too many
 * dirty buffers so we block here.  By blocking prior to the locking
 * of any vnodes we attempt to avoid the situation where a locked vnode
 * prevents the various system daemons from flushing related buffers.
 */
void
bwillwrite(void)
{
	/* XXX To be implemented later */
}
/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;
	long whichq;
	upl_t upl;
	int upl_flags;
	kern_return_t kret;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
		bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
		bp->b_flags, 0);

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);

	// if we're invalidating a buffer that has the B_CALL bit
	// set then call the b_iodone function so it gets cleaned
	// up properly.
	//
	if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
		if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
			panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
		}
		if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
			void (*iodone_func)(struct buf *) = bp->b_iodone;

			CLR(bp->b_flags, B_CALL);	/* but note callout done */
			bp->b_iodone = NULL;

			if (iodone_func == NULL) {
				panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
			}
			(*iodone_func)(bp);
		}
	}

	/* IO is done. Cleanup the UPL state */
	if (!ISSET(bp->b_flags, B_META)
	    && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {

		if ( !ISSET(bp->b_flags, B_PAGELIST)) {
			if ( !ISSET(bp->b_flags, B_INVAL)) {
				kret = ubc_create_upl(bp->b_vp,
					ubc_blktooff(bp->b_vp, bp->b_lblkno),
					bp->b_bufsize,
					&upl,
					NULL,
					UPL_PRECIOUS);
				if (kret != KERN_SUCCESS)
					panic("brelse: Failed to get pagelists");
#ifdef UBC_DEBUG
				upl_ubc_alias_set(upl, bp, 5);
#endif /* UBC_DEBUG */
			} else
				upl = (upl_t) 0;
		} else {
			upl = bp->b_pagelist;

			if (bp->b_data) {
				kret = ubc_upl_unmap(upl);

				if (kret != KERN_SUCCESS)
					panic("kernel_upl_unmap failed");
				bp->b_data = 0;
			}
		}
		if (upl) {
			if (bp->b_flags & (B_ERROR | B_INVAL)) {
				if (bp->b_flags & (B_READ | B_INVAL))
					upl_flags = UPL_ABORT_DUMP_PAGES;
				else
					upl_flags = 0;
				ubc_upl_abort(upl, upl_flags);
			} else {
				if (ISSET(bp->b_flags, B_NEEDCOMMIT))
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
					UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
			}
			CLR(bp->b_flags, B_PAGELIST);
			bp->b_pagelist = 0;
		}
	} else {
		if (ISSET(bp->b_flags, B_PAGELIST))
			panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
	}

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	/* Block disk interrupts. */
	s = splbio();

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put on the head of the appropriate queue.
		 */
		if (bp->b_vp)
			brelvp(bp);
		if (ISSET(bp->b_flags, B_DELWRI)) {
			CLR(bp->b_flags, B_DELWRI);
			nbdwrite--;
			wakeup((caddr_t)&nbdwrite);
		}
		if (bp->b_bufsize <= 0)
			whichq = BQ_EMPTY;	/* no data */
		else if (ISSET(bp->b_flags, B_META))
			whichq = BQ_META;	/* meta-data */
		else
			whichq = BQ_AGE;	/* invalid data */

		bufq = &bufqueues[whichq];
		binsheadfree(bp, bufq, whichq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			whichq = BQ_LOCKED;	/* locked in core */
		else if (ISSET(bp->b_flags, B_META))
			whichq = BQ_META;	/* meta-data */
		else if (ISSET(bp->b_flags, B_AGE))
			whichq = BQ_AGE;	/* stale but valid data */
		else
			whichq = BQ_LRU;	/* valid data */

		bufq = &bufqueues[whichq];
		binstailfree(bp, bufq, whichq);
	}

	/* Unlock the buffer. */
	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));

	/* Allow disk interrupts. */
	splx(s);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
		(int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
}
/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;

	bp = BUFHASH(vp, blkno)->lh_first;

	/* Search hash chain */
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	}

	return (0);
}
/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to ensure that the
 * cached blocks be of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo, operation)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo, operation;
{
	register struct buf *bp;
	int s, err;
	upl_t upl;
	upl_page_info_t *pl;
	kern_return_t kret;
	int error = 0;
	int pagedirty = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
		blkno * PAGE_SIZE, size, operation, 0, 0);
start:
	s = splbio();
	if ((bp = incore(vp, blkno))) {
		/* Found in the Buffer Cache */
		if (ISSET(bp->b_flags, B_BUSY)) {
			/* but is busy */
			switch (operation) {
			case BLK_READ:
			case BLK_WRITE:
			case BLK_META:
				SET(bp->b_flags, B_WANTED);
				bufstats.bufs_busyincore++;
				err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
					slptimeo);
				splx(s);
				/*
				 * Callers who call with PCATCH or timeout are
				 * willing to deal with the NULL pointer
				 */
				if (err && ((slpflag & PCATCH) ||
				    ((err == EWOULDBLOCK) && slptimeo)))
					return (NULL);
				goto start;
				/*NOTREACHED*/
				break;

			case BLK_PAGEIN:
				/* pagein operation must not use getblk */
				panic("getblk: pagein for incore busy buffer");
				break;

			case BLK_PAGEOUT:
				/* pageout operation must not use getblk */
				panic("getblk: pageout for incore busy buffer");
				break;

			default:
				panic("getblk: %d unknown operation 1", operation);
				/*NOTREACHED*/
				break;
			}
		} else {
			/* buffer in core and not busy */
			SET(bp->b_flags, (B_BUSY | B_CACHE));
			bremfree(bp);
			bufstats.bufs_incore++;
			splx(s);

			if (ISSET(bp->b_flags, B_PAGELIST))
				panic("pagelist buffer is not busy");

			switch (operation) {
			case BLK_READ:
			case BLK_WRITE:
				if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
					kret = ubc_create_upl(vp,
						ubc_blktooff(vp, bp->b_lblkno),
						bp->b_bufsize,
						&upl,
						&pl,
						UPL_PRECIOUS);
					if (kret != KERN_SUCCESS)
						panic("Failed to get pagelists");

					SET(bp->b_flags, B_PAGELIST);
					bp->b_pagelist = upl;

					if (!upl_valid_page(pl, 0)) {
						if (vp->v_tag != VT_NFS)
							panic("getblk: incore buffer without valid page");
						CLR(bp->b_flags, B_CACHE);
					}
					if (upl_dirty_page(pl, 0))
						SET(bp->b_flags, B_WASDIRTY);
					else
						CLR(bp->b_flags, B_WASDIRTY);

					kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
					if (kret != KERN_SUCCESS)
						panic("getblk: ubc_upl_map() failed with (%d)",
							kret);
					if (bp->b_data == 0)
						panic("ubc_upl_map mapped 0");
				}
				break;

			case BLK_META:
				/*
				 * VM is not involved in IO for the meta data
				 * buffer already has valid data
				 */
				if (bp->b_data == 0)
					panic("bp->b_data null incore buf=%x", bp);
				break;

			case BLK_PAGEIN:
			case BLK_PAGEOUT:
				panic("getblk: paging operation 1");
				break;

			default:
				panic("getblk: %d unknown operation 2", operation);
				/*NOTREACHED*/
				break;
			}
		}
	} else { /* not incore() */
		int queue = BQ_EMPTY; /* Start with no preference */
		splx(s);

		if ((operation == BLK_META) || (UBCINVALID(vp)) ||
		    !(UBCINFOEXISTS(vp))) {
			operation = BLK_META;
		}
		if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
			goto start;
		if (incore(vp, blkno)) {
			SET(bp->b_flags, B_INVAL);
			binshash(bp, &invalhash);
			brelse(bp);
			goto start;
		}
		/*
		 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
		 * CALLED!  BE CAREFUL.
		 */

		/*
		 * if it is meta, the queue may be set to other
		 * type so reset as well as mark it to be B_META
		 * so that when buffer is released it will go to the META queue
		 * Also, if the vnode is not VREG, then it is META
		 */
		if (operation == BLK_META) {
			SET(bp->b_flags, B_META);
			queue = BQ_META;
		}

		bp->b_blkno = bp->b_lblkno = blkno;

		/*
		 * Insert in the hash so that incore() can find it
		 */
		binshash(bp, BUFHASH(vp, blkno));

		s = splbio();
		bgetvp(vp, bp);
		splx(s);

		allocbuf(bp, size);

		switch (operation) {
		case BLK_META:
			/* buffer data is invalid */

			if (bp->b_data == 0)
				panic("bp->b_data is null %x", bp);

			bufstats.bufs_miss++;

			/* wakeup the buffer */
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
			break;

		case BLK_READ:
		case BLK_WRITE:
			if (ISSET(bp->b_flags, B_PAGELIST))
				panic("B_PAGELIST in bp=%x", bp);

			kret = ubc_create_upl(vp,
				ubc_blktooff(vp, blkno),
				bp->b_bufsize,
				&upl,
				&pl,
				UPL_PRECIOUS);
			if (kret != KERN_SUCCESS)
				panic("Failed to get pagelists");
#ifdef UBC_DEBUG
			upl_ubc_alias_set(upl, bp, 4);
#endif /* UBC_DEBUG */
			bp->b_pagelist = upl;

			SET(bp->b_flags, B_PAGELIST);

			if (upl_valid_page(pl, 0)) {
				SET(bp->b_flags, B_CACHE | B_DONE);
				bufstats.bufs_vmhits++;

				pagedirty = upl_dirty_page(pl, 0);

				if (pagedirty)
					SET(bp->b_flags, B_WASDIRTY);

				if (vp->v_tag == VT_NFS) {
					off_t f_offset;
					int valid_size;

					f_offset = ubc_blktooff(vp, blkno);

					if (f_offset > vp->v_ubcinfo->ui_size) {
						CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
					} else {
						valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
						bp->b_validend = valid_size;

						if (pagedirty)
							bp->b_dirtyend = valid_size;
						else
							bp->b_dirtyend = 0;

						KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
							bp->b_validend, bp->b_dirtyend,
							(int)vp->v_ubcinfo->ui_size, 0, 0);
					}
				} else {
					if (pagedirty) {
						/* page is dirty */
						bp->b_validend = bp->b_bcount;
						bp->b_dirtyend = bp->b_bcount;
					} else {
						/* page is clean */
						bp->b_validend = bp->b_bcount;
						bp->b_dirtyend = 0;
					}
				}
				error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
				if (error) {
					panic("getblk: VOP_BMAP failed");
					/*NOTREACHED*/
					/*
					 * XXX:  We probably should invalidate the VM Page
					 */
					bp->b_error = error;
					SET(bp->b_flags, (B_ERROR | B_INVAL));
					/* undo B_DONE that was set before upl_commit() */
					CLR(bp->b_flags, B_DONE);
					brelse(bp);
					return (0);
				}
			} else {
				bufstats.bufs_miss++;
			}
			kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
			if (kret != KERN_SUCCESS) {
				panic("getblk: ubc_upl_map() "
					"failed with (%d)", kret);
			}
			if (bp->b_data == 0)
				panic("kernel_upl_map mapped 0");
			break;

		case BLK_PAGEIN:
		case BLK_PAGEOUT:
			panic("getblk: paging operation 2");
			break;

		default:
			panic("getblk: %d unknown operation 3", operation);
			/*NOTREACHED*/
			break;
		}
	}

	if (bp->b_data == NULL)
		panic("getblk: bp->b_addr is null");

	if (bp->b_bufsize & 0xfff) {
		if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
			panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
		(int)bp, (int)bp->b_data, bp->b_flags, 3, 0);

	return (bp);
}
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;
	int queue = BQ_EMPTY;

	while ((bp = getnewbuf(0, 0, &queue)) == 0)
		;
	SET(bp->b_flags, (B_META|B_INVAL));

#if DIAGNOSTIC
	assert(queue == BQ_EMPTY);
#endif /* DIAGNOSTIC */
	/* XXX need to implement logic to deal with other queues */

	binshash(bp, &invalhash);
	allocbuf(bp, size);

	bufstats.bufs_eblk++;

	return (bp);
}
/*
 * Zones for the meta data buffers
 */

#define MINMETA 512
#define MAXMETA 4096

struct meta_zone_entry {
	zone_t		mz_zone;
	vm_size_t	mz_size;
	vm_size_t	mz_max;
	char		*mz_name;
};

struct meta_zone_entry meta_zones[] = {
	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
	{NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
	{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
	{NULL, 0, 0, "" } /* End */
};

/*
 * Initialize the meta data zones
 */
static void
bufzoneinit(void)
{
	int i;

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		meta_zones[i].mz_zone =
			zinit(meta_zones[i].mz_size,
				meta_zones[i].mz_max,
				PAGE_SIZE,
				meta_zones[i].mz_name);
	}
	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
}
static __inline__ zone_t
getbufzone(size_t size)
{
	int i;

	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
		panic("getbufzone: incorrect size = %d", size);

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		if (meta_zones[i].mz_size >= size)
			break;
	}

	return (meta_zones[i].mz_zone);
}
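
/*
 * Illustrative sketch (hypothetical helper, not used by the code in this
 * file): allocbuf() below first rounds a meta-data request up to a MINMETA
 * multiple and then asks getbufzone() for the smallest zone that fits, so a
 * 700 byte request becomes a 1024 byte allocation from the "buf.1024" zone.
 */
static __inline__ zone_t
metadata_zone_for_sketch(size_t request)
{
	return (getbufzone((size_t)roundup(request, MINMETA)));
}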
/*
 * With UBC, there is no need to expand / shrink the file data
 * buffer. The VM uses the same pages, hence no waste.
 * All the file data buffers can have one size.
 * In fact expand / shrink would be an expensive operation.
 *
 * Only exception to this is meta-data buffers. Most of the
 * meta data operations are smaller than PAGE_SIZE. Having the
 * meta-data buffers grow and shrink as needed, optimizes use
 * of the kernel wired memory.
 */

int
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	vm_size_t desired_size;

	desired_size = roundup(size, CLBYTES);

	if (desired_size < PAGE_SIZE)
		desired_size = PAGE_SIZE;
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

	if (ISSET(bp->b_flags, B_META)) {
		kern_return_t kret;
		zone_t zprev, z;
		size_t nsize = roundup(size, MINMETA);

		if (bp->b_data) {
			vm_offset_t elem = (vm_offset_t)bp->b_data;

			if (ISSET(bp->b_flags, B_ZALLOC))
				if (bp->b_bufsize <= MAXMETA) {
					if (bp->b_bufsize < nsize) {
						/* reallocate to a bigger size */
						zprev = getbufzone(bp->b_bufsize);
						if (nsize <= MAXMETA) {
							desired_size = nsize;
							z = getbufzone(nsize);
							bp->b_data = (caddr_t)zalloc(z);
							if (bp->b_data == 0)
								panic("allocbuf: zalloc() returned NULL");
						} else {
							kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
							if (kret != KERN_SUCCESS)
								panic("allocbuf: kmem_alloc() 0 returned %d", kret);
							if (bp->b_data == 0)
								panic("allocbuf: null b_data 0");
							CLR(bp->b_flags, B_ZALLOC);
						}
						bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
						zfree(zprev, elem);
					} else {
						desired_size = bp->b_bufsize;
					}
				} else
					panic("allocbuf: B_ZALLOC set incorrectly");
			else
				if (bp->b_bufsize < desired_size) {
					/* reallocate to a bigger size */
					kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
					if (kret != KERN_SUCCESS)
						panic("allocbuf: kmem_alloc() returned %d", kret);
					if (bp->b_data == 0)
						panic("allocbuf: null b_data");
					bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
					kmem_free(kernel_map, elem, bp->b_bufsize);
				} else {
					desired_size = bp->b_bufsize;
				}
		} else {
			/* new allocation */
			if (nsize <= MAXMETA) {
				desired_size = nsize;
				z = getbufzone(nsize);
				bp->b_data = (caddr_t)zalloc(z);
				if (bp->b_data == 0)
					panic("allocbuf: zalloc() returned NULL 2");
				SET(bp->b_flags, B_ZALLOC);
			} else {
				kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
				if (kret != KERN_SUCCESS)
					panic("allocbuf: kmem_alloc() 2 returned %d", kret);
				if (bp->b_data == 0)
					panic("allocbuf: null b_data 2");
			}
		}
	}

	if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
		panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);

	bp->b_bufsize = desired_size;
	bp->b_bcount = size;

	return (0);
}
/*
 * Get a new buffer from one of the free lists.
 *
 * Request for a queue is passed in. The queue from which the buffer was taken
 * from is returned. Out of range queue requests get BQ_EMPTY. Request for
 * BQUEUES means no preference. Use heuristics in that case.
 * Heuristics is as follows:
 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
 * If none available block till one is made available.
 * If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
 * Pick the most stale buffer.
 * If found buffer was marked delayed write, start the async. write
 * and restart the search.
 * Initialize the fields and disassociate the buffer from the vnode.
 * Remove the buffer from the hash. Return the buffer and the queue
 * on which it was found.
 */
static struct buf *
getnewbuf(slpflag, slptimeo, queue)
	int slpflag, slptimeo;
	int *queue;
{
	register struct buf *bp;
	register struct buf *lru_bp;
	register struct buf *age_bp;
	register struct buf *meta_bp;
	register int age_time, lru_time, bp_time, meta_time;
	int s;
	int req = *queue;	/* save it for restarts */

start:
	s = splbio();

	/* invalid request gets empty queue */
	if ((*queue > BQUEUES) || (*queue < 0)
	    || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
		*queue = BQ_EMPTY;

	/* (*queue == BQUEUES) means no preference */
	if (*queue != BQUEUES) {
		/* Try for the requested queue first */
		bp = bufqueues[*queue].tqh_first;
		if (bp)
			goto found;
	}

	/* Unable to use requested queue */
	age_bp = bufqueues[BQ_AGE].tqh_first;
	lru_bp = bufqueues[BQ_LRU].tqh_first;
	meta_bp = bufqueues[BQ_META].tqh_first;

	if (!age_bp && !lru_bp && !meta_bp) {
		/*
		 * Unavailable on AGE or LRU or META queues
		 * Try the empty list first
		 */
		bp = bufqueues[BQ_EMPTY].tqh_first;
		if (bp) {
			*queue = BQ_EMPTY;
			goto found;
		}

		/* Create a new temporary buffer header */
		bp = (struct buf *)zalloc(buf_hdr_zone);

		if (bp) {
			bufhdrinit(bp);
			BLISTNONE(bp);
			binshash(bp, &invalhash);
			SET(bp->b_flags, B_HDRALLOC);
			*queue = BQ_EMPTY;
			binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
			buf_hdr_count++;
			goto found;
		}

		/* Log this error condition */
		printf("getnewbuf: No useful buffers");

		/* wait for a free buffer of any kind */
		needbuffer = 1;
		bufstats.bufs_sleeps++;
		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* Buffer available either on AGE or LRU or META */
	bp = NULL;
	*queue = -1;

	/* Buffer available either on AGE or LRU */
	if (!age_bp) {
		bp = lru_bp;
		*queue = BQ_LRU;
	} else if (!lru_bp) {
		bp = age_bp;
		*queue = BQ_AGE;
	} else { /* buffer available on both AGE and LRU */
		age_time = time.tv_sec - age_bp->b_timestamp;
		lru_time = time.tv_sec - lru_bp->b_timestamp;
		if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
			bp = age_bp;
			*queue = BQ_AGE;
			/*
			 * we should probably re-timestamp everything in the
			 * queues at this point with the current time
			 */
		} else {
			if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
				bp = lru_bp;
				*queue = BQ_LRU;
			} else {
				bp = age_bp;
				*queue = BQ_AGE;
			}
		}
	}

	if (!bp) { /* Neither on AGE nor on LRU */
		bp = meta_bp;
		*queue = BQ_META;
	} else if (meta_bp) {
		bp_time = time.tv_sec - bp->b_timestamp;
		meta_time = time.tv_sec - meta_bp->b_timestamp;

		if (!(bp_time < 0) && !(meta_time < 0)) {
			/* time not set backwards */
			int bp_is_stale;
			bp_is_stale = (*queue == BQ_LRU) ?
					lru_is_stale : age_is_stale;

			if ((meta_time >= meta_is_stale) &&
			    (bp_time < bp_is_stale)) {
				bp = meta_bp;
				*queue = BQ_META;
			}
		}
	}

	if (bp == NULL)
		panic("getnewbuf: null bp");

found:
	if (ISSET(bp->b_flags, B_LOCKED)) {
		panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
	}

	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);

	if (ISSET(bp->b_flags, B_BUSY))
		panic("getnewbuf reusing BUSY buf @ 0x%x", bp);

	/* Clean it */
	if (bcleanbuf(bp)) {
		/* bawrite() issued, buffer not ready */
		splx(s);
		*queue = req;
		goto start;
	}
	splx(s);
	return (bp);
}
#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <kern/sched_prim.h>

/*
 * Returns 0 if buffer is ready to use,
 * Returns 1 if issued a bawrite() to indicate
 * that the buffer is not ready.
 */
static int
bcleanbuf(struct buf *bp)
{
	int s;
	struct ucred *cred;
	int hdralloc = 0;

	s = splbio();

	/* Remove from the queue */
	bremfree(bp);

	/* Buffer is no longer on free lists. */
	SET(bp->b_flags, B_BUSY);

	/* Check whether the buffer header was "allocated" */
	if (ISSET(bp->b_flags, B_HDRALLOC))
		hdralloc = 1;

	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bcleanbuf: le_prev is deadbeef");

	/*
	 * If buffer was a delayed write, start the IO by queuing
	 * it on the LAUNDRY queue, and return 1
	 */
	if (ISSET(bp->b_flags, B_DELWRI)) {
		splx(s);
		binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
		blaundrycnt++;
		wakeup(&blaundrycnt);
		/* and give it a chance to run */
		(void)thread_block(THREAD_CONTINUE_NULL);
		return (1);
	}

	if (ISSET(bp->b_flags, B_META)) {
		vm_offset_t elem = (vm_offset_t)bp->b_data;

		if (elem == 0)
			panic("bcleanbuf: NULL bp->b_data B_META buffer");

		if (ISSET(bp->b_flags, B_ZALLOC)) {
			if (bp->b_bufsize <= MAXMETA) {
				zone_t z;

				z = getbufzone(bp->b_bufsize);
				bp->b_data = (caddr_t)0xdeadbeef;
				zfree(z, elem);
				CLR(bp->b_flags, B_ZALLOC);
			} else
				panic("bcleanbuf: B_ZALLOC set incorrectly");
		} else {
			bp->b_data = (caddr_t)0xdeadbeef;
			kmem_free(kernel_map, elem, bp->b_bufsize);
		}
	}

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);

	/* disassociate us from our vnode, if we had one... */
	if (bp->b_vp)
		brelvp(bp);

	/* clear out various other fields */
	bp->b_bufsize = 0;
	bp->b_data = 0;
	bp->b_flags = B_BUSY;
	if (hdralloc)
		SET(bp->b_flags, B_HDRALLOC);
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;

	/* nuke any credentials we were holding */
	cred = bp->b_rcred;
	if (cred != NOCRED) {
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	cred = bp->b_wcred;
	if (cred != NOCRED) {
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	splx(s);

	return (0);
}
/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
	struct buf *bp;
{
	int s;

	s = splbio();
	while (!ISSET(bp->b_flags, B_DONE))
		tsleep(bp, PRIBIO + 1, "biowait", 0);
	splx(s);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		return (EINTR);
	} else if (ISSET(bp->b_flags, B_ERROR))
		return (bp->b_error ? bp->b_error : EIO);
	else
		return (0);
}
/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(bp)
	struct buf *bp;
{
	boolean_t funnel_state;
	struct vnode *vp;
	extern struct timeval priority_IO_timestamp_for_root;
	extern int hard_throttle_on_root;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
		(int)bp, (int)bp->b_data, bp->b_flags, 0, 0);

	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */
	/*
	 * I/O was done, so don't believe
	 * the DIRTY state from VM anymore
	 */
	CLR(bp->b_flags, B_WASDIRTY);

	if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
		vwakeup(bp);	/* wake up reader */

	if (kdebug_enable) {
		int code = DKIO_DONE;

		if (bp->b_flags & B_READ)
			code |= DKIO_READ;
		if (bp->b_flags & B_ASYNC)
			code |= DKIO_ASYNC;

		if (bp->b_flags & B_META)
			code |= DKIO_META;
		else if (bp->b_flags & (B_PGIN | B_PAGEOUT))
			code |= DKIO_PAGING;

		KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
			(unsigned int)bp, (unsigned int)bp->b_vp,
			bp->b_resid, bp->b_error, 0);
	}

	/* Wakeup the throttled write operations as needed */
	vp = bp->b_vp;
	if (vp
	    && (vp->v_flag & VTHROTTLED)
	    && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	if ((bp->b_flags & B_PGIN) && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
		priority_IO_timestamp_for_root = time;
		hard_throttle_on_root = 0;
	}
	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
		void (*iodone_func)(struct buf *) = bp->b_iodone;

		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		bp->b_iodone = NULL;

		if (iodone_func == NULL) {
			panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
		} else {
			(*iodone_func)(bp);
		}
	} else if (ISSET(bp->b_flags, B_ASYNC))	/* if async, release it */
		brelse(bp);
	else {					/* or just wakeup the buffer */
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
		(int)bp, (int)bp->b_data, bp->b_flags, 0, 0);

	thread_funnel_set(kernel_flock, funnel_state);
}
/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue()
{
	register struct buf *bp;
	register int n = 0;

	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
	     bp = bp->b_freelist.tqe_next)
		n++;
	return (n);
}

/*
 * Return a count of 'busy' buffers. Used at the time of shutdown.
 */
int
count_busy_buffers()
{
	register struct buf *bp;
	register int nbusy = 0;

	for (bp = &buf[nbuf]; --bp >= buf; )
		if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
			nbusy++;
	return (nbusy);
}

#if DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
	int i, j, count;
	register struct buf *bp;
	register struct bqueues *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] =
		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */
#define NRESERVEDIOBUFS	64

__private_extern__ struct buf *
alloc_io_buf(vp, priv)
	struct vnode *vp;
	int priv;
{
	register struct buf *bp;
	int s;

	s = splbio();

	while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) {
		need_iobuffer = 1;
		bufstats.bufs_iobufsleeps++;
		(void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0);
	}

	while ((bp = iobufqueue.tqh_first) == NULL) {
		need_iobuffer = 1;
		bufstats.bufs_iobufsleeps++;
		(void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0);
	}

	TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
	bp->b_timestamp = 0;

	/* clear out various fields */
	bp->b_flags = B_BUSY;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_vp = vp;

	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	bufstats.bufs_iobufinuse++;
	if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
		bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
	splx(s);

	return (bp);
}

__private_extern__ void
free_io_buf(bp)
	struct buf *bp;
{
	int s;

	s = splbio();

	/* put buffer back on the head of the iobufqueue */
	bp->b_vp = NULL;
	bp->b_flags = B_INVAL;

	binsheadfree(bp, &iobufqueue, -1);

	/* Wake up any processes waiting for any buffer to become free. */
	if (need_iobuffer) {
		need_iobuffer = 0;
		wakeup(&need_iobuffer);
	}
	bufstats.bufs_iobufinuse--;
	splx(s);
}
/* disabled for now */

/* XXX move this to a separate file */
/*
 * Dynamic Scaling of the Buffer Queues
 */

typedef long long blsize_t;

blsize_t MAXNBUF;	/* initialize to (sane_size / PAGE_SIZE) */
/* Global tunable limits */
blsize_t nbufh;		/* number of buffer headers */
blsize_t nbuflow;	/* minimum number of buffer headers required */
blsize_t nbufhigh;	/* maximum number of buffer headers allowed */
blsize_t nbuftarget;	/* preferred number of buffer headers */

/*
 * Global invariants:
 *	1. 0 < nbuflow <= nbufh <= nbufhigh
 *	2. nbufhigh <= MAXNBUF
 *	3. 0 < nbuflow <= nbuftarget <= nbufhigh
 *	4. nbufh can not be set by sysctl().
 */

/* Per queue tunable limits */

struct bufqlim {
	blsize_t	bl_nlow;	/* minimum number of buffer headers required */
	blsize_t	bl_num;		/* number of buffer headers on the queue */
	blsize_t	bl_nlhigh;	/* maximum number of buffer headers allowed */
	blsize_t	bl_target;	/* preferred number of buffer headers */
	long		bl_stale;	/* Seconds after which a buffer is considered stale */
} bufqlim[BQUEUES];

/*
 * Per queue invariants:
 *	1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
 *	2. bl_nlhigh <= MAXNBUF
 *	3. bufqlim[BQ_META].bl_nlow != 0
 *	4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
 *	   file system IO operations)
 *	5. bl_num can not be set by sysctl().
 *	6. bl_nhigh <= nbufhigh
 *
 * Defining blsize_t as long permits 2^31 buffer headers per queue.
 * Which can describe (2^31 * PAGE_SIZE) memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * low and high numbers parameters initialized at compile time
 * and boot arguments can be used to override them. sysctl()
 * would not change the value. sysctl() can get all the values
 * but can set only target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are,
 * Keep enough bufs on BQ_EMPTY.
 *	getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *	getnewbuf() performs best if a buffer was found there.
 *	Also this minimizes the possibility of starting IO
 *	from getnewbuf(). That's a performance win, too.
 *
 * Localize complex logic [balancing as well as time aging]
 *
 * Simplify getnewbuf() logic by elimination of time aging code.
 */

/*
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be: AGE, LRU, META, EMPTY.
 */
= 0;
2169 static void bufqscan_thread();
2170 static int balancebufq(int q
);
2171 static int btrimempty(int n
);
2172 static __inline__
int initbufqscan(void);
2173 static __inline__
int nextbufq(int q
);
2174 static void buqlimprt(int all
);
static void
bufq_balance_thread_init()
{

	if (bufqscanwait++ == 0) {

		/* Initialize globals */
		MAXNBUF = (sane_size / PAGE_SIZE);
		nbufh = nbuf;
		nbuflow = min(nbufh, 100);
		nbufhigh = min(MAXNBUF, max(nbufh, 2048));
		nbuftarget = (sane_size >> 5) / PAGE_SIZE;
		nbuftarget = max(nbuflow, nbuftarget);
		nbuftarget = min(nbufhigh, nbuftarget);

		/*
		 * Initialize the bufqlim
		 */

		bufqlim[BQ_LOCKED].bl_nlow = 0;
		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
		bufqlim[BQ_LOCKED].bl_target = 0;
		bufqlim[BQ_LOCKED].bl_stale = 30;

		bufqlim[BQ_LRU].bl_nlow = 0;
		bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_LRU].bl_target = nbuftarget/4;
		bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;

		bufqlim[BQ_AGE].bl_nlow = 0;
		bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_AGE].bl_target = nbuftarget/4;
		bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;

		bufqlim[BQ_EMPTY].bl_nlow = 0;
		bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
		bufqlim[BQ_EMPTY].bl_stale = 600000;

		bufqlim[BQ_META].bl_nlow = 0;
		bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
		bufqlim[BQ_META].bl_target = nbuftarget/4;
		bufqlim[BQ_META].bl_stale = META_IS_STALE;

		bufqlim[BQ_LOCKED].bl_nlow = 0;
		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
		bufqlim[BQ_LOCKED].bl_target = 0;
		bufqlim[BQ_LOCKED].bl_stale = 30;
	}

	/* create worker thread */
	kernel_thread(kernel_task, bufqscan_thread);
}
/* The workloop for the buffer balancing thread */
static void
bufqscan_thread()
{
	boolean_t funnel_state;
	int moretodo = 0;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);

	for (;;) {
		do {
			int q;	/* buffer queue to process */

			q = initbufqscan();
			for (; q; ) {
				moretodo |= balancebufq(q);
				q = nextbufq(q);
			}
		} while (moretodo);

		(void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
		moretodo = 0;
	}

	(void) thread_funnel_set(kernel_flock, FALSE);
}
2270 static __inline__
int
2273 /* Start with AGE queue */
2277 /* Pick next buffer queue to balance */
2278 static __inline__
int
2281 int order
[] = { BQ_AGE
, BQ_LRU
, BQ_META
, BQ_EMPTY
, 0 };
2288 /* function to balance the buffer queues */
2296 /* reject invalid q */
2297 if ((q
< 0) || (q
>= BQUEUES
))
2300 /* LOCKED or LAUNDRY queue MUST not be balanced */
2301 if ((q
== BQ_LOCKED
) || (q
== BQ_LAUNDRY
))
2304 n
= (bufqlim
[q
].bl_num
- bufqlim
[q
].bl_target
);
2306 /* If queue has less than target nothing more to do */
2311 /* Balance only a small amount (12.5%) at a time */
2315 /* EMPTY queue needs special handling */
2316 if (q
== BQ_EMPTY
) {
2317 moretodo
|= btrimempty(n
);
2321 for (; n
> 0; n
--) {
2322 struct buf
*bp
= bufqueues
[q
].tqh_first
;
2326 /* check if it's stale */
2327 if ((time
.tv_sec
- bp
->b_timestamp
) > bufqlim
[q
].bl_stale
) {
2328 if (bcleanbuf(bp
)) {
2329 /* bawrite() issued, bp not ready */
2332 /* release the cleaned buffer to BQ_EMPTY */
2333 SET(bp
->b_flags
, B_INVAL
);
static int
btrimempty(int n)
{
	/*
	 * When struct buf are allocated dynamically, this would
	 * reclaim up to 'n' struct buf from the empty queue.
	 */

	return (0);
}

static __inline__ void
bufqinc(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num++;
	return;
}

static __inline__ void
bufqdec(int q)
{
	if ((q < 0) || (q >= BQUEUES))
		return;

	bufqlim[q].bl_num--;
	return;
}

static void
buqlimprt(int all)
{
	int i;
	static char *bname[BQUEUES] =
		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };

	if (all)
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
			printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
			printf("target = %ld, ", (long)bufqlim[i].bl_target);
			printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
		}
	else
		for (i = 0; i < BQUEUES; i++) {
			printf("%s : ", bname[i]);
			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
		}
}
/*
 * If the getnewbuf() calls bcleanbuf() on the same thread
 * there is a potential for stack overrun and deadlocks.
 * So we always hand off the work to a worker thread for completion
 */

static void
bcleanbuf_thread_init()
{
	static void bcleanbuf_thread();

	/* create worker thread */
	kernel_thread(kernel_task, bcleanbuf_thread);
}

static void
bcleanbuf_thread()
{
	boolean_t funnel_state;
	struct buf *bp;
	int error = 0;
	int loopcnt = 0;

	funnel_state = thread_funnel_set(kernel_flock, TRUE);

doit:
	while (blaundrycnt == 0)
		(void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz);
	bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]);
	/* Remove from the queue */
	bremfree(bp);
	blaundrycnt--;

	error = bawrite_internal(bp, 0);
	if (error) {
		binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
		blaundrycnt++;
		if (loopcnt > 10) {
			(void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1);
			loopcnt = 0;
		} else {
			(void)thread_block(THREAD_CONTINUE_NULL);
			loopcnt++;
		}
	}
	goto doit;

	(void) thread_funnel_set(kernel_flock, funnel_state);
}
static int
brecover_data(struct buf *bp)
{
	upl_t upl;
	upl_page_info_t *pl;
	int upl_offset;
	kern_return_t kret;
	struct vnode *vp = bp->b_vp;

	if (vp->v_tag == VT_NFS)
		/*
		 * NFS currently deals with this case
		 * in a slightly different manner...
		 * continue to let it do so
		 */
		return (1);

	if (!UBCISVALID(vp) || bp->b_bufsize == 0)
		goto dump_buffer;

	kret = ubc_create_upl(vp,
		ubc_blktooff(vp, bp->b_lblkno),
		bp->b_bufsize,
		&upl,
		&pl,
		UPL_PRECIOUS);
	if (kret != KERN_SUCCESS)
		panic("Failed to get pagelists");

	for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {

		if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
			ubc_upl_abort(upl, 0);
			goto dump_buffer;
		}
	}
	SET(bp->b_flags, B_PAGELIST);
	bp->b_pagelist = upl;

	kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
	if (kret != KERN_SUCCESS)
		panic("getblk: ubc_upl_map() failed with (%d)", kret);
	if (bp->b_data == 0)
		panic("ubc_upl_map mapped 0");

	return (1);

dump_buffer:
	bp->b_bufsize = 0;
	SET(bp->b_flags, B_INVAL);
	brelse(bp);

	return (0);
}
static int
bp_cmp(void *a, void *b)
{
	struct buf *bp_a = *(struct buf **)a,
		*bp_b = *(struct buf **)b;
	daddr_t res;

	// don't have to worry about negative block
	// numbers so this is ok to do.
	//
	res = (bp_a->b_blkno - bp_b->b_blkno);

	return (int)res;
}

#define NFLUSH 32

int
bflushq(int whichq, struct mount *mp)
{
	struct buf *bp, *next;
	int i, buf_count, s;
	int counter = 0, total_writes = 0;
	static struct buf *flush_table[NFLUSH];

	if (whichq < 0 || whichq >= BQUEUES) {
		return 0;
	}

restart:
	bp = TAILQ_FIRST(&bufqueues[whichq]);
	for (buf_count = 0; bp; bp = next) {
		next = bp->b_freelist.tqe_next;

		if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
			continue;
		}

		if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
			if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
				panic("bflushq: bp @ 0x%x is locked!\n", bp);
			}

			bremfree(bp);
			bp->b_flags |= B_BUSY;
			flush_table[buf_count] = bp;
			buf_count++;
			total_writes++;

			if (buf_count >= NFLUSH) {
				qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);

				for (i = 0; i < buf_count; i++) {
					bawrite(flush_table[i]);
				}
				goto restart;
			}
		}
	}

	if (buf_count > 0) {
		qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
		for (i = 0; i < buf_count; i++) {
			bawrite(flush_table[i]);
		}
	}

	return total_writes;
}