git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2007 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*-
	30	* Copyright (c) 1994 Christopher G. Demetriou
	31	* Copyright (c) 1982, 1986, 1989, 1993
	32	* The Regents of the University of California. All rights reserved.
	33	* (c) UNIX System Laboratories, Inc.
	34	* All or some portions of this file are derived from material licensed
	35	* to the University of California by American Telephone and Telegraph
	36	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	37	* the permission of UNIX System Laboratories, Inc.
	38	*
	39	* Redistribution and use in source and binary forms, with or without
	40	* modification, are permitted provided that the following conditions
	41	* are met:
	42	* 1. Redistributions of source code must retain the above copyright
	43	* notice, this list of conditions and the following disclaimer.
	44	* 2. Redistributions in binary form must reproduce the above copyright
	45	* notice, this list of conditions and the following disclaimer in the
	46	* documentation and/or other materials provided with the distribution.
	47	* 3. All advertising materials mentioning features or use of this software
	48	* must display the following acknowledgement:
	49	* This product includes software developed by the University of
	50	* California, Berkeley and its contributors.
	51	* 4. Neither the name of the University nor the names of its contributors
	52	* may be used to endorse or promote products derived from this software
	53	* without specific prior written permission.
	54	*
	55	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	56	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	57	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	58	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	59	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	60	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	61	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	62	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	63	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	64	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	65	* SUCH DAMAGE.
	66	*
	67	* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
	68	*/
	69
	70	/*
	71	* Some references:
	72	* Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
	73	* Leffler, et al.: The Design and Implementation of the 4.3BSD
	74	* UNIX Operating System (Addison-Wesley, 1989)
	75	*/
	76
	77	#include <sys/param.h>
	78	#include <sys/systm.h>
	79	#include <sys/proc_internal.h>
	80	#include <sys/buf_internal.h>
	81	#include <sys/vnode_internal.h>
	82	#include <sys/mount_internal.h>
	83	#include <sys/trace.h>
	84	#include <sys/malloc.h>
	85	#include <sys/resourcevar.h>
	86	#include <miscfs/specfs/specdev.h>
	87	#include <sys/ubc.h>
	88	#include <sys/kauth.h>
	89	#if DIAGNOSTIC
	90	#include <kern/assert.h>
	91	#endif /* DIAGNOSTIC */
	92	#include <kern/task.h>
	93	#include <kern/zalloc.h>
	94	#include <kern/lock.h>
	95
	96	#include <sys/fslog.h> /* fslog_io_error() */
	97
	98	#include <mach/mach_types.h>
	99	#include <mach/memory_object_types.h>
	100	#include <kern/sched_prim.h> /* thread_block() */
	101
	102	#include <vm/vm_kern.h>
	103
	104	#include <sys/kdebug.h>
	105
	106	#include <libkern/OSAtomic.h>
	107	#include <sys/ubc_internal.h>
	108
	109	#include <sys/sdt.h>
	110
	111	#if BALANCE_QUEUES
	112	static __inline__ void bufqinc(int q);
	113	static __inline__ void bufqdec(int q);
	114	#endif
	115
	116	static int bcleanbuf(buf_t bp);
	117	static int brecover_data(buf_t bp);
	118	static boolean_t incore(vnode_t vp, daddr64_t blkno);
	119	/* timeout is in msecs */
	120	static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
	121	static void bremfree_locked(buf_t bp);
	122	static void buf_reassign(buf_t bp, vnode_t newvp);
	123	static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
	124	static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
	125	static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
	126
	127	__private_extern__ int bdwrite_internal(buf_t, int);
	128
	129	/* zone allocated buffer headers */
	130	static void bufzoneinit(void) __attribute__((section("__TEXT, initcode")));
	131	static void bcleanbuf_thread_init(void) __attribute__((section("__TEXT, initcode")));
	132	static void bcleanbuf_thread(void);
	133
	134	static zone_t buf_hdr_zone;
	135	static int buf_hdr_count;
	136
	137
	138	/*
	139	* Definitions for the buffer hash lists.
	140	*/
	141	#define BUFHASH(dvp, lbn) \
	142	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
	143	LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
	144	u_long bufhash;
	145
	146	static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
	147
	148	/* Definitions for the buffer stats. */
	149	struct bufstats bufstats;
	150
	151	/* Number of delayed write buffers */
	152	long nbdwrite = 0;
	153	int blaundrycnt = 0;
	154	static int boot_nbuf_headers = 0;
	155
	156
	157	static TAILQ_HEAD(ioqueue, buf) iobufqueue;
	158	static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
	159	static int needbuffer;
	160	static int need_iobuffer;
	161
	162	static lck_grp_t *buf_mtx_grp;
	163	static lck_attr_t *buf_mtx_attr;
	164	static lck_grp_attr_t *buf_mtx_grp_attr;
	165	static lck_mtx_t *iobuffer_mtxp;
	166	static lck_mtx_t *buf_mtxp;
	167
	168	static __inline__ int
	169	buf_timestamp(void)
	170	{
	171	struct timeval t;
	172	microuptime(&t);
	173	return (t.tv_sec);
	174	}
	175
	176	/*
	177	* Insq/Remq for the buffer free lists.
	178	*/
	/*
	 * bufq_note_insert() calls bufqinc() for the target queue when
	 * BALANCE_QUEUES is enabled and expands to nothing otherwise, so the
	 * two insertion macros only need to be written once.
	 */
	#if BALANCE_QUEUES
	#define bufq_note_insert(whichq)	bufqinc((whichq))
	#else
	#define bufq_note_insert(whichq)	((void)0)
	#endif

	/* Insert bp at the head of free list dp. */
	#define binsheadfree(bp, dp, whichq) do { \
			TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
			bufq_note_insert(whichq); \
		} while (0)

	/* Insert bp at the tail of free list dp. */
	#define binstailfree(bp, dp, whichq) do { \
			TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
			bufq_note_insert(whichq); \
		} while (0)
	198
	199
	/*
	 * Panic unless bp->b_hash.le_prev holds the 0xdeadbeef marker (the value
	 * BLISTNONE() stores).  Wrapped in do/while(0) so the macro behaves as a
	 * single statement and cannot capture a following 'else'; callers must
	 * supply the terminating semicolon.
	 */
	#define BHASHENTCHECK(bp) do { \
			if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
				panic("%p: b_hash.le_prev is not deadbeef", (bp)); \
		} while (0)
	203
	/*
	 * Reset bp's hash linkage: le_next becomes NULL and le_prev is set to the
	 * 0xdeadbeef marker that BHASHENTCHECK() and bremhash() test for.
	 * Wrapped in do/while(0) so that both assignments stay together when the
	 * macro is used as the body of an unbraced 'if' or loop; callers must
	 * supply the terminating semicolon.
	 */
	#define BLISTNONE(bp) do { \
			(bp)->b_hash.le_next = (struct buf *)0; \
			(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef; \
		} while (0)
	207
	208	/*
	209	* Insq/Remq for the vnode usage lists.
	210	*/
	/* Insert bp at the head of vnode buffer list dp. */
	#define bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)

	/*
	 * Unlink bp from its vnode buffer list and mark it as being on no list.
	 * Uses do/while(0) rather than a bare brace block so that
	 * "if (x) bufremvn(bp); else ..." parses correctly; callers must supply
	 * the terminating semicolon.
	 */
	#define bufremvn(bp) do { \
			LIST_REMOVE(bp, b_vnbufs); \
			(bp)->b_vnbufs.le_next = NOLIST; \
		} while (0)
	216
	217	/*
	218	* Time in seconds before a buffer on a list is
	219	* considered as a stale buffer
	220	*/
	221	#define LRU_IS_STALE 120 /* default value for the LRU */
	222	#define AGE_IS_STALE 60 /* default value for the AGE */
	223	#define META_IS_STALE 180 /* default value for the BQ_META */
	224
	225	int lru_is_stale = LRU_IS_STALE;
	226	int age_is_stale = AGE_IS_STALE;
	227	int meta_is_stale = META_IS_STALE;
	228
	229
	230
	231	/* LIST_INSERT_HEAD() with assertions */
	232	static __inline__ void
	233	blistenterhead(struct bufhashhdr * head, buf_t bp)
	234	{
	235	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
	236	(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
	237	(head)->lh_first = bp;
	238	bp->b_hash.le_prev = &(head)->lh_first;
	239	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
	240	panic("blistenterhead: le_prev is deadbeef");
	241	}
	242
	243	static __inline__ void
	244	binshash(buf_t bp, struct bufhashhdr *dp)
	245	{
	246	#if DIAGNOSTIC
	247	buf_t nbp;
	248	#endif /* DIAGNOSTIC */
	249
	250	BHASHENTCHECK(bp);
	251
	252	#if DIAGNOSTIC
	253	nbp = dp->lh_first;
	254	for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
	255	if(nbp == bp)
	256	panic("buf already in hashlist");
	257	}
	258	#endif /* DIAGNOSTIC */
	259
	260	blistenterhead(dp, bp);
	261	}
	262
	263	static __inline__ void
	264	bremhash(buf_t bp)
	265	{
	266	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
	267	panic("bremhash le_prev is deadbeef");
	268	if (bp->b_hash.le_next == bp)
	269	panic("bremhash: next points to self");
	270
	271	if (bp->b_hash.le_next != NULL)
	272	bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
	273	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
	274	}
	275
	276
	277
	278
	279	int
	280	buf_valid(buf_t bp) {
	281
	282	if ( (bp->b_flags & (B_DONE \| B_DELWRI)) )
	283	return 1;
	284	return 0;
	285	}
	286
	287	int
	288	buf_fromcache(buf_t bp) {
	289
	290	if ( (bp->b_flags & B_CACHE) )
	291	return 1;
	292	return 0;
	293	}
	294
	295	void
	296	buf_markinvalid(buf_t bp) {
	297
	298	SET(bp->b_flags, B_INVAL);
	299	}
	300
	301	void
	302	buf_markdelayed(buf_t bp) {
	303
	304	if (!ISSET(bp->b_flags, B_DELWRI)) {
	305	SET(bp->b_flags, B_DELWRI);
	306
	307	OSAddAtomic(1, &nbdwrite);
	308	buf_reassign(bp, bp->b_vp);
	309	}
	310	SET(bp->b_flags, B_DONE);
	311	}
	312
	313	void
	314	buf_markeintr(buf_t bp) {
	315
	316	SET(bp->b_flags, B_EINTR);
	317	}
	318
	319
	320	void
	321	buf_markaged(buf_t bp) {
	322
	323	SET(bp->b_flags, B_AGE);
	324	}
	325
	326	int
	327	buf_fua(buf_t bp) {
	328
	329	if ((bp->b_flags & B_FUA) == B_FUA)
	330	return 1;
	331	return 0;
	332	}
	333
	334	void
	335	buf_markfua(buf_t bp) {
	336
	337	SET(bp->b_flags, B_FUA);
	338	}
	339
	340	errno_t
	341	buf_error(buf_t bp) {
	342
	343	return (bp->b_error);
	344	}
	345
	346	void
	347	buf_seterror(buf_t bp, errno_t error) {
	348
	349	if ((bp->b_error = error))
	350	SET(bp->b_flags, B_ERROR);
	351	else
	352	CLR(bp->b_flags, B_ERROR);
	353	}
	354
	355	void
	356	buf_setflags(buf_t bp, int32_t flags) {
	357
	358	SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
	359	}
	360
	361	void
	362	buf_clearflags(buf_t bp, int32_t flags) {
	363
	364	CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
	365	}
	366
	367	int32_t
	368	buf_flags(buf_t bp) {
	369
	370	return ((bp->b_flags & BUF_X_RDFLAGS));
	371	}
	372
	373	void
	374	buf_reset(buf_t bp, int32_t io_flags) {
	375
	376	CLR(bp->b_flags, (B_READ \| B_WRITE \| B_ERROR \| B_DONE \| B_INVAL \| B_ASYNC \| B_NOCACHE \| B_FUA));
	377	SET(bp->b_flags, (io_flags & (B_ASYNC \| B_READ \| B_WRITE \| B_NOCACHE)));
	378
	379	bp->b_error = 0;
	380	}
	381
	382	uint32_t
	383	buf_count(buf_t bp) {
	384
	385	return (bp->b_bcount);
	386	}
	387
	388	void
	389	buf_setcount(buf_t bp, uint32_t bcount) {
	390
	391	bp->b_bcount = bcount;
	392	}
	393
	394	uint32_t
	395	buf_size(buf_t bp) {
	396
	397	return (bp->b_bufsize);
	398	}
	399
	400	void
	401	buf_setsize(buf_t bp, uint32_t bufsize) {
	402
	403	bp->b_bufsize = bufsize;
	404	}
	405
	406	uint32_t
	407	buf_resid(buf_t bp) {
	408
	409	return (bp->b_resid);
	410	}
	411
	412	void
	413	buf_setresid(buf_t bp, uint32_t resid) {
	414
	415	bp->b_resid = resid;
	416	}
	417
	418	uint32_t
	419	buf_dirtyoff(buf_t bp) {
	420
	421	return (bp->b_dirtyoff);
	422	}
	423
	424	uint32_t
	425	buf_dirtyend(buf_t bp) {
	426
	427	return (bp->b_dirtyend);
	428	}
	429
	430	void
	431	buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
	432
	433	bp->b_dirtyoff = dirtyoff;
	434	}
	435
	436	void
	437	buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
	438
	439	bp->b_dirtyend = dirtyend;
	440	}
	441
	442	uintptr_t
	443	buf_dataptr(buf_t bp) {
	444
	445	return (bp->b_datap);
	446	}
	447
	448	void
	449	buf_setdataptr(buf_t bp, uintptr_t data) {
	450
	451	bp->b_datap = data;
	452	}
	453
	454	vnode_t
	455	buf_vnode(buf_t bp) {
	456
	457	return (bp->b_vp);
	458	}
	459
	460	void
	461	buf_setvnode(buf_t bp, vnode_t vp) {
	462
	463	bp->b_vp = vp;
	464	}
	465
	466
	467	void *
	468	buf_callback(buf_t bp)
	469	{
	470	if ( !(bp->b_flags & B_CALL) )
	471	return ((void *) NULL);
	472
	473	return ((void *)bp->b_iodone);
	474	}
	475
	476
	477	errno_t
	478	buf_setcallback(buf_t bp, void (callback)(buf_t, void ), void *transaction)
	479	{
	480	if (callback)
	481	bp->b_flags \|= (B_CALL \| B_ASYNC);
	482	else
	483	bp->b_flags &= ~B_CALL;
	484	bp->b_transaction = transaction;
	485	bp->b_iodone = callback;
	486
	487	return (0);
	488	}
	489
	490	errno_t
	491	buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
	492	{
	493
	494	if ( !(bp->b_lflags & BL_IOBUF) )
	495	return (EINVAL);
	496
	497	if (upl)
	498	bp->b_flags \|= B_CLUSTER;
	499	else
	500	bp->b_flags &= ~B_CLUSTER;
	501	bp->b_upl = upl;
	502	bp->b_uploffset = offset;
	503
	504	return (0);
	505	}
	506
	507	buf_t
	508	buf_clone(buf_t bp, int io_offset, int io_size, void (iodone)(buf_t, void ), void *arg)
	509	{
	510	buf_t io_bp;
	511
	512	if (io_offset < 0 \|\| io_size < 0)
	513	return (NULL);
	514
	515	if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
	516	return (NULL);
	517
	518	if (bp->b_flags & B_CLUSTER) {
	519	if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
	520	return (NULL);
	521
	522	if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
	523	return (NULL);
	524	}
	525	io_bp = alloc_io_buf(bp->b_vp, 0);
	526
	527	io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL \| B_META \| B_PAGEIO \| B_CLUSTER \| B_PHYS \| B_RAW \| B_ASYNC \| B_READ \| B_FUA);
	528
	529	if (iodone) {
	530	io_bp->b_transaction = arg;
	531	io_bp->b_iodone = iodone;
	532	io_bp->b_flags \|= B_CALL;
	533	}
	534	if (bp->b_flags & B_CLUSTER) {
	535	io_bp->b_upl = bp->b_upl;
	536	io_bp->b_uploffset = bp->b_uploffset + io_offset;
	537	} else {
	538	io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
	539	}
	540	io_bp->b_bcount = io_size;
	541
	542	return (io_bp);
	543	}
	544
	545
	546
	547	void
	548	buf_setfilter(buf_t bp, void (filter)(buf_t, void ), void *transaction,
	549	void old_iodone, void old_transaction)
	550	{
	551	if (old_iodone)
	552	old_iodone = (void )(bp->b_iodone);
	553	if (old_transaction)
	554	old_transaction = (void )(bp->b_transaction);
	555
	556	bp->b_transaction = transaction;
	557	bp->b_iodone = filter;
	558	if (filter)
	559	bp->b_flags \|= B_FILTER;
	560	else
	561	bp->b_flags &= ~B_FILTER;
	562	}
	563
	564
	565	daddr64_t
	566	buf_blkno(buf_t bp) {
	567
	568	return (bp->b_blkno);
	569	}
	570
	571	daddr64_t
	572	buf_lblkno(buf_t bp) {
	573
	574	return (bp->b_lblkno);
	575	}
	576
	577	void
	578	buf_setblkno(buf_t bp, daddr64_t blkno) {
	579
	580	bp->b_blkno = blkno;
	581	}
	582
	583	void
	584	buf_setlblkno(buf_t bp, daddr64_t lblkno) {
	585
	586	bp->b_lblkno = lblkno;
	587	}
	588
	589	dev_t
	590	buf_device(buf_t bp) {
	591
	592	return (bp->b_dev);
	593	}
	594
	595	errno_t
	596	buf_setdevice(buf_t bp, vnode_t vp) {
	597
	598	if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
	599	return EINVAL;
	600	bp->b_dev = vp->v_rdev;
	601
	602	return 0;
	603	}
	604
	605
	606	void *
	607	buf_drvdata(buf_t bp) {
	608
	609	return (bp->b_drvdata);
	610	}
	611
	612	void
	613	buf_setdrvdata(buf_t bp, void *drvdata) {
	614
	615	bp->b_drvdata = drvdata;
	616	}
	617
	618	void *
	619	buf_fsprivate(buf_t bp) {
	620
	621	return (bp->b_fsprivate);
	622	}
	623
	624	void
	625	buf_setfsprivate(buf_t bp, void *fsprivate) {
	626
	627	bp->b_fsprivate = fsprivate;
	628	}
	629
	630	ucred_t
	631	buf_rcred(buf_t bp) {
	632
	633	return (bp->b_rcred);
	634	}
	635
	636	ucred_t
	637	buf_wcred(buf_t bp) {
	638
	639	return (bp->b_wcred);
	640	}
	641
	642	void *
	643	buf_upl(buf_t bp) {
	644
	645	return (bp->b_upl);
	646	}
	647
	648	uint32_t
	649	buf_uploffset(buf_t bp) {
	650
	651	return ((uint32_t)(bp->b_uploffset));
	652	}
	653
	654	proc_t
	655	buf_proc(buf_t bp) {
	656
	657	return (bp->b_proc);
	658	}
	659
	660
	661	errno_t
	662	buf_map(buf_t bp, caddr_t *io_addr)
	663	{
	664	buf_t real_bp;
	665	vm_offset_t vaddr;
	666	kern_return_t kret;
	667
	668	if ( !(bp->b_flags & B_CLUSTER)) {
	669	*io_addr = (caddr_t)bp->b_datap;
	670	return (0);
	671	}
	672	real_bp = (buf_t)(bp->b_real_bp);
	673
	674	if (real_bp && real_bp->b_datap) {
	675	/*
	676	* b_real_bp is only valid if B_CLUSTER is SET
	677	* if it's non-zero, than someone did a cluster_bp call
	678	* if the backing physical pages were already mapped
	679	* in before the call to cluster_bp (non-zero b_datap),
	680	* than we just use that mapping
	681	*/
	682	*io_addr = (caddr_t)real_bp->b_datap;
	683	return (0);
	684	}
	685	kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
	686
	687	if (kret != KERN_SUCCESS) {
	688	*io_addr = NULL;
	689
	690	return(ENOMEM);
	691	}
	692	vaddr += bp->b_uploffset;
	693
	694	*io_addr = (caddr_t)vaddr;
	695
	696	return (0);
	697	}
	698
	699	errno_t
	700	buf_unmap(buf_t bp)
	701	{
	702	buf_t real_bp;
	703	kern_return_t kret;
	704
	705	if ( !(bp->b_flags & B_CLUSTER))
	706	return (0);
	707	/*
	708	* see buf_map for the explanation
	709	*/
	710	real_bp = (buf_t)(bp->b_real_bp);
	711
	712	if (real_bp && real_bp->b_datap)
	713	return (0);
	714
	715	if ((bp->b_lflags & BL_IOBUF) &&
	716	((bp->b_flags & (B_PAGEIO \| B_READ)) != (B_PAGEIO \| B_READ))) {
	717	/*
	718	* ignore pageins... the 'right' thing will
	719	* happen due to the way we handle speculative
	720	* clusters...
	721	*
	722	* when we commit these pages, we'll hit
	723	* it with UPL_COMMIT_INACTIVE which
	724	* will clear the reference bit that got
	725	* turned on when we touched the mapping
	726	*/
	727	bp->b_flags \|= B_AGE;
	728	}
	729	kret = ubc_upl_unmap(bp->b_upl);
	730
	731	if (kret != KERN_SUCCESS)
	732	return (EINVAL);
	733	return (0);
	734	}
	735
	736
	737	void
	738	buf_clear(buf_t bp) {
	739	caddr_t baddr;
	740
	741	if (buf_map(bp, &baddr) == 0) {
	742	bzero(baddr, bp->b_bcount);
	743	buf_unmap(bp);
	744	}
	745	bp->b_resid = 0;
	746	}
	747
	748
	749
	750	/*
	751	* Read or write a buffer that is not contiguous on disk.
	752	* buffer is marked done/error at the conclusion
	753	*/
	754	static int
	755	buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
	756	{
	757	vnode_t vp = buf_vnode(bp);
	758	buf_t io_bp; /* For reading or writing a single block */
	759	int io_direction;
	760	int io_resid;
	761	size_t io_contig_bytes;
	762	daddr64_t io_blkno;
	763	int error = 0;
	764	int bmap_flags;
	765
	766	/*
	767	* save our starting point... the bp was already mapped
	768	* in buf_strategy before we got called
	769	* no sense doing it again.
	770	*/
	771	io_blkno = bp->b_blkno;
	772	/*
	773	* Make sure we redo this mapping for the next I/O
	774	* i.e. this can never be a 'permanent' mapping
	775	*/
	776	bp->b_blkno = bp->b_lblkno;
	777
	778	/*
	779	* Get an io buffer to do the deblocking
	780	*/
	781	io_bp = alloc_io_buf(devvp, 0);
	782
	783	io_bp->b_lblkno = bp->b_lblkno;
	784	io_bp->b_datap = bp->b_datap;
	785	io_resid = bp->b_bcount;
	786	io_direction = bp->b_flags & B_READ;
	787	io_contig_bytes = contig_bytes;
	788
	789	if (bp->b_flags & B_READ)
	790	bmap_flags = VNODE_READ;
	791	else
	792	bmap_flags = VNODE_WRITE;
	793
	794	for (;;) {
	795	if (io_blkno == -1)
	796	/*
	797	* this is unexepected, but we'll allow for it
	798	*/
	799	bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
	800	else {
	801	io_bp->b_bcount = io_contig_bytes;
	802	io_bp->b_bufsize = io_contig_bytes;
	803	io_bp->b_resid = io_contig_bytes;
	804	io_bp->b_blkno = io_blkno;
	805
	806	buf_reset(io_bp, io_direction);
	807
	808	/*
	809	* Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for write
	810	*/
	811
	812	if (!ISSET(bp->b_flags, B_READ))
	813	OSAddAtomic(1, &devvp->v_numoutput);
	814
	815	if ((error = VNOP_STRATEGY(io_bp)))
	816	break;
	817	if ((error = (int)buf_biowait(io_bp)))
	818	break;
	819	if (io_bp->b_resid) {
	820	io_resid -= (io_contig_bytes - io_bp->b_resid);
	821	break;
	822	}
	823	}
	824	if ((io_resid -= io_contig_bytes) == 0)
	825	break;
	826	f_offset += io_contig_bytes;
	827	io_bp->b_datap += io_contig_bytes;
	828
	829	/*
	830	* Map the current position to a physical block number
	831	*/
	832	if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
	833	break;
	834	}
	835	buf_free(io_bp);
	836
	837	if (error)
	838	buf_seterror(bp, error);
	839	bp->b_resid = io_resid;
	840	/*
	841	* This I/O is now complete
	842	*/
	843	buf_biodone(bp);
	844
	845	return error;
	846	}
	847
	848
	849	/*
	850	* struct vnop_strategy_args {
	851	* struct buf *a_bp;
	852	* } *ap;
	853	*/
	854	errno_t
	855	buf_strategy(vnode_t devvp, void *ap)
	856	{
	857	buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
	858	vnode_t vp = bp->b_vp;
	859	int bmap_flags;
	860	errno_t error;
	861
	862	if (vp == NULL \|\| vp->v_type == VCHR \|\| vp->v_type == VBLK)
	863	panic("buf_strategy: b_vp == NULL \|\| vtype == VCHR \| VBLK\n");
	864	/*
	865	* associate the physical device with
	866	* with this buf_t even if we don't
	867	* end up issuing the I/O...
	868	*/
	869	bp->b_dev = devvp->v_rdev;
	870	DTRACE_IO1(start, buf_t, bp);
	871
	872	if (bp->b_flags & B_READ)
	873	bmap_flags = VNODE_READ;
	874	else
	875	bmap_flags = VNODE_WRITE;
	876
	877	if ( !(bp->b_flags & B_CLUSTER)) {
	878
	879	if ( (bp->b_upl) ) {
	880	/*
	881	* we have a UPL associated with this bp
	882	* go through cluster_bp which knows how
	883	* to deal with filesystem block sizes
	884	* that aren't equal to the page size
	885	*/
	886	return (cluster_bp(bp));
	887	}
	888	if (bp->b_blkno == bp->b_lblkno) {
	889	off_t f_offset;
	890	size_t contig_bytes;
	891
	892	if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
	893	buf_seterror(bp, error);
	894	buf_biodone(bp);
	895
	896	return (error);
	897	}
	898	if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
	899	buf_seterror(bp, error);
	900	buf_biodone(bp);
	901
	902	return (error);
	903	}
	904	if (bp->b_blkno == -1)
	905	buf_clear(bp);
	906	else if ((long)contig_bytes < bp->b_bcount)
	907	return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
	908	}
	909	if (bp->b_blkno == -1) {
	910	buf_biodone(bp);
	911	return (0);
	912	}
	913	}
	914	/*
	915	* we can issue the I/O because...
	916	* either B_CLUSTER is set which
	917	* means that the I/O is properly set
	918	* up to be a multiple of the page size, or
	919	* we were able to successfully set up the
	920	* phsyical block mapping
	921	*/
	922	return (VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap));
	923	}
	924
	925
	926
	927	buf_t
	928	buf_alloc(vnode_t vp)
	929	{
	930	return(alloc_io_buf(vp, 0));
	931	}
	932
	933	void
	934	buf_free(buf_t bp) {
	935
	936	free_io_buf(bp);
	937	}
	938
	939
	940	/*
	941	* iterate buffers for the specified vp.
	942	* if BUF_SCAN_DIRTY is set, do the dirty list
	943	* if BUF_SCAN_CLEAN is set, do the clean list
	944	* if neither flag is set, default to BUF_SCAN_DIRTY
	945	* if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
	946	*/
	947
	948	struct buf_iterate_info_t {
	949	int flag;
	950	struct buflists *listhead;
	951	};
	952
	953	void
	954	buf_iterate(vnode_t vp, int (callout)(buf_t, void ), int flags, void *arg)
	955	{
	956	buf_t bp;
	957	int retval;
	958	struct buflists local_iterblkhd;
	959	int lock_flags = BAC_NOWAIT \| BAC_REMOVE;
	960	int notify_busy = flags & BUF_NOTIFY_BUSY;
	961	struct buf_iterate_info_t list[2];
	962	int num_lists, i;
	963
	964	if (flags & BUF_SKIP_LOCKED)
	965	lock_flags \|= BAC_SKIP_LOCKED;
	966	if (flags & BUF_SKIP_NONLOCKED)
	967	lock_flags \|= BAC_SKIP_NONLOCKED;
	968
	969	if ( !(flags & (BUF_SCAN_DIRTY \| BUF_SCAN_CLEAN)))
	970	flags \|= BUF_SCAN_DIRTY;
	971
	972	num_lists = 0;
	973
	974	if (flags & BUF_SCAN_DIRTY) {
	975	list[num_lists].flag = VBI_DIRTY;
	976	list[num_lists].listhead = &vp->v_dirtyblkhd;
	977	num_lists++;
	978	}
	979	if (flags & BUF_SCAN_CLEAN) {
	980	list[num_lists].flag = VBI_CLEAN;
	981	list[num_lists].listhead = &vp->v_cleanblkhd;
	982	num_lists++;
	983	}
	984
	985	for (i = 0; i < num_lists; i++) {
	986	lck_mtx_lock(buf_mtxp);
	987
	988	if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
	989	lck_mtx_unlock(buf_mtxp);
	990	continue;
	991	}
	992	while (!LIST_EMPTY(&local_iterblkhd)) {
	993	bp = LIST_FIRST(&local_iterblkhd);
	994	LIST_REMOVE(bp, b_vnbufs);
	995	LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
	996
	997	if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
	998	if (notify_busy) {
	999	bp = NULL;
	1000	} else {
	1001	continue;
	1002	}
	1003	}
	1004
	1005	lck_mtx_unlock(buf_mtxp);
	1006
	1007	retval = callout(bp, arg);
	1008
	1009	switch (retval) {
	1010	case BUF_RETURNED:
	1011	if (bp)
	1012	buf_brelse(bp);
	1013	break;
	1014	case BUF_CLAIMED:
	1015	break;
	1016	case BUF_RETURNED_DONE:
	1017	if (bp)
	1018	buf_brelse(bp);
	1019	lck_mtx_lock(buf_mtxp);
	1020	goto out;
	1021	case BUF_CLAIMED_DONE:
	1022	lck_mtx_lock(buf_mtxp);
	1023	goto out;
	1024	}
	1025	lck_mtx_lock(buf_mtxp);
	1026	} /* while list has more nodes */
	1027	out:
	1028	buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
	1029	lck_mtx_unlock(buf_mtxp);
	1030	} /* for each list */
	1031	} /* buf_iterate */
	1032
	1033
	1034	/*
	1035	* Flush out and invalidate all buffers associated with a vnode.
	1036	*/
	1037	int
	1038	buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
	1039	{
	1040	buf_t bp;
	1041	int error = 0;
	1042	int must_rescan = 1;
	1043	struct buflists local_iterblkhd;
	1044
	1045	lck_mtx_lock(buf_mtxp);
	1046
	1047	for (;;) {
	1048	if (must_rescan == 0)
	1049	/*
	1050	* the lists may not be empty, but all that's left at this
	1051	* point are metadata or B_LOCKED buffers which are being
	1052	* skipped... we know this because we made it through both
	1053	* the clean and dirty lists without dropping buf_mtxp...
	1054	* each time we drop buf_mtxp we bump "must_rescan"
	1055	*/
	1056	break;
	1057	if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
	1058	break;
	1059	must_rescan = 0;
	1060	/*
	1061	* iterate the clean list
	1062	*/
	1063	if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
	1064	goto try_dirty_list;
	1065	}
	1066	while (!LIST_EMPTY(&local_iterblkhd)) {
	1067	bp = LIST_FIRST(&local_iterblkhd);
	1068
	1069	LIST_REMOVE(bp, b_vnbufs);
	1070	LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
	1071
	1072	/*
	1073	* some filesystems distinguish meta data blocks with a negative logical block #
	1074	*/
	1075	if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 \|\| ISSET(bp->b_flags, B_META)))
	1076	continue;
	1077
	1078	if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE \| BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
	1079	if (error == EDEADLK)
	1080	/*
	1081	* this buffer was marked B_LOCKED...
	1082	* we didn't drop buf_mtxp, so we
	1083	* we don't need to rescan
	1084	*/
	1085	continue;
	1086	if (error == EAGAIN) {
	1087	/*
	1088	* found a busy buffer... we blocked and
	1089	* dropped buf_mtxp, so we're going to
	1090	* need to rescan after this pass is completed
	1091	*/
	1092	must_rescan++;
	1093	continue;
	1094	}
	1095	/*
	1096	* got some kind of 'real' error out of the msleep
	1097	* in buf_acquire_locked, terminate the scan and return the error
	1098	*/
	1099	buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
	1100
	1101	lck_mtx_unlock(buf_mtxp);
	1102	return (error);
	1103	}
	1104	lck_mtx_unlock(buf_mtxp);
	1105
	1106	SET(bp->b_flags, B_INVAL);
	1107	buf_brelse(bp);
	1108
	1109	lck_mtx_lock(buf_mtxp);
	1110
	1111	/*
	1112	* by dropping buf_mtxp, we allow new
	1113	* buffers to be added to the vnode list(s)
	1114	* we'll have to rescan at least once more
	1115	* if the queues aren't empty
	1116	*/
	1117	must_rescan++;
	1118	}
	1119	buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
	1120
	1121	try_dirty_list:
	1122	/*
	1123	* Now iterate on dirty blks
	1124	*/
	1125	if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
	1126	continue;
	1127	}
	1128	while (!LIST_EMPTY(&local_iterblkhd)) {
	1129	bp = LIST_FIRST(&local_iterblkhd);
	1130
	1131	LIST_REMOVE(bp, b_vnbufs);
	1132	LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
	1133
	1134	/*
	1135	* some filesystems distinguish meta data blocks with a negative logical block #
	1136	*/
	1137	if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 \|\| ISSET(bp->b_flags, B_META)))
	1138	continue;
	1139
	1140	if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE \| BAC_SKIP_LOCKED, slpflag, slptimeo)) ) {
	1141	if (error == EDEADLK)
	1142	/*
	1143	* this buffer was marked B_LOCKED...
	1144	* we didn't drop buf_mtxp, so we
	1145	* we don't need to rescan
	1146	*/
	1147	continue;
	1148	if (error == EAGAIN) {
	1149	/*
	1150	* found a busy buffer... we blocked and
	1151	* dropped buf_mtxp, so we're going to
	1152	* need to rescan after this pass is completed
	1153	*/
	1154	must_rescan++;
	1155	continue;
	1156	}
	1157	/*
	1158	* got some kind of 'real' error out of the msleep
	1159	* in buf_acquire_locked, terminate the scan and return the error
	1160	*/
	1161	buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	1162
	1163	lck_mtx_unlock(buf_mtxp);
	1164	return (error);
	1165	}
	1166	lck_mtx_unlock(buf_mtxp);
	1167
	1168	SET(bp->b_flags, B_INVAL);
	1169
	1170	if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
	1171	(void) VNOP_BWRITE(bp);
	1172	else
	1173	buf_brelse(bp);
	1174
	1175	lck_mtx_lock(buf_mtxp);
	1176	/*
	1177	* by dropping buf_mtxp, we allow new
	1178	* buffers to be added to the vnode list(s)
	1179	* we'll have to rescan at least once more
	1180	* if the queues aren't empty
	1181	*/
	1182	must_rescan++;
	1183	}
	1184	buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	1185	}
	1186	lck_mtx_unlock(buf_mtxp);
	1187
	1188	return (0);
	1189	}
	1190
	1191	void
	1192	buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
	1193	buf_t bp;
	1194	int writes_issued = 0;
	1195	errno_t error;
	1196	int busy = 0;
	1197	struct buflists local_iterblkhd;
	1198	int lock_flags = BAC_NOWAIT \| BAC_REMOVE;
	1199
	1200	if (flags & BUF_SKIP_LOCKED)
	1201	lock_flags \|= BAC_SKIP_LOCKED;
	1202	if (flags & BUF_SKIP_NONLOCKED)
	1203	lock_flags \|= BAC_SKIP_NONLOCKED;
	1204	loop:
	1205	lck_mtx_lock(buf_mtxp);
	1206
	1207	if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
	1208	while (!LIST_EMPTY(&local_iterblkhd)) {
	1209	bp = LIST_FIRST(&local_iterblkhd);
	1210	LIST_REMOVE(bp, b_vnbufs);
	1211	LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
	1212
	1213	if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY)
	1214	busy++;
	1215	if (error)
	1216	continue;
	1217	lck_mtx_unlock(buf_mtxp);
	1218
	1219	bp->b_flags &= ~B_LOCKED;
	1220
	1221	/*
	1222	* Wait for I/O associated with indirect blocks to complete,
	1223	* since there is no way to quickly wait for them below.
	1224	*/
	1225	if ((bp->b_vp == vp) \|\| (wait == 0))
	1226	(void) buf_bawrite(bp);
	1227	else
	1228	(void) VNOP_BWRITE(bp);
	1229	writes_issued++;
	1230
	1231	lck_mtx_lock(buf_mtxp);
	1232	}
	1233	buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
	1234	}
	1235	lck_mtx_unlock(buf_mtxp);
	1236
	1237	if (wait) {
	1238	(void)vnode_waitforwrites(vp, 0, 0, 0, msg);
	1239
	1240	if (vp->v_dirtyblkhd.lh_first && busy) {
	1241	/*
	1242	* we had one or more BUSY buffers on
	1243	* the dirtyblock list... most likely
	1244	* these are due to delayed writes that
	1245	* were moved to the bclean queue but
	1246	* have not yet been 'written'.
	1247	* if we issued some writes on the
	1248	* previous pass, we try again immediately
	1249	* if we didn't, we'll sleep for some time
	1250	* to allow the state to change...
	1251	*/
	1252	if (writes_issued == 0) {
	1253	(void)tsleep((caddr_t)&vp->v_numoutput,
	1254	PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
	1255	}
	1256	writes_issued = 0;
	1257	busy = 0;
	1258
	1259	goto loop;
	1260	}
	1261	}
	1262	}
	1263
	1264
	1265	/*
	1266	* called with buf_mtxp held...
	1267	* this lock protects the queue manipulation
	1268	*/
	1269	static int
	1270	buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
	1271	{
	1272	struct buflists * listheadp;
	1273
	1274	if (flags & VBI_DIRTY)
	1275	listheadp = &vp->v_dirtyblkhd;
	1276	else
	1277	listheadp = &vp->v_cleanblkhd;
	1278
	1279	while (vp->v_iterblkflags & VBI_ITER) {
	1280	vp->v_iterblkflags \|= VBI_ITERWANT;
	1281	msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
	1282	}
	1283	if (LIST_EMPTY(listheadp)) {
	1284	LIST_INIT(iterheadp);
	1285	return(EINVAL);
	1286	}
	1287	vp->v_iterblkflags \|= VBI_ITER;
	1288
	1289	iterheadp->lh_first = listheadp->lh_first;
	1290	listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
	1291	LIST_INIT(listheadp);
	1292
	1293	return(0);
	1294	}
	1295
	1296	/*
	1297	* called with buf_mtxp held...
	1298	* this lock protects the queue manipulation
	1299	*/
	1300	static void
	1301	buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
	1302	{
	1303	struct buflists * listheadp;
	1304	buf_t bp;
	1305
	1306	if (flags & VBI_DIRTY)
	1307	listheadp = &vp->v_dirtyblkhd;
	1308	else
	1309	listheadp = &vp->v_cleanblkhd;
	1310
	1311	while (!LIST_EMPTY(iterheadp)) {
	1312	bp = LIST_FIRST(iterheadp);
	1313	LIST_REMOVE(bp, b_vnbufs);
	1314	LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
	1315	}
	1316	vp->v_iterblkflags &= ~VBI_ITER;
	1317
	1318	if (vp->v_iterblkflags & VBI_ITERWANT) {
	1319	vp->v_iterblkflags &= ~VBI_ITERWANT;
	1320	wakeup(&vp->v_iterblkflags);
	1321	}
	1322	}
	1323
	1324
	1325	static void
	1326	bremfree_locked(buf_t bp)
	1327	{
	1328	struct bqueues *dp = NULL;
	1329	int whichq;
	1330	/*
	1331	* We only calculate the head of the freelist when removing
	1332	* the last element of the list as that is the only time that
	1333	* it is needed (e.g. to reset the tail pointer).
	1334	*
	1335	* NB: This makes an assumption about how tailq's are implemented.
	1336	*/
	1337	whichq = bp->b_whichq;
	1338
	1339	if (bp->b_freelist.tqe_next == NULL) {
	1340	dp = &bufqueues[whichq];
	1341
	1342	if (dp->tqh_last != &bp->b_freelist.tqe_next)
	1343	panic("bremfree: lost tail");
	1344	}
	1345	TAILQ_REMOVE(dp, bp, b_freelist);
	1346
	1347	#if BALANCE_QUEUES
	1348	bufqdec(whichq);
	1349	#endif
	1350	if (whichq == BQ_LAUNDRY)
	1351	blaundrycnt--;
	1352
	1353	bp->b_whichq = -1;
	1354	bp->b_timestamp = 0;
	1355	}
	1356
	1357	/*
	1358	* Associate a buffer with a vnode.
	1359	* buf_mtxp must be locked on entry
	1360	*/
	1361	static void
	1362	bgetvp_locked(vnode_t vp, buf_t bp)
	1363	{
	1364
	1365	if (bp->b_vp != vp)
	1366	panic("bgetvp_locked: not free");
	1367
	1368	if (vp->v_type == VBLK \|\| vp->v_type == VCHR)
	1369	bp->b_dev = vp->v_rdev;
	1370	else
	1371	bp->b_dev = NODEV;
	1372	/*
	1373	* Insert onto list for new vnode.
	1374	*/
	1375	bufinsvn(bp, &vp->v_cleanblkhd);
	1376	}
	1377
	1378	/*
	1379	* Disassociate a buffer from a vnode.
	1380	* buf_mtxp must be locked on entry
	1381	*/
	1382	static void
	1383	brelvp_locked(buf_t bp)
	1384	{
	1385	/*
	1386	* Delete from old vnode list, if on one.
	1387	*/
	1388	if (bp->b_vnbufs.le_next != NOLIST)
	1389	bufremvn(bp);
	1390
	1391	bp->b_vp = (vnode_t)NULL;
	1392	}
	1393
	1394	/*
	1395	* Reassign a buffer from one vnode to another.
	1396	* Used to assign file specific control information
	1397	* (indirect blocks) to the vnode to which they belong.
	1398	*/
	1399	static void
	1400	buf_reassign(buf_t bp, vnode_t newvp)
	1401	{
	1402	register struct buflists *listheadp;
	1403
	1404	if (newvp == NULL) {
	1405	printf("buf_reassign: NULL");
	1406	return;
	1407	}
	1408	lck_mtx_lock_spin(buf_mtxp);
	1409
	1410	/*
	1411	* Delete from old vnode list, if on one.
	1412	*/
	1413	if (bp->b_vnbufs.le_next != NOLIST)
	1414	bufremvn(bp);
	1415	/*
	1416	* If dirty, put on list of dirty buffers;
	1417	* otherwise insert onto list of clean buffers.
	1418	*/
	1419	if (ISSET(bp->b_flags, B_DELWRI))
	1420	listheadp = &newvp->v_dirtyblkhd;
	1421	else
	1422	listheadp = &newvp->v_cleanblkhd;
	1423	bufinsvn(bp, listheadp);
	1424
	1425	lck_mtx_unlock(buf_mtxp);
	1426	}
	1427
	1428	static __inline__ void
	1429	bufhdrinit(buf_t bp)
	1430	{
	1431	bzero((char )bp, sizeof bp);
	1432	bp->b_dev = NODEV;
	1433	bp->b_rcred = NOCRED;
	1434	bp->b_wcred = NOCRED;
	1435	bp->b_vnbufs.le_next = NOLIST;
	1436	bp->b_flags = B_INVAL;
	1437
	1438	return;
	1439	}
	1440
	1441	/*
	1442	* Initialize buffers and hash links for buffers.
	1443	*/
	1444	__private_extern__ void
	1445	bufinit(void)
	1446	{
	1447	buf_t bp;
	1448	struct bqueues *dp;
	1449	int i;
	1450
	1451	nbuf_headers = 0;
	1452	/* Initialize the buffer queues ('freelists') and the hash table */
	1453	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
	1454	TAILQ_INIT(dp);
	1455	bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
	1456
	1457	/* Initialize the buffer headers */
	1458	for (i = 0; i < max_nbuf_headers; i++) {
	1459	nbuf_headers++;
	1460	bp = &buf_headers[i];
	1461	bufhdrinit(bp);
	1462
	1463	BLISTNONE(bp);
	1464	dp = &bufqueues[BQ_EMPTY];
	1465	bp->b_whichq = BQ_EMPTY;
	1466	bp->b_timestamp = buf_timestamp();
	1467	binsheadfree(bp, dp, BQ_EMPTY);
	1468	binshash(bp, &invalhash);
	1469	}
	1470
	1471	boot_nbuf_headers = nbuf_headers;
	1472	for (; i < nbuf_headers + niobuf_headers; i++) {
	1473	bp = &buf_headers[i];
	1474	bufhdrinit(bp);
	1475	bp->b_whichq = -1;
	1476	binsheadfree(bp, &iobufqueue, -1);
	1477	}
	1478
	1479	/*
	1480	* allocate lock group attribute and group
	1481	*/
	1482	buf_mtx_grp_attr = lck_grp_attr_alloc_init();
	1483	buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
	1484
	1485	/*
	1486	* allocate the lock attribute
	1487	*/
	1488	buf_mtx_attr = lck_attr_alloc_init();
	1489
	1490	/*
	1491	* allocate and initialize mutex's for the buffer and iobuffer pools
	1492	*/
	1493	buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
	1494	iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
	1495
	1496	if (iobuffer_mtxp == NULL)
	1497	panic("couldn't create iobuffer mutex");
	1498
	1499	if (buf_mtxp == NULL)
	1500	panic("couldn't create buf mutex");
	1501
	1502	/*
	1503	* allocate and initialize cluster specific global locks...
	1504	*/
	1505	cluster_init();
	1506
	1507	printf("using %d buffer headers and %d cluster IO buffer headers\n",
	1508	nbuf_headers, niobuf_headers);
	1509
	1510	/* Set up zones used by the buffer cache */
	1511	bufzoneinit();
	1512
	1513	/* start the bcleanbuf() thread */
	1514	bcleanbuf_thread_init();
	1515
	1516	#if BALANCE_QUEUES
	1517	{
	1518	static void bufq_balance_thread_init(void) __attribute__((section("__TEXT, initcode")));
	1519	/* create a thread to do dynamic buffer queue balancing */
	1520	bufq_balance_thread_init();
	1521	}
	1522	#endif /* notyet */
	1523	}
	1524
	1525
	1526
	1527	/*
	1528	* Zones for the meta data buffers
	1529	*/
	1530
	1531	#define MINMETA 512
	1532	#define MAXMETA 8192
	1533
	1534	struct meta_zone_entry {
	1535	zone_t mz_zone;
	1536	vm_size_t mz_size;
	1537	vm_size_t mz_max;
	1538	const char *mz_name;
	1539	};
	1540
	1541	struct meta_zone_entry meta_zones[] = {
	1542	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
	1543	{NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
	1544	{NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
	1545	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
	1546	{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
	1547	{NULL, 0, 0, "" } /* End */
	1548	};
	1549
	1550	/*
	1551	* Initialize the meta data zones
	1552	*/
	1553	static void
	1554	bufzoneinit(void)
	1555	{
	1556	int i;
	1557
	1558	for (i = 0; meta_zones[i].mz_size != 0; i++) {
	1559	meta_zones[i].mz_zone =
	1560	zinit(meta_zones[i].mz_size,
	1561	meta_zones[i].mz_max,
	1562	PAGE_SIZE,
	1563	meta_zones[i].mz_name);
	1564	}
	1565	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
	1566	}
	1567
	1568	static __inline__ zone_t
	1569	getbufzone(size_t size)
	1570	{
	1571	int i;
	1572
	1573	if ((size % 512) \|\| (size < MINMETA) \|\| (size > MAXMETA))
	1574	panic("getbufzone: incorect size = %lu", size);
	1575
	1576	for (i = 0; meta_zones[i].mz_size != 0; i++) {
	1577	if (meta_zones[i].mz_size >= size)
	1578	break;
	1579	}
	1580
	1581	return (meta_zones[i].mz_zone);
	1582	}
	1583
	1584
	1585
	1586	static struct buf *
	1587	bio_doread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, int async, int queuetype)
	1588	{
	1589	buf_t bp;
	1590
	1591	bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
	1592
	1593	/*
	1594	* If buffer does not have data valid, start a read.
	1595	* Note that if buffer is B_INVAL, buf_getblk() won't return it.
	1596	* Therefore, it's valid if it's I/O has completed or been delayed.
	1597	*/
	1598	if (!ISSET(bp->b_flags, (B_DONE \| B_DELWRI))) {
	1599	struct proc *p;
	1600
	1601	p = current_proc();
	1602
	1603	/* Start I/O for the buffer (keeping credentials). */
	1604	SET(bp->b_flags, B_READ \| async);
	1605	if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
	1606	kauth_cred_ref(cred);
	1607	bp->b_rcred = cred;
	1608	}
	1609
	1610	VNOP_STRATEGY(bp);
	1611
	1612	trace(TR_BREADMISS, pack(vp, size), blkno);
	1613
	1614	/* Pay for the read. */
	1615	if (p && p->p_stats)
	1616	OSIncrementAtomic(&p->p_stats->p_ru.ru_inblock); /* XXX */
	1617
	1618	if (async) {
	1619	/*
	1620	* since we asked for an ASYNC I/O
	1621	* the biodone will do the brelse
	1622	* we don't want to pass back a bp
	1623	* that we don't 'own'
	1624	*/
	1625	bp = NULL;
	1626	}
	1627	} else if (async) {
	1628	buf_brelse(bp);
	1629	bp = NULL;
	1630	}
	1631
	1632	trace(TR_BREADHIT, pack(vp, size), blkno);
	1633
	1634	return (bp);
	1635	}
	1636
	1637	/*
	1638	* Perform the reads for buf_breadn() and buf_meta_breadn().
	1639	* Trivial modification to the breada algorithm presented in Bach (p.55).
	1640	*/
	1641	static errno_t
	1642	do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t rablks, int rasizes,
	1643	int nrablks, ucred_t cred, buf_t *bpp, int queuetype)
	1644	{
	1645	buf_t bp;
	1646	int i;
	1647
	1648	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
	1649
	1650	/*
	1651	* For each of the read-ahead blocks, start a read, if necessary.
	1652	*/
	1653	for (i = 0; i < nrablks; i++) {
	1654	/* If it's in the cache, just go on to next one. */
	1655	if (incore(vp, rablks[i]))
	1656	continue;
	1657
	1658	/* Get a buffer for the read-ahead block */
	1659	(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
	1660	}
	1661
	1662	/* Otherwise, we had to start a read for it; wait until it's valid. */
	1663	return (buf_biowait(bp));
	1664	}
	1665
	1666
	1667	/*
	1668	* Read a disk block.
	1669	* This algorithm described in Bach (p.54).
	1670	*/
	1671	errno_t
	1672	buf_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
	1673	{
	1674	buf_t bp;
	1675
	1676	/* Get buffer for block. */
	1677	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
	1678
	1679	/* Wait for the read to complete, and return result. */
	1680	return (buf_biowait(bp));
	1681	}
	1682
	1683	/*
	1684	* Read a disk block. [bread() for meta-data]
	1685	* This algorithm described in Bach (p.54).
	1686	*/
	1687	errno_t
	1688	buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, ucred_t cred, buf_t *bpp)
	1689	{
	1690	buf_t bp;
	1691
	1692	/* Get buffer for block. */
	1693	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
	1694
	1695	/* Wait for the read to complete, and return result. */
	1696	return (buf_biowait(bp));
	1697	}
	1698
	1699	/*
	1700	* Read-ahead multiple disk blocks. The first is sync, the rest async.
	1701	*/
	1702	errno_t
	1703	buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t rablks, int rasizes, int nrablks, ucred_t cred, buf_t *bpp)
	1704	{
	1705	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
	1706	}
	1707
	1708	/*
	1709	* Read-ahead multiple disk blocks. The first is sync, the rest async.
	1710	* [buf_breadn() for meta-data]
	1711	*/
	1712	errno_t
	1713	buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t rablks, int rasizes, int nrablks, ucred_t cred, buf_t *bpp)
	1714	{
	1715	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
	1716	}
	1717
	1718	/*
	1719	* Block write. Described in Bach (p.56)
	1720	*/
	1721	errno_t
	1722	buf_bwrite(buf_t bp)
	1723	{
	1724	int sync, wasdelayed;
	1725	errno_t rv;
	1726	proc_t p = current_proc();
	1727	vnode_t vp = bp->b_vp;
	1728
	1729	if (bp->b_datap == 0) {
	1730	if (brecover_data(bp) == 0)
	1731	return (0);
	1732	}
	1733	/* Remember buffer type, to switch on it later. */
	1734	sync = !ISSET(bp->b_flags, B_ASYNC);
	1735	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	1736	CLR(bp->b_flags, (B_READ \| B_DONE \| B_ERROR \| B_DELWRI));
	1737
	1738	if (wasdelayed)
	1739	OSAddAtomic(-1, &nbdwrite);
	1740
	1741	if (!sync) {
	1742	/*
	1743	* If not synchronous, pay for the I/O operation and make
	1744	* sure the buf is on the correct vnode queue. We have
	1745	* to do this now, because if we don't, the vnode may not
	1746	* be properly notified that its I/O has completed.
	1747	*/
	1748	if (wasdelayed)
	1749	buf_reassign(bp, vp);
	1750	else
	1751	if (p && p->p_stats)
	1752	OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock); /* XXX */
	1753	}
	1754	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
	1755
	1756	/* Initiate disk write. Make sure the appropriate party is charged. */
	1757
	1758	OSAddAtomic(1, &vp->v_numoutput);
	1759
	1760	VNOP_STRATEGY(bp);
	1761
	1762	if (sync) {
	1763	/*
	1764	* If I/O was synchronous, wait for it to complete.
	1765	*/
	1766	rv = buf_biowait(bp);
	1767
	1768	/*
	1769	* Pay for the I/O operation, if it's not been paid for, and
	1770	* make sure it's on the correct vnode queue. (async operatings
	1771	* were payed for above.)
	1772	*/
	1773	if (wasdelayed)
	1774	buf_reassign(bp, vp);
	1775	else
	1776	if (p && p->p_stats)
	1777	OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock); /* XXX */
	1778
	1779	/* Release the buffer. */
	1780	// XXXdbg - only if the unused bit is set
	1781	if (!ISSET(bp->b_flags, B_NORELSE)) {
	1782	buf_brelse(bp);
	1783	} else {
	1784	CLR(bp->b_flags, B_NORELSE);
	1785	}
	1786
	1787	return (rv);
	1788	} else {
	1789	return (0);
	1790	}
	1791	}
	1792
	1793	int
	1794	vn_bwrite(struct vnop_bwrite_args *ap)
	1795	{
	1796	return (buf_bwrite(ap->a_bp));
	1797	}
	1798
	1799	/*
	1800	* Delayed write.
	1801	*
	1802	* The buffer is marked dirty, but is not queued for I/O.
	1803	* This routine should be used when the buffer is expected
	1804	* to be modified again soon, typically a small write that
	1805	* partially fills a buffer.
	1806	*
	1807	* NB: magnetic tapes cannot be delayed; they must be
	1808	* written in the order that the writes are requested.
	1809	*
	1810	* Described in Leffler, et al. (pp. 208-213).
	1811	*
	1812	* Note: With the abilitty to allocate additional buffer
	1813	* headers, we can get in to the situation where "too" many
	1814	* buf_bdwrite()s can create situation where the kernel can create
	1815	* buffers faster than the disks can service. Doing a buf_bawrite() in
	1816	* cases were we have "too many" outstanding buf_bdwrite()s avoids that.
	1817	*/
	1818	__private_extern__ int
	1819	bdwrite_internal(buf_t bp, int return_error)
	1820	{
	1821	proc_t p = current_proc();
	1822	vnode_t vp = bp->b_vp;
	1823
	1824	/*
	1825	* If the block hasn't been seen before:
	1826	* (1) Mark it as having been seen,
	1827	* (2) Charge for the write.
	1828	* (3) Make sure it's on its vnode's correct block list,
	1829	*/
	1830	if (!ISSET(bp->b_flags, B_DELWRI)) {
	1831	SET(bp->b_flags, B_DELWRI);
	1832	if (p && p->p_stats)
	1833	OSIncrementAtomic(&p->p_stats->p_ru.ru_oublock); /* XXX */
	1834	OSAddAtomic(1, &nbdwrite);
	1835	buf_reassign(bp, vp);
	1836	}
	1837
	1838	/* If this is a tape block, write it the block now. */
	1839	if (ISSET(bp->b_flags, B_TAPE)) {
	1840	VNOP_BWRITE(bp);
	1841	return (0);
	1842	}
	1843
	1844	/*
	1845	* if we're not LOCKED, but the total number of delayed writes
	1846	* has climbed above 75% of the total buffers in the system
	1847	* return an error if the caller has indicated that it can
	1848	* handle one in this case, otherwise schedule the I/O now
	1849	* this is done to prevent us from allocating tons of extra
	1850	* buffers when dealing with virtual disks (i.e. DiskImages),
	1851	* because additional buffers are dynamically allocated to prevent
	1852	* deadlocks from occurring
	1853	*
	1854	* however, can't do a buf_bawrite() if the LOCKED bit is set because the
	1855	* buffer is part of a transaction and can't go to disk until
	1856	* the LOCKED bit is cleared.
	1857	*/
	1858	if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
	1859	if (return_error)
	1860	return (EAGAIN);
	1861	/*
	1862	* If the vnode has "too many" write operations in progress
	1863	* wait for them to finish the IO
	1864	*/
	1865	(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
	1866
	1867	return (buf_bawrite(bp));
	1868	}
	1869
	1870	/* Otherwise, the "write" is done, so mark and release the buffer. */
	1871	SET(bp->b_flags, B_DONE);
	1872	buf_brelse(bp);
	1873	return (0);
	1874	}
	1875
	1876	errno_t
	1877	buf_bdwrite(buf_t bp)
	1878	{
	1879	return (bdwrite_internal(bp, 0));
	1880	}
	1881
	1882
	1883	/*
	1884	* Asynchronous block write; just an asynchronous buf_bwrite().
	1885	*
	1886	* Note: With the abilitty to allocate additional buffer
	1887	* headers, we can get in to the situation where "too" many
	1888	* buf_bawrite()s can create situation where the kernel can create
	1889	* buffers faster than the disks can service.
	1890	* We limit the number of "in flight" writes a vnode can have to
	1891	* avoid this.
	1892	*/
	1893	static int
	1894	bawrite_internal(buf_t bp, int throttle)
	1895	{
	1896	vnode_t vp = bp->b_vp;
	1897
	1898	if (vp) {
	1899	if (throttle)
	1900	/*
	1901	* If the vnode has "too many" write operations in progress
	1902	* wait for them to finish the IO
	1903	*/
	1904	(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
	1905	else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
	1906	/*
	1907	* return to the caller and
	1908	* let him decide what to do
	1909	*/
	1910	return (EWOULDBLOCK);
	1911	}
	1912	SET(bp->b_flags, B_ASYNC);
	1913
	1914	return (VNOP_BWRITE(bp));
	1915	}
	1916
	1917	errno_t
	1918	buf_bawrite(buf_t bp)
	1919	{
	1920	return (bawrite_internal(bp, 1));
	1921	}
	1922
	1923
	1924	/*
	1925	* Release a buffer on to the free lists.
	1926	* Described in Bach (p. 46).
	1927	*/
	1928	void
	1929	buf_brelse(buf_t bp)
	1930	{
	1931	struct bqueues *bufq;
	1932	long whichq;
	1933	upl_t upl;
	1934	int need_wakeup = 0;
	1935	int need_bp_wakeup = 0;
	1936
	1937
	1938	if (bp->b_whichq != -1 \|\| !(bp->b_lflags & BL_BUSY))
	1939	panic("buf_brelse: bad buffer = %p\n", bp);
	1940
	1941	#ifdef JOE_DEBUG
	1942	bp->b_stackbrelse[0] = (int)__builtin_return_address(0);
	1943	bp->b_stackbrelse[1] = (int)__builtin_return_address(1);
	1944	bp->b_stackbrelse[2] = (int)__builtin_return_address(2);
	1945	bp->b_stackbrelse[3] = (int)__builtin_return_address(3);
	1946	bp->b_stackbrelse[4] = (int)__builtin_return_address(4);
	1947	bp->b_stackbrelse[5] = (int)__builtin_return_address(5);
	1948
	1949	bp->b_lastbrelse = current_thread();
	1950	bp->b_tag = 0;
	1951	#endif
	1952	if (bp->b_lflags & BL_IOBUF) {
	1953	free_io_buf(bp);
	1954	return;
	1955	}
	1956
	1957	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) \| DBG_FUNC_START,
	1958	bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_datap,
	1959	bp->b_flags, 0);
	1960
	1961	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	1962
	1963	/*
	1964	* if we're invalidating a buffer that has the B_FILTER bit
	1965	* set then call the b_iodone function so it gets cleaned
	1966	* up properly.
	1967	*
	1968	* the HFS journal code depends on this
	1969	*/
	1970	if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
	1971	if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
	1972	void (iodone_func)(struct buf , void *) = bp->b_iodone;
	1973	void arg = (void )bp->b_transaction;
	1974
	1975	CLR(bp->b_flags, B_FILTER); /* but note callout done */
	1976	bp->b_iodone = NULL;
	1977	bp->b_transaction = NULL;
	1978
	1979	if (iodone_func == NULL) {
	1980	panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
	1981	}
	1982	(*iodone_func)(bp, arg);
	1983	}
	1984	}
	1985	/*
	1986	* I/O is done. Cleanup the UPL state
	1987	*/
	1988	upl = bp->b_upl;
	1989
	1990	if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
	1991	kern_return_t kret;
	1992	int upl_flags;
	1993
	1994	if ( (upl == NULL) ) {
	1995	if ( !ISSET(bp->b_flags, B_INVAL)) {
	1996	kret = ubc_create_upl(bp->b_vp,
	1997	ubc_blktooff(bp->b_vp, bp->b_lblkno),
	1998	bp->b_bufsize,
	1999	&upl,
	2000	NULL,
	2001	UPL_PRECIOUS);
	2002
	2003	if (kret != KERN_SUCCESS)
	2004	panic("brelse: Failed to create UPL");
	2005	#ifdef UPL_DEBUG
	2006	upl_ubc_alias_set(upl, bp, 5);
	2007	#endif /* UPL_DEBUG */
	2008	}
	2009	} else {
	2010	if (bp->b_datap) {
	2011	kret = ubc_upl_unmap(upl);
	2012
	2013	if (kret != KERN_SUCCESS)
	2014	panic("ubc_upl_unmap failed");
	2015	bp->b_datap = (uintptr_t)NULL;
	2016	}
	2017	}
	2018	if (upl) {
	2019	if (bp->b_flags & (B_ERROR \| B_INVAL)) {
	2020	if (bp->b_flags & (B_READ \| B_INVAL))
	2021	upl_flags = UPL_ABORT_DUMP_PAGES;
	2022	else
	2023	upl_flags = 0;
	2024
	2025	ubc_upl_abort(upl, upl_flags);
	2026	} else {
	2027	if (ISSET(bp->b_flags, B_DELWRI \| B_WASDIRTY))
	2028	upl_flags = UPL_COMMIT_SET_DIRTY ;
	2029	else
	2030	upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
	2031
	2032	ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags \|
	2033	UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2034	}
	2035	bp->b_upl = NULL;
	2036	}
	2037	} else {
	2038	if ( (upl) )
	2039	panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
	2040	}
	2041
	2042	/*
	2043	* If it's locked, don't report an error; try again later.
	2044	*/
	2045	if (ISSET(bp->b_flags, (B_LOCKED\|B_ERROR)) == (B_LOCKED\|B_ERROR))
	2046	CLR(bp->b_flags, B_ERROR);
	2047	/*
	2048	* If it's not cacheable, or an error, mark it invalid.
	2049	*/
	2050	if (ISSET(bp->b_flags, (B_NOCACHE\|B_ERROR)))
	2051	SET(bp->b_flags, B_INVAL);
	2052
	2053	if ((bp->b_bufsize <= 0) \|\| ISSET(bp->b_flags, B_INVAL)) {
	2054	/*
	2055	* If it's invalid or empty, dissociate it from its vnode,
	2056	* release its storage if B_META, and
	2057	* clean it up a bit and put it on the EMPTY queue
	2058	*/
	2059	if (ISSET(bp->b_flags, B_DELWRI))
	2060	OSAddAtomic(-1, &nbdwrite);
	2061
	2062	if (ISSET(bp->b_flags, B_META)) {
	2063	if (bp->b_bufsize) {
	2064	if (ISSET(bp->b_flags, B_ZALLOC)) {
	2065	zone_t z;
	2066
	2067	z = getbufzone(bp->b_bufsize);
	2068	zfree(z, (void *)bp->b_datap);
	2069	} else
	2070	kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
	2071
	2072	bp->b_datap = (uintptr_t)NULL;
	2073	bp->b_bufsize = 0;
	2074	}
	2075	}
	2076	/*
	2077	* nuke any credentials we were holding
	2078	*/
	2079	if (IS_VALID_CRED(bp->b_rcred)) {
	2080	kauth_cred_unref(&bp->b_rcred);
	2081	}
	2082	if (IS_VALID_CRED(bp->b_wcred)) {
	2083	kauth_cred_unref(&bp->b_wcred);
	2084	}
	2085	CLR(bp->b_flags, (B_META \| B_ZALLOC \| B_DELWRI \| B_LOCKED \| B_AGE \| B_ASYNC \| B_NOCACHE \| B_FUA));
	2086
	2087	bufq = &bufqueues[BQ_EMPTY];
	2088	bp->b_whichq = BQ_EMPTY;
	2089
	2090	lck_mtx_lock_spin(buf_mtxp);
	2091
	2092	if (bp->b_vp)
	2093	brelvp_locked(bp);
	2094
	2095	bremhash(bp);
	2096	BLISTNONE(bp);
	2097	binshash(bp, &invalhash);
	2098
	2099	binsheadfree(bp, bufq, BQ_EMPTY);
	2100	} else {
	2101	/*
	2102	* It has valid data. Put it on the end of the appropriate
	2103	* queue, so that it'll stick around for as long as possible.
	2104	*/
	2105	if (ISSET(bp->b_flags, B_LOCKED))
	2106	whichq = BQ_LOCKED; /* locked in core */
	2107	else if (ISSET(bp->b_flags, B_META))
	2108	whichq = BQ_META; /* meta-data */
	2109	else if (ISSET(bp->b_flags, B_AGE))
	2110	whichq = BQ_AGE; /* stale but valid data */
	2111	else
	2112	whichq = BQ_LRU; /* valid data */
	2113	bufq = &bufqueues[whichq];
	2114
	2115	CLR(bp->b_flags, (B_AGE \| B_ASYNC \| B_NOCACHE));
	2116	bp->b_whichq = whichq;
	2117	bp->b_timestamp = buf_timestamp();
	2118
	2119	lck_mtx_lock_spin(buf_mtxp);
	2120
	2121	binstailfree(bp, bufq, whichq);
	2122	}
	2123	if (needbuffer) {
	2124	/*
	2125	* needbuffer is a global
	2126	* we're currently using buf_mtxp to protect it
	2127	* delay doing the actual wakeup until after
	2128	* we drop buf_mtxp
	2129	*/
	2130	needbuffer = 0;
	2131	need_wakeup = 1;
	2132	}
	2133	if (ISSET(bp->b_lflags, BL_WANTED)) {
	2134	/*
	2135	* delay the actual wakeup until after we
	2136	* clear BL_BUSY and we've dropped buf_mtxp
	2137	*/
	2138	need_bp_wakeup = 1;
	2139	}
	2140	/*
	2141	* Unlock the buffer.
	2142	*/
	2143	CLR(bp->b_lflags, (BL_BUSY \| BL_WANTED));
	2144
	2145	lck_mtx_unlock(buf_mtxp);
	2146
	2147	if (need_wakeup) {
	2148	/*
	2149	* Wake up any processes waiting for any buffer to become free.
	2150	*/
	2151	wakeup(&needbuffer);
	2152	}
	2153	if (need_bp_wakeup) {
	2154	/*
	2155	* Wake up any proceeses waiting for _this_ buffer to become free.
	2156	*/
	2157	wakeup(bp);
	2158	}
	2159	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) \| DBG_FUNC_END,
	2160	(int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
	2161	}
	2162
	2163	/*
	2164	* Determine if a block is in the cache.
	2165	* Just look on what would be its hash chain. If it's there, return
	2166	* a pointer to it, unless it's marked invalid. If it's marked invalid,
	2167	* we normally don't return the buffer, unless the caller explicitly
	2168	* wants us to.
	2169	*/
	2170	static boolean_t
	2171	incore(vnode_t vp, daddr64_t blkno)
	2172	{
	2173	boolean_t retval;
	2174	struct bufhashhdr *dp;
	2175
	2176	dp = BUFHASH(vp, blkno);
	2177
	2178	lck_mtx_lock_spin(buf_mtxp);
	2179
	2180	if (incore_locked(vp, blkno, dp))
	2181	retval = TRUE;
	2182	else
	2183	retval = FALSE;
	2184	lck_mtx_unlock(buf_mtxp);
	2185
	2186	return (retval);
	2187	}
	2188
	2189
	2190	static buf_t
	2191	incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
	2192	{
	2193	struct buf *bp;
	2194
	2195	/* Search hash chain */
	2196	for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
	2197	if (bp->b_lblkno == blkno && bp->b_vp == vp &&
	2198	!ISSET(bp->b_flags, B_INVAL)) {
	2199	return (bp);
	2200	}
	2201	}
	2202	return (NULL);
	2203	}
	2204
	2205
	2206	/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
	2207	/*
	2208	* Get a block of requested size that is associated with
	2209	* a given vnode and block offset. If it is found in the
	2210	* block cache, mark it as having been found, make it busy
	2211	* and return it. Otherwise, return an empty block of the
	2212	* correct size. It is up to the caller to ensure that the
	2213	* cached blocks be of the correct size.
	2214	*/
	2215	buf_t
	2216	buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
	2217	{
	2218	buf_t bp;
	2219	int err;
	2220	upl_t upl;
	2221	upl_page_info_t *pl;
	2222	kern_return_t kret;
	2223	int ret_only_valid;
	2224	struct timespec ts;
	2225	int upl_flags;
	2226	struct bufhashhdr *dp;
	2227
	2228	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) \| DBG_FUNC_START,
	2229	(int)(blkno * PAGE_SIZE), size, operation, 0, 0);
	2230
	2231	ret_only_valid = operation & BLK_ONLYVALID;
	2232	operation &= ~BLK_ONLYVALID;
	2233	dp = BUFHASH(vp, blkno);
	2234	start:
	2235	lck_mtx_lock_spin(buf_mtxp);
	2236	start_locked:
	2237	if ((bp = incore_locked(vp, blkno, dp))) {
	2238	/*
	2239	* Found in the Buffer Cache
	2240	*/
	2241	if (ISSET(bp->b_lflags, BL_BUSY)) {
	2242	/*
	2243	* but is busy
	2244	*/
	2245	switch (operation) {
	2246	case BLK_READ:
	2247	case BLK_WRITE:
	2248	case BLK_META:
	2249	SET(bp->b_lflags, BL_WANTED);
	2250	bufstats.bufs_busyincore++;
	2251
	2252	lck_mtx_convert_spin(buf_mtxp);
	2253	/*
	2254	* don't retake the mutex after being awakened...
	2255	* the time out is in msecs
	2256	*/
	2257	ts.tv_sec = (slptimeo/1000);
	2258	ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
	2259
	2260	err = msleep(bp, buf_mtxp, slpflag \| PDROP \| (PRIBIO + 1), "buf_getblk", &ts);
	2261
	2262	/*
	2263	* Callers who call with PCATCH or timeout are
	2264	* willing to deal with the NULL pointer
	2265	*/
	2266	if (err && ((slpflag & PCATCH) \|\| ((err == EWOULDBLOCK) && slptimeo)))
	2267	return (NULL);
	2268	goto start;
	2269	/NOTREACHED/
	2270	break;
	2271
	2272	default:
	2273	/*
	2274	* unknown operation requested
	2275	*/
	2276	panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
	2277	/NOTREACHED/
	2278	break;
	2279	}
	2280	} else {
	2281	/*
	2282	* buffer in core and not busy
	2283	*/
	2284	SET(bp->b_lflags, BL_BUSY);
	2285	SET(bp->b_flags, B_CACHE);
	2286
	2287	bremfree_locked(bp);
	2288	bufstats.bufs_incore++;
	2289
	2290	lck_mtx_unlock(buf_mtxp);
	2291	#ifdef JOE_DEBUG
	2292	bp->b_owner = current_thread();
	2293	bp->b_tag = 1;
	2294	#endif
	2295	if ( (bp->b_upl) )
	2296	panic("buffer has UPL, but not marked BUSY: %p", bp);
	2297
	2298	if ( !ret_only_valid && bp->b_bufsize != size)
	2299	allocbuf(bp, size);
	2300
	2301	upl_flags = 0;
	2302	switch (operation) {
	2303	case BLK_WRITE:
	2304	/*
	2305	* "write" operation: let the UPL subsystem
	2306	* know that we intend to modify the buffer
	2307	* cache pages we're gathering.
	2308	*/
	2309	upl_flags \|= UPL_WILL_MODIFY;
	2310	case BLK_READ:
	2311	upl_flags \|= UPL_PRECIOUS;
	2312	if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
	2313	kret = ubc_create_upl(vp,
	2314	ubc_blktooff(vp, bp->b_lblkno),
	2315	bp->b_bufsize,
	2316	&upl,
	2317	&pl,
	2318	upl_flags);
	2319	if (kret != KERN_SUCCESS)
	2320	panic("Failed to create UPL");
	2321
	2322	bp->b_upl = upl;
	2323
	2324	if (upl_valid_page(pl, 0)) {
	2325	if (upl_dirty_page(pl, 0))
	2326	SET(bp->b_flags, B_WASDIRTY);
	2327	else
	2328	CLR(bp->b_flags, B_WASDIRTY);
	2329	} else
	2330	CLR(bp->b_flags, (B_DONE \| B_CACHE \| B_WASDIRTY \| B_DELWRI));
	2331
	2332	kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
	2333
	2334	if (kret != KERN_SUCCESS)
	2335	panic("getblk: ubc_upl_map() failed with (%d)", kret);
	2336	}
	2337	break;
	2338
	2339	case BLK_META:
	2340	/*
	2341	* VM is not involved in IO for the meta data
	2342	* buffer already has valid data
	2343	*/
	2344	break;
	2345
	2346	default:
	2347	panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
	2348	/NOTREACHED/
	2349	break;
	2350	}
	2351	}
	2352	} else { /* not incore() */
	2353	int queue = BQ_EMPTY; /* Start with no preference */
	2354
	2355	if (ret_only_valid) {
	2356	lck_mtx_unlock(buf_mtxp);
	2357	return (NULL);
	2358	}
	2359	lck_mtx_convert_spin(buf_mtxp);
	2360
	2361	if ((vnode_isreg(vp) == 0) \|\| (UBCINFOEXISTS(vp) == 0) /\|\| (vnode_issystem(vp) == 1)/)
	2362	operation = BLK_META;
	2363
	2364	if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
	2365	goto start_locked;
	2366
	2367	/*
	2368	* getnewbuf may block for a number of different reasons...
	2369	* if it does, it's then possible for someone else to
	2370	* create a buffer for the same block and insert it into
	2371	* the hash... if we see it incore at this point we dump
	2372	* the buffer we were working on and start over
	2373	*/
	2374	if (incore_locked(vp, blkno, dp)) {
	2375	SET(bp->b_flags, B_INVAL);
	2376	binshash(bp, &invalhash);
	2377
	2378	lck_mtx_unlock(buf_mtxp);
	2379
	2380	buf_brelse(bp);
	2381	goto start;
	2382	}
	2383	/*
	2384	* NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
	2385	* CALLED! BE CAREFUL.
	2386	*/
	2387
	2388	/*
	2389	* mark the buffer as B_META if indicated
	2390	* so that when buffer is released it will goto META queue
	2391	*/
	2392	if (operation == BLK_META)
	2393	SET(bp->b_flags, B_META);
	2394
	2395	bp->b_blkno = bp->b_lblkno = blkno;
	2396	bp->b_vp = vp;
	2397
	2398	/*
	2399	* Insert in the hash so that incore() can find it
	2400	*/
	2401	binshash(bp, BUFHASH(vp, blkno));
	2402
	2403	bgetvp_locked(vp, bp);
	2404
	2405	lck_mtx_unlock(buf_mtxp);
	2406
	2407	allocbuf(bp, size);
	2408
	2409	upl_flags = 0;
	2410	switch (operation) {
	2411	case BLK_META:
	2412	/*
	2413	* buffer data is invalid...
	2414	*
	2415	* I don't want to have to retake buf_mtxp,
	2416	* so the miss and vmhits counters are done
	2417	* with Atomic updates... all other counters
	2418	* in bufstats are protected with either
	2419	* buf_mtxp or iobuffer_mtxp
	2420	*/
	2421	OSAddAtomic(1, &bufstats.bufs_miss);
	2422	break;
	2423
	2424	case BLK_WRITE:
	2425	/*
	2426	* "write" operation: let the UPL subsystem know
	2427	* that we intend to modify the buffer cache pages
	2428	* we're gathering.
	2429	*/
	2430	upl_flags \|= UPL_WILL_MODIFY;
	2431	case BLK_READ:
	2432	{ off_t f_offset;
	2433	size_t contig_bytes;
	2434	int bmap_flags;
	2435
	2436	if ( (bp->b_upl) )
	2437	panic("bp already has UPL: %p",bp);
	2438
	2439	f_offset = ubc_blktooff(vp, blkno);
	2440
	2441	upl_flags \|= UPL_PRECIOUS;
	2442	kret = ubc_create_upl(vp,
	2443	f_offset,
	2444	bp->b_bufsize,
	2445	&upl,
	2446	&pl,
	2447	upl_flags);
	2448
	2449	if (kret != KERN_SUCCESS)
	2450	panic("Failed to create UPL");
	2451	#ifdef UPL_DEBUG
	2452	upl_ubc_alias_set(upl, bp, 4);
	2453	#endif /* UPL_DEBUG */
	2454	bp->b_upl = upl;
	2455
	2456	if (upl_valid_page(pl, 0)) {
	2457
	2458	if (operation == BLK_READ)
	2459	bmap_flags = VNODE_READ;
	2460	else
	2461	bmap_flags = VNODE_WRITE;
	2462
	2463	SET(bp->b_flags, B_CACHE \| B_DONE);
	2464
	2465	OSAddAtomic(1, &bufstats.bufs_vmhits);
	2466
	2467	bp->b_validoff = 0;
	2468	bp->b_dirtyoff = 0;
	2469
	2470	if (upl_dirty_page(pl, 0)) {
	2471	/* page is dirty */
	2472	SET(bp->b_flags, B_WASDIRTY);
	2473
	2474	bp->b_validend = bp->b_bcount;
	2475	bp->b_dirtyend = bp->b_bcount;
	2476	} else {
	2477	/* page is clean */
	2478	bp->b_validend = bp->b_bcount;
	2479	bp->b_dirtyend = 0;
	2480	}
	2481	/*
	2482	* try to recreate the physical block number associated with
	2483	* this buffer...
	2484	*/
	2485	if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
	2486	panic("getblk: VNOP_BLOCKMAP failed");
	2487	/*
	2488	* if the extent represented by this buffer
	2489	* is not completely physically contiguous on
	2490	* disk, than we can't cache the physical mapping
	2491	* in the buffer header
	2492	*/
	2493	if ((long)contig_bytes < bp->b_bcount)
	2494	bp->b_blkno = bp->b_lblkno;
	2495	} else {
	2496	OSAddAtomic(1, &bufstats.bufs_miss);
	2497	}
	2498	kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
	2499
	2500	if (kret != KERN_SUCCESS)
	2501	panic("getblk: ubc_upl_map() failed with (%d)", kret);
	2502	break;
	2503	}
	2504	default:
	2505	panic("getblk: paging or unknown operation - %x", operation);
	2506	/NOTREACHED/
	2507	break;
	2508	}
	2509	}
	2510	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) \| DBG_FUNC_END,
	2511	(int)bp, (int)bp->b_datap, bp->b_flags, 3, 0);
	2512
	2513	#ifdef JOE_DEBUG
	2514	bp->b_stackgetblk[0] = (int)__builtin_return_address(0);
	2515	bp->b_stackgetblk[1] = (int)__builtin_return_address(1);
	2516	bp->b_stackgetblk[2] = (int)__builtin_return_address(2);
	2517	bp->b_stackgetblk[3] = (int)__builtin_return_address(3);
	2518	bp->b_stackgetblk[4] = (int)__builtin_return_address(4);
	2519	bp->b_stackgetblk[5] = (int)__builtin_return_address(5);
	2520	#endif
	2521	return (bp);
	2522	}
	2523
	2524	/*
	2525	* Get an empty, disassociated buffer of given size.
	2526	*/
	2527	buf_t
	2528	buf_geteblk(int size)
	2529	{
	2530	buf_t bp;
	2531	int queue = BQ_EMPTY;
	2532
	2533	lck_mtx_lock(buf_mtxp);
	2534
	2535	while ((bp = getnewbuf(0, 0, &queue)) == 0)
	2536	;
	2537	SET(bp->b_flags, (B_META\|B_INVAL));
	2538
	2539	#if DIAGNOSTIC
	2540	assert(queue == BQ_EMPTY);
	2541	#endif /* DIAGNOSTIC */
	2542	/* XXX need to implement logic to deal with other queues */
	2543
	2544	binshash(bp, &invalhash);
	2545	bufstats.bufs_eblk++;
	2546
	2547	lck_mtx_unlock(buf_mtxp);
	2548
	2549	allocbuf(bp, size);
	2550
	2551	return (bp);
	2552	}
	2553
	2554
	2555	/*
	2556	* With UBC, there is no need to expand / shrink the file data
	2557	* buffer. The VM uses the same pages, hence no waste.
	2558	* All the file data buffers can have one size.
	2559	* In fact expand / shrink would be an expensive operation.
	2560	*
	2561	* Only exception to this is meta-data buffers. Most of the
	2562	* meta data operations are smaller than PAGE_SIZE. Having the
	2563	* meta-data buffers grow and shrink as needed, optimizes use
	2564	* of the kernel wired memory.
	2565	*/
	2566
	2567	int
	2568	allocbuf(buf_t bp, int size)
	2569	{
	2570	vm_size_t desired_size;
	2571
	2572	desired_size = roundup(size, CLBYTES);
	2573
	2574	if (desired_size < PAGE_SIZE)
	2575	desired_size = PAGE_SIZE;
	2576	if (desired_size > MAXBSIZE)
	2577	panic("allocbuf: buffer larger than MAXBSIZE requested");
	2578
	2579	if (ISSET(bp->b_flags, B_META)) {
	2580	zone_t zprev, z;
	2581	int nsize = roundup(size, MINMETA);
	2582
	2583	if (bp->b_datap) {
	2584	vm_offset_t elem = (vm_offset_t)bp->b_datap;
	2585
	2586	if (ISSET(bp->b_flags, B_ZALLOC)) {
	2587	if (bp->b_bufsize < nsize) {
	2588	/* reallocate to a bigger size */
	2589
	2590	zprev = getbufzone(bp->b_bufsize);
	2591	if (nsize <= MAXMETA) {
	2592	desired_size = nsize;
	2593	z = getbufzone(nsize);
	2594	/* b_datap not really a ptr */
	2595	(void *)(&bp->b_datap) = zalloc(z);
	2596	} else {
	2597	bp->b_datap = (uintptr_t)NULL;
	2598	kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
	2599	CLR(bp->b_flags, B_ZALLOC);
	2600	}
	2601	bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
	2602	zfree(zprev, (void *)elem);
	2603	} else {
	2604	desired_size = bp->b_bufsize;
	2605	}
	2606
	2607	} else {
	2608	if ((vm_size_t)bp->b_bufsize < desired_size) {
	2609	/* reallocate to a bigger size */
	2610	bp->b_datap = (uintptr_t)NULL;
	2611	kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
	2612	bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
	2613	kmem_free(kernel_map, elem, bp->b_bufsize);
	2614	} else {
	2615	desired_size = bp->b_bufsize;
	2616	}
	2617	}
	2618	} else {
	2619	/* new allocation */
	2620	if (nsize <= MAXMETA) {
	2621	desired_size = nsize;
	2622	z = getbufzone(nsize);
	2623	/* b_datap not really a ptr */
	2624	(void *)(&bp->b_datap) = zalloc(z);
	2625	SET(bp->b_flags, B_ZALLOC);
	2626	} else
	2627	kmem_alloc_wired(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
	2628	}
	2629
	2630	if (bp->b_datap == 0)
	2631	panic("allocbuf: NULL b_datap");
	2632	}
	2633	bp->b_bufsize = desired_size;
	2634	bp->b_bcount = size;
	2635
	2636	return (0);
	2637	}
	2638
	2639	/*
	2640	* Get a new buffer from one of the free lists.
	2641	*
	2642	* Request for a queue is passes in. The queue from which the buffer was taken
	2643	* from is returned. Out of range queue requests get BQ_EMPTY. Request for
	2644	* BQUEUE means no preference. Use heuristics in that case.
	2645	* Heuristics is as follows:
	2646	* Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
	2647	* If none available block till one is made available.
	2648	* If buffers available on both BQ_AGE and BQ_LRU, check the timestamps.
	2649	* Pick the most stale buffer.
	2650	* If found buffer was marked delayed write, start the async. write
	2651	* and restart the search.
	2652	* Initialize the fields and disassociate the buffer from the vnode.
	2653	* Remove the buffer from the hash. Return the buffer and the queue
	2654	* on which it was found.
	2655	*
	2656	* buf_mtxp is held upon entry
	2657	* returns with buf_mtxp locked
	2658	*/
	2659
	2660	static buf_t
	2661	getnewbuf(int slpflag, int slptimeo, int * queue)
	2662	{
	2663	buf_t bp;
	2664	buf_t lru_bp;
	2665	buf_t age_bp;
	2666	buf_t meta_bp;
	2667	int age_time, lru_time, bp_time, meta_time;
	2668	int req = queue; / save it for restarts */
	2669	struct timespec ts;
	2670
	2671	start:
	2672	/*
	2673	* invalid request gets empty queue
	2674	*/
	2675	if ((queue >= BQUEUES) \|\| (queue < 0)
	2676	\|\| (queue == BQ_LAUNDRY) \|\| (queue == BQ_LOCKED))
	2677	*queue = BQ_EMPTY;
	2678
	2679
	2680	if (queue == BQ_EMPTY && (bp = bufqueues[queue].tqh_first))
	2681	goto found;
	2682
	2683	/*
	2684	* need to grow number of bufs, add another one rather than recycling
	2685	*/
	2686	if (nbuf_headers < max_nbuf_headers) {
	2687	/*
	2688	* Increment count now as lock
	2689	* is dropped for allocation.
	2690	* That avoids over commits
	2691	*/
	2692	nbuf_headers++;
	2693	goto add_newbufs;
	2694	}
	2695	/* Try for the requested queue first */
	2696	bp = bufqueues[*queue].tqh_first;
	2697	if (bp)
	2698	goto found;
	2699
	2700	/* Unable to use requested queue */
	2701	age_bp = bufqueues[BQ_AGE].tqh_first;
	2702	lru_bp = bufqueues[BQ_LRU].tqh_first;
	2703	meta_bp = bufqueues[BQ_META].tqh_first;
	2704
	2705	if (!age_bp && !lru_bp && !meta_bp) {
	2706	/*
	2707	* Unavailble on AGE or LRU or META queues
	2708	* Try the empty list first
	2709	*/
	2710	bp = bufqueues[BQ_EMPTY].tqh_first;
	2711	if (bp) {
	2712	*queue = BQ_EMPTY;
	2713	goto found;
	2714	}
	2715	/*
	2716	* We have seen is this is hard to trigger.
	2717	* This is an overcommit of nbufs but needed
	2718	* in some scenarios with diskiamges
	2719	*/
	2720
	2721	add_newbufs:
	2722	lck_mtx_unlock(buf_mtxp);
	2723
	2724	/* Create a new temporary buffer header */
	2725	bp = (struct buf *)zalloc(buf_hdr_zone);
	2726
	2727	if (bp) {
	2728	bufhdrinit(bp);
	2729	bp->b_whichq = BQ_EMPTY;
	2730	bp->b_timestamp = buf_timestamp();
	2731	BLISTNONE(bp);
	2732	SET(bp->b_flags, B_HDRALLOC);
	2733	*queue = BQ_EMPTY;
	2734	}
	2735	lck_mtx_lock(buf_mtxp);
	2736
	2737	if (bp) {
	2738	binshash(bp, &invalhash);
	2739	binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
	2740	buf_hdr_count++;
	2741	goto found;
	2742	}
	2743	/* subtract already accounted bufcount */
	2744	nbuf_headers--;
	2745
	2746	bufstats.bufs_sleeps++;
	2747
	2748	/* wait for a free buffer of any kind */
	2749	needbuffer = 1;
	2750	/* hz value is 100 */
	2751	ts.tv_sec = (slptimeo/1000);
	2752	/* the hz value is 100; which leads to 10ms */
	2753	ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
	2754	msleep(&needbuffer, buf_mtxp, slpflag\|(PRIBIO+1), "getnewbuf", &ts);
	2755	return (NULL);
	2756	}
	2757
	2758	/* Buffer available either on AGE or LRU or META */
	2759	bp = NULL;
	2760	*queue = -1;
	2761
	2762	/* Buffer available either on AGE or LRU */
	2763	if (!age_bp) {
	2764	bp = lru_bp;
	2765	*queue = BQ_LRU;
	2766	} else if (!lru_bp) {
	2767	bp = age_bp;
	2768	*queue = BQ_AGE;
	2769	} else { /* buffer available on both AGE and LRU */
	2770	int t = buf_timestamp();
	2771
	2772	age_time = t - age_bp->b_timestamp;
	2773	lru_time = t - lru_bp->b_timestamp;
	2774	if ((age_time < 0) \|\| (lru_time < 0)) { /* time set backwards */
	2775	bp = age_bp;
	2776	*queue = BQ_AGE;
	2777	/*
	2778	* we should probably re-timestamp eveything in the
	2779	* queues at this point with the current time
	2780	*/
	2781	} else {
	2782	if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
	2783	bp = lru_bp;
	2784	*queue = BQ_LRU;
	2785	} else {
	2786	bp = age_bp;
	2787	*queue = BQ_AGE;
	2788	}
	2789	}
	2790	}
	2791
	2792	if (!bp) { /* Neither on AGE nor on LRU */
	2793	bp = meta_bp;
	2794	*queue = BQ_META;
	2795	} else if (meta_bp) {
	2796	int t = buf_timestamp();
	2797
	2798	bp_time = t - bp->b_timestamp;
	2799	meta_time = t - meta_bp->b_timestamp;
	2800
	2801	if (!(bp_time < 0) && !(meta_time < 0)) {
	2802	/* time not set backwards */
	2803	int bp_is_stale;
	2804	bp_is_stale = (*queue == BQ_LRU) ?
	2805	lru_is_stale : age_is_stale;
	2806
	2807	if ((meta_time >= meta_is_stale) &&
	2808	(bp_time < bp_is_stale)) {
	2809	bp = meta_bp;
	2810	*queue = BQ_META;
	2811	}
	2812	}
	2813	}
	2814	found:
	2815	if (ISSET(bp->b_flags, B_LOCKED) \|\| ISSET(bp->b_lflags, BL_BUSY))
	2816	panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%lx)\n", bp, bp->b_flags);
	2817
	2818	/* Clean it */
	2819	if (bcleanbuf(bp)) {
	2820	/*
	2821	* moved to the laundry thread, buffer not ready
	2822	*/
	2823	*queue = req;
	2824	goto start;
	2825	}
	2826	return (bp);
	2827	}
	2828
	2829
	2830	/*
	2831	* Clean a buffer.
	2832	* Returns 0 is buffer is ready to use,
	2833	* Returns 1 if issued a buf_bawrite() to indicate
	2834	* that the buffer is not ready.
	2835	*
	2836	* buf_mtxp is held upon entry
	2837	* returns with buf_mtxp locked
	2838	*/
	2839	static int
	2840	bcleanbuf(buf_t bp)
	2841	{
	2842	/* Remove from the queue */
	2843	bremfree_locked(bp);
	2844
	2845	#ifdef JOE_DEBUG
	2846	bp->b_owner = current_thread();
	2847	bp->b_tag = 2;
	2848	#endif
	2849	/*
	2850	* If buffer was a delayed write, start the IO by queuing
	2851	* it on the LAUNDRY queue, and return 1
	2852	*/
	2853	if (ISSET(bp->b_flags, B_DELWRI)) {
	2854	bp->b_whichq = BQ_LAUNDRY;
	2855	bp->b_timestamp = buf_timestamp();
	2856	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	2857	blaundrycnt++;
	2858
	2859	lck_mtx_unlock(buf_mtxp);
	2860
	2861	wakeup(&bufqueues[BQ_LAUNDRY]);
	2862	/*
	2863	* and give it a chance to run
	2864	*/
	2865	(void)thread_block(THREAD_CONTINUE_NULL);
	2866
	2867	lck_mtx_lock(buf_mtxp);
	2868
	2869	return (1);
	2870	}
	2871	#ifdef JOE_DEBUG
	2872	bp->b_owner = current_thread();
	2873	bp->b_tag = 8;
	2874	#endif
	2875	/*
	2876	* Buffer is no longer on any free list... we own it
	2877	*/
	2878	SET(bp->b_lflags, BL_BUSY);
	2879
	2880	bremhash(bp);
	2881
	2882	/*
	2883	* disassociate us from our vnode, if we had one...
	2884	*/
	2885	if (bp->b_vp)
	2886	brelvp_locked(bp);
	2887
	2888	lck_mtx_unlock(buf_mtxp);
	2889
	2890	BLISTNONE(bp);
	2891
	2892	if (ISSET(bp->b_flags, B_META)) {
	2893	vm_offset_t elem;
	2894
	2895	elem = (vm_offset_t)bp->b_datap;
	2896	bp->b_datap = (uintptr_t)0xdeadbeef;
	2897
	2898	if (ISSET(bp->b_flags, B_ZALLOC)) {
	2899	zone_t z;
	2900
	2901	z = getbufzone(bp->b_bufsize);
	2902	zfree(z, (void *)elem);
	2903	} else
	2904	kmem_free(kernel_map, elem, bp->b_bufsize);
	2905	}
	2906
	2907	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	2908
	2909	/* clear out various other fields */
	2910	bp->b_bufsize = 0;
	2911	bp->b_datap = (uintptr_t)NULL;
	2912	bp->b_upl = (void *)NULL;
	2913	/*
	2914	* preserve the state of whether this buffer
	2915	* was allocated on the fly or not...
	2916	* the only other flag that should be set at
	2917	* this point is BL_BUSY...
	2918	*/
	2919	#ifdef JOE_DEBUG
	2920	bp->b_owner = current_thread();
	2921	bp->b_tag = 3;
	2922	#endif
	2923	bp->b_lflags = BL_BUSY;
	2924	bp->b_flags = (bp->b_flags & B_HDRALLOC);
	2925	bp->b_dev = NODEV;
	2926	bp->b_blkno = bp->b_lblkno = 0;
	2927	bp->b_iodone = NULL;
	2928	bp->b_error = 0;
	2929	bp->b_resid = 0;
	2930	bp->b_bcount = 0;
	2931	bp->b_dirtyoff = bp->b_dirtyend = 0;
	2932	bp->b_validoff = bp->b_validend = 0;
	2933
	2934	/* nuke any credentials we were holding */
	2935	if (IS_VALID_CRED(bp->b_rcred)) {
	2936	kauth_cred_unref(&bp->b_rcred);
	2937	}
	2938	if (IS_VALID_CRED(bp->b_wcred)) {
	2939	kauth_cred_unref(&bp->b_wcred);
	2940	}
	2941	lck_mtx_lock(buf_mtxp);
	2942
	2943	return (0);
	2944	}
	2945
	2946
	2947
	2948	errno_t
	2949	buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
	2950	{
	2951	buf_t bp;
	2952	errno_t error;
	2953	struct bufhashhdr *dp;
	2954
	2955	dp = BUFHASH(vp, lblkno);
	2956
	2957	lck_mtx_lock(buf_mtxp);
	2958	relook:
	2959	if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
	2960	lck_mtx_unlock(buf_mtxp);
	2961	return (0);
	2962	}
	2963	if (ISSET(bp->b_lflags, BL_BUSY)) {
	2964	if ( !ISSET(flags, BUF_WAIT)) {
	2965	lck_mtx_unlock(buf_mtxp);
	2966	return (EBUSY);
	2967	}
	2968	SET(bp->b_lflags, BL_WANTED);
	2969
	2970	error = msleep((caddr_t)bp, buf_mtxp, (PRIBIO + 1), "buf_invalblkno", NULL);
	2971
	2972	if (error) {
	2973	lck_mtx_unlock(buf_mtxp);
	2974	return (error);
	2975	}
	2976	goto relook;
	2977	}
	2978	bremfree_locked(bp);
	2979	SET(bp->b_lflags, BL_BUSY);
	2980	SET(bp->b_flags, B_INVAL);
	2981	#ifdef JOE_DEBUG
	2982	bp->b_owner = current_thread();
	2983	bp->b_tag = 4;
	2984	#endif
	2985	lck_mtx_unlock(buf_mtxp);
	2986	buf_brelse(bp);
	2987
	2988	return (0);
	2989	}
	2990
	2991
	2992	void
	2993	buf_drop(buf_t bp)
	2994	{
	2995	int need_wakeup = 0;
	2996
	2997	lck_mtx_lock_spin(buf_mtxp);
	2998
	2999	if (ISSET(bp->b_lflags, BL_WANTED)) {
	3000	/*
	3001	* delay the actual wakeup until after we
	3002	* clear BL_BUSY and we've dropped buf_mtxp
	3003	*/
	3004	need_wakeup = 1;
	3005	}
	3006	#ifdef JOE_DEBUG
	3007	bp->b_owner = current_thread();
	3008	bp->b_tag = 9;
	3009	#endif
	3010	/*
	3011	* Unlock the buffer.
	3012	*/
	3013	CLR(bp->b_lflags, (BL_BUSY \| BL_WANTED));
	3014
	3015	lck_mtx_unlock(buf_mtxp);
	3016
	3017	if (need_wakeup) {
	3018	/*
	3019	* Wake up any proceeses waiting for _this_ buffer to become free.
	3020	*/
	3021	wakeup(bp);
	3022	}
	3023	}
	3024
	3025
	3026	errno_t
	3027	buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
	3028	errno_t error;
	3029
	3030	lck_mtx_lock(buf_mtxp);
	3031
	3032	error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
	3033
	3034	lck_mtx_unlock(buf_mtxp);
	3035
	3036	return (error);
	3037	}
	3038
	3039
	3040	static errno_t
	3041	buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
	3042	{
	3043	errno_t error;
	3044	struct timespec ts;
	3045
	3046	if (ISSET(bp->b_flags, B_LOCKED)) {
	3047	if ((flags & BAC_SKIP_LOCKED))
	3048	return (EDEADLK);
	3049	} else {
	3050	if ((flags & BAC_SKIP_NONLOCKED))
	3051	return (EDEADLK);
	3052	}
	3053	if (ISSET(bp->b_lflags, BL_BUSY)) {
	3054	/*
	3055	* since the mutex_lock may block, the buffer
	3056	* may become BUSY, so we need to
	3057	* recheck for a NOWAIT request
	3058	*/
	3059	if (flags & BAC_NOWAIT)
	3060	return (EBUSY);
	3061	SET(bp->b_lflags, BL_WANTED);
	3062
	3063	/* the hz value is 100; which leads to 10ms */
	3064	ts.tv_sec = (slptimeo/100);
	3065	ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
	3066	error = msleep((caddr_t)bp, buf_mtxp, slpflag \| (PRIBIO + 1), "buf_acquire", &ts);
	3067
	3068	if (error)
	3069	return (error);
	3070	return (EAGAIN);
	3071	}
	3072	if (flags & BAC_REMOVE)
	3073	bremfree_locked(bp);
	3074	SET(bp->b_lflags, BL_BUSY);
	3075	#ifdef JOE_DEBUG
	3076	bp->b_owner = current_thread();
	3077	bp->b_tag = 5;
	3078	#endif
	3079	return (0);
	3080	}
	3081
	3082
	3083	/*
	3084	* Wait for operations on the buffer to complete.
	3085	* When they do, extract and return the I/O's error value.
	3086	*/
	3087	errno_t
	3088	buf_biowait(buf_t bp)
	3089	{
	3090	lck_mtx_lock(buf_mtxp);
	3091
	3092	DTRACE_IO1(wait__start, buf_t, bp);
	3093	while (!ISSET(bp->b_flags, B_DONE))
	3094	(void) msleep(bp, buf_mtxp, (PRIBIO+1), "buf_biowait", NULL);
	3095	DTRACE_IO1(wait__done, buf_t, bp);
	3096
	3097	lck_mtx_unlock(buf_mtxp);
	3098
	3099	/* check for interruption of I/O (e.g. via NFS), then errors. */
	3100	if (ISSET(bp->b_flags, B_EINTR)) {
	3101	CLR(bp->b_flags, B_EINTR);
	3102	return (EINTR);
	3103	} else if (ISSET(bp->b_flags, B_ERROR))
	3104	return (bp->b_error ? bp->b_error : EIO);
	3105	else
	3106	return (0);
	3107	}
	3108
	3109	/*
	3110	* Wait for the callback operation on a B_CALL buffer to complete.
	3111	*/
	3112	void
	3113	buf_biowait_callback(buf_t bp)
	3114	{
	3115	lck_mtx_lock(buf_mtxp);
	3116
	3117	DTRACE_IO1(wait__start, buf_t, bp);
	3118	while (!ISSET(bp->b_lflags, BL_CALLDONE))
	3119	(void) msleep(bp, buf_mtxp, (PRIBIO+1), "buf_biowait", NULL);
	3120	DTRACE_IO1(wait__done, buf_t, bp);
	3121
	3122	lck_mtx_unlock(buf_mtxp);
	3123	}
	3124
	3125	/*
	3126	* Mark I/O complete on a buffer.
	3127	*
	3128	* If a callback has been requested, e.g. the pageout
	3129	* daemon, do so. Otherwise, awaken waiting processes.
	3130	*
	3131	* [ Leffler, et al., says on p.247:
	3132	* "This routine wakes up the blocked process, frees the buffer
	3133	* for an asynchronous write, or, for a request by the pagedaemon
	3134	* process, invokes a procedure specified in the buffer structure" ]
	3135	*
	3136	* In real life, the pagedaemon (or other system processes) wants
	3137	* to do async stuff to, and doesn't want the buffer buf_brelse()'d.
	3138	* (for swap pager, that puts swap buffers on the free lists (!!!),
	3139	* for the vn device, that puts malloc'd buffers on the free lists!)
	3140	*/
	3141	extern struct timeval priority_IO_timestamp_for_root;
	3142	extern int hard_throttle_on_root;
	3143
	3144	void
	3145	buf_biodone(buf_t bp)
	3146	{
	3147	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) \| DBG_FUNC_START,
	3148	(int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
	3149
	3150	if (ISSET(bp->b_flags, B_DONE))
	3151	panic("biodone already");
	3152
	3153	if (ISSET(bp->b_flags, B_ERROR)) {
	3154	fslog_io_error(bp);
	3155	}
	3156
	3157	if (bp->b_vp && bp->b_vp->v_mount && (bp->b_flags & B_READ) == 0) {
	3158	update_last_io_time(bp->b_vp->v_mount);
	3159	}
	3160
	3161	if (kdebug_enable) {
	3162	int code = DKIO_DONE;
	3163
	3164	if (bp->b_flags & B_READ)
	3165	code \|= DKIO_READ;
	3166	if (bp->b_flags & B_ASYNC)
	3167	code \|= DKIO_ASYNC;
	3168
	3169	if (bp->b_flags & B_META)
	3170	code \|= DKIO_META;
	3171	else if (bp->b_flags & B_PAGEIO)
	3172	code \|= DKIO_PAGING;
	3173
	3174	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) \| DBG_FUNC_NONE,
	3175	(unsigned int)bp, (unsigned int)bp->b_vp,
	3176	bp->b_resid, bp->b_error, 0);
	3177	}
	3178	if ((bp->b_vp != NULLVP) &&
	3179	((bp->b_flags & (B_PAGEIO \| B_READ)) == (B_PAGEIO \| B_READ)) &&
	3180	(bp->b_vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) {
	3181	microuptime(&priority_IO_timestamp_for_root);
	3182	hard_throttle_on_root = 0;
	3183	}
	3184	/*
	3185	* I/O was done, so don't believe
	3186	* the DIRTY state from VM anymore
	3187	*/
	3188	CLR(bp->b_flags, B_WASDIRTY);
	3189	DTRACE_IO1(done, buf_t, bp);
	3190
	3191	if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
	3192	/*
	3193	* wake up any writer's blocked
	3194	* on throttle or waiting for I/O
	3195	* to drain
	3196	*/
	3197	vnode_writedone(bp->b_vp);
	3198
	3199	if (ISSET(bp->b_flags, (B_CALL \| B_FILTER))) { /* if necessary, call out */
	3200	void (iodone_func)(struct buf , void *) = bp->b_iodone;
	3201	void arg = (void )bp->b_transaction;
	3202	int callout = ISSET(bp->b_flags, B_CALL);
	3203
	3204	CLR(bp->b_flags, (B_CALL \| B_FILTER)); /* filters and callouts are one-shot */
	3205	bp->b_iodone = NULL;
	3206	bp->b_transaction = NULL;
	3207
	3208	if (iodone_func == NULL) {
	3209	panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
	3210	} else {
	3211	if (callout)
	3212	SET(bp->b_flags, B_DONE); /* note that it's done */
	3213	(*iodone_func)(bp, arg);
	3214	}
	3215	if (callout) {
	3216	int need_wakeup = 0;
	3217
	3218	/*
	3219	* assumes that the callback function takes
	3220	* ownership of the bp and deals with releasing it if necessary
	3221	* BL_WANTED indicates that we've decided to wait on the
	3222	* completion of this I/O in a synchronous manner... we
	3223	* still call the callback function, but in addition we
	3224	* will do a wakeup... BL_CALLDONE indicates that the callback
	3225	* routine has completed and its ok for the waiter to take
	3226	* 'ownership' of this bp back
	3227	*/
	3228	lck_mtx_lock_spin(buf_mtxp);
	3229
	3230	if (bp->b_lflags & BL_WANTED) {
	3231	CLR(bp->b_lflags, BL_WANTED);
	3232	need_wakeup = 1;
	3233	}
	3234	SET(bp->b_lflags, BL_CALLDONE);
	3235
	3236	lck_mtx_unlock(buf_mtxp);
	3237
	3238	if (need_wakeup)
	3239	wakeup(bp);
	3240
	3241	goto biodone_done;
	3242	}
	3243	/*
	3244	* in this case the call back function is acting
	3245	* strictly as a filter... it does not take
	3246	* ownership of the bp and is expecting us
	3247	* to finish cleaning up... this is currently used
	3248	* by the HFS journaling code
	3249	*/
	3250	}
	3251	if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
	3252	SET(bp->b_flags, B_DONE); /* note that it's done */
	3253
	3254	buf_brelse(bp);
	3255	} else { /* or just wakeup the buffer */
	3256	/*
	3257	* by taking the mutex, we serialize
	3258	* the buf owner calling buf_biowait so that we'll
	3259	* only see him in one of 2 states...
	3260	* state 1: B_DONE wasn't set and he's
	3261	* blocked in msleep
	3262	* state 2: he's blocked trying to take the
	3263	* mutex before looking at B_DONE
	3264	* BL_WANTED is cleared in case anyone else
	3265	* is blocked waiting for the buffer... note
	3266	* that we haven't cleared B_BUSY yet, so if
	3267	* they do get to run, their going to re-set
	3268	* BL_WANTED and go back to sleep
	3269	*/
	3270	lck_mtx_lock_spin(buf_mtxp);
	3271
	3272	CLR(bp->b_lflags, BL_WANTED);
	3273	SET(bp->b_flags, B_DONE); /* note that it's done */
	3274
	3275	lck_mtx_unlock(buf_mtxp);
	3276
	3277	wakeup(bp);
	3278	}
	3279	biodone_done:
	3280	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) \| DBG_FUNC_END,
	3281	(int)bp, (int)bp->b_datap, bp->b_flags, 0, 0);
	3282	}
	3283
	3284	/*
	3285	* Return a count of buffers on the "locked" queue.
	3286	*/
	3287	int
	3288	count_lock_queue(void)
	3289	{
	3290	buf_t bp;
	3291	int n = 0;
	3292
	3293	lck_mtx_lock(buf_mtxp);
	3294
	3295	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
	3296	bp = bp->b_freelist.tqe_next)
	3297	n++;
	3298	lck_mtx_unlock(buf_mtxp);
	3299
	3300	return (n);
	3301	}
	3302
	3303	/*
	3304	* Return a count of 'busy' buffers. Used at the time of shutdown.
	3305	*/
	3306	int
	3307	count_busy_buffers(void)
	3308	{
	3309	buf_t bp;
	3310	int nbusy = 0;
	3311
	3312	lck_mtx_lock(buf_mtxp);
	3313	for (bp = &buf_headers[boot_nbuf_headers]; --bp >= buf_headers; )
	3314	if (!ISSET(bp->b_flags, B_INVAL) && ISSET(bp->b_lflags, BL_BUSY))
	3315	nbusy++;
	3316	lck_mtx_unlock(buf_mtxp);
	3317
	3318	return (nbusy);
	3319	}
	3320
	#if DIAGNOSTIC
	/*
	 * Dump a per-queue histogram of the buffer pool: for every free
	 * list, the total number of buffers on it followed by a count for
	 * each buffer size (in CLBYTES steps) that is present.
	 * Can be enabled to print out on every ``sync'' by setting "syncprt"
	 * in vfs_syscalls.c using sysctl.
	 */
	void
	vfs_bufstats()
	{
		static char *bname[BQUEUES] =
			{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
		int	counts[MAXBSIZE/CLBYTES+1];
		int	i, j, count;
		struct bqueues *dp;
		struct buf *bp;

		i = 0;
		dp = bufqueues;

		while (dp < &bufqueues[BQUEUES]) {
			count = 0;
			for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
				counts[j] = 0;

			/* the list is only walked while buf_mtxp is held */
			lck_mtx_lock(buf_mtxp);

			bp = dp->tqh_first;
			while (bp) {
				counts[bp->b_bufsize/CLBYTES]++;
				count++;
				bp = bp->b_freelist.tqe_next;
			}
			lck_mtx_unlock(buf_mtxp);

			printf("%s: total-%d", bname[i], count);
			for (j = 0; j <= MAXBSIZE/CLBYTES; j++) {
				if (counts[j] == 0)
					continue;
				printf(", %d-%d", j * CLBYTES, counts[j]);
			}
			printf("\n");

			dp++;
			i++;
		}
	}
	#endif /* DIAGNOSTIC */
	3358
	3359	#define NRESERVEDIOBUFS 64
	3360
	3361
	3362	buf_t
	3363	alloc_io_buf(vnode_t vp, int priv)
	3364	{
	3365	buf_t bp;
	3366
	3367	lck_mtx_lock(iobuffer_mtxp);
	3368
	3369	while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) \|\|
	3370	(bp = iobufqueue.tqh_first) == NULL) {
	3371	bufstats.bufs_iobufsleeps++;
	3372
	3373	need_iobuffer = 1;
	3374	(void) msleep(&need_iobuffer, iobuffer_mtxp, (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
	3375	}
	3376	TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
	3377
	3378	bufstats.bufs_iobufinuse++;
	3379	if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
	3380	bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
	3381
	3382	lck_mtx_unlock(iobuffer_mtxp);
	3383
	3384	/*
	3385	* initialize various fields
	3386	* we don't need to hold the mutex since the buffer
	3387	* is now private... the vp should have a reference
	3388	* on it and is not protected by this mutex in any event
	3389	*/
	3390	bp->b_timestamp = 0;
	3391	bp->b_proc = NULL;
	3392
	3393	bp->b_datap = 0;
	3394	bp->b_flags = 0;
	3395	bp->b_lflags = BL_BUSY \| BL_IOBUF;
	3396	bp->b_blkno = bp->b_lblkno = 0;
	3397	#ifdef JOE_DEBUG
	3398	bp->b_owner = current_thread();
	3399	bp->b_tag = 6;
	3400	#endif
	3401	bp->b_iodone = NULL;
	3402	bp->b_error = 0;
	3403	bp->b_resid = 0;
	3404	bp->b_bcount = 0;
	3405	bp->b_bufsize = 0;
	3406	bp->b_upl = NULL;
	3407	bp->b_vp = vp;
	3408
	3409	if (vp && (vp->v_type == VBLK \|\| vp->v_type == VCHR))
	3410	bp->b_dev = vp->v_rdev;
	3411	else
	3412	bp->b_dev = NODEV;
	3413
	3414	return (bp);
	3415	}
	3416
	3417
	3418	void
	3419	free_io_buf(buf_t bp)
	3420	{
	3421	int need_wakeup = 0;
	3422
	3423	/*
	3424	* put buffer back on the head of the iobufqueue
	3425	*/
	3426	bp->b_vp = NULL;
	3427	bp->b_flags = B_INVAL;
	3428
	3429	lck_mtx_lock_spin(iobuffer_mtxp);
	3430
	3431	binsheadfree(bp, &iobufqueue, -1);
	3432
	3433	if (need_iobuffer) {
	3434	/*
	3435	* Wake up any processes waiting because they need an io buffer
	3436	*
	3437	* do the wakeup after we drop the mutex... it's possible that the
	3438	* wakeup will be superfluous if need_iobuffer gets set again and
	3439	* another thread runs this path, but it's highly unlikely, doesn't
	3440	* hurt, and it means we don't hold up I/O progress if the wakeup blocks
	3441	* trying to grab a task related lock...
	3442	*/
	3443	need_iobuffer = 0;
	3444	need_wakeup = 1;
	3445	}
	3446	bufstats.bufs_iobufinuse--;
	3447
	3448	lck_mtx_unlock(iobuffer_mtxp);
	3449
	3450	if (need_wakeup)
	3451	wakeup(&need_iobuffer);
	3452	}
	3453
	3454
	3455	void
	3456	buf_list_lock(void)
	3457	{
	3458	lck_mtx_lock(buf_mtxp);
	3459	}
	3460
	3461	void
	3462	buf_list_unlock(void)
	3463	{
	3464	lck_mtx_unlock(buf_mtxp);
	3465	}
	3466
	3467	/*
	3468	* If getnewbuf() calls bcleanbuf() on the same thread
	3469	* there is a potential for stack overrun and deadlocks.
	3470	* So we always handoff the work to a worker thread for completion
	3471	*/
	3472
	3473
	3474	static void
	3475	bcleanbuf_thread_init(void)
	3476	{
	3477	/* create worker thread */
	3478	kernel_thread(kernel_task, bcleanbuf_thread);
	3479	}
	3480
	3481	static void
	3482	bcleanbuf_thread(void)
	3483	{
	3484	struct buf *bp;
	3485	int error = 0;
	3486	int loopcnt = 0;
	3487
	3488	for (;;) {
	3489	lck_mtx_lock(buf_mtxp);
	3490
	3491	while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL)
	3492	(void)msleep((void *)&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO, "blaundry", NULL);
	3493
	3494	/*
	3495	* Remove from the queue
	3496	*/
	3497	bremfree_locked(bp);
	3498
	3499	/*
	3500	* Buffer is no longer on any free list
	3501	*/
	3502	SET(bp->b_lflags, BL_BUSY);
	3503
	3504	#ifdef JOE_DEBUG
	3505	bp->b_owner = current_thread();
	3506	bp->b_tag = 10;
	3507	#endif
	3508
	3509	lck_mtx_unlock(buf_mtxp);
	3510	/*
	3511	* do the IO
	3512	*/
	3513	error = bawrite_internal(bp, 0);
	3514
	3515	if (error) {
	3516	bp->b_whichq = BQ_LAUNDRY;
	3517	bp->b_timestamp = buf_timestamp();
	3518
	3519	lck_mtx_lock_spin(buf_mtxp);
	3520
	3521	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
	3522	blaundrycnt++;
	3523
	3524	/* we never leave a busy page on the laundary queue */
	3525	CLR(bp->b_lflags, BL_BUSY);
	3526	#ifdef JOE_DEBUG
	3527	bp->b_owner = current_thread();
	3528	bp->b_tag = 11;
	3529	#endif
	3530
	3531	lck_mtx_unlock(buf_mtxp);
	3532
	3533	if (loopcnt > 10) {
	3534	(void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
	3535	loopcnt = 0;
	3536	} else {
	3537	(void)thread_block(THREAD_CONTINUE_NULL);
	3538	loopcnt++;
	3539	}
	3540	}
	3541	}
	3542	}
	3543
	3544
	3545	static int
	3546	brecover_data(buf_t bp)
	3547	{
	3548	int upl_offset;
	3549	upl_t upl;
	3550	upl_page_info_t *pl;
	3551	kern_return_t kret;
	3552	vnode_t vp = bp->b_vp;
	3553	int upl_flags;
	3554
	3555
	3556	if ( !UBCINFOEXISTS(vp) \|\| bp->b_bufsize == 0)
	3557	goto dump_buffer;
	3558
	3559	upl_flags = UPL_PRECIOUS;
	3560	if (! (buf_flags(bp) & B_READ)) {
	3561	/*
	3562	* "write" operation: let the UPL subsystem know
	3563	* that we intend to modify the buffer cache pages we're
	3564	* gathering.
	3565	*/
	3566	upl_flags \|= UPL_WILL_MODIFY;
	3567	}
	3568
	3569	kret = ubc_create_upl(vp,
	3570	ubc_blktooff(vp, bp->b_lblkno),
	3571	bp->b_bufsize,
	3572	&upl,
	3573	&pl,
	3574	upl_flags);
	3575	if (kret != KERN_SUCCESS)
	3576	panic("Failed to create UPL");
	3577
	3578	for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
	3579
	3580	if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) \|\| !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
	3581	ubc_upl_abort(upl, 0);
	3582	goto dump_buffer;
	3583	}
	3584	}
	3585	bp->b_upl = upl;
	3586
	3587	kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_datap));
	3588
	3589	if (kret != KERN_SUCCESS)
	3590	panic("getblk: ubc_upl_map() failed with (%d)", kret);
	3591	return (1);
	3592
	3593	dump_buffer:
	3594	bp->b_bufsize = 0;
	3595	SET(bp->b_flags, B_INVAL);
	3596	buf_brelse(bp);
	3597
	3598	return(0);
	3599	}
	3600
	3601
	3602
	3603	/*
	3604	* disabled for now
	3605	*/
	3606
	3607	#if FLUSH_QUEUES
	3608
	3609	#define NFLUSH 32
	3610
	3611	static int
	3612	bp_cmp(void a, void b)
	3613	{
	3614	buf_t bp_a = (buf_t **)a,
	3615	bp_b = (buf_t **)b;
	3616	daddr64_t res;
	3617
	3618	// don't have to worry about negative block
	3619	// numbers so this is ok to do.
	3620	//
	3621	res = (bp_a->b_blkno - bp_b->b_blkno);
	3622
	3623	return (int)res;
	3624	}
	3625
	3626
	3627	int
	3628	bflushq(int whichq, mount_t mp)
	3629	{
	3630	buf_t bp, next;
	3631	int i, buf_count;
	3632	int total_writes = 0;
	3633	static buf_t flush_table[NFLUSH];
	3634
	3635	if (whichq < 0 \|\| whichq >= BQUEUES) {
	3636	return (0);
	3637	}
	3638
	3639	restart:
	3640	lck_mtx_lock(buf_mtxp);
	3641
	3642	bp = TAILQ_FIRST(&bufqueues[whichq]);
	3643
	3644	for (buf_count = 0; bp; bp = next) {
	3645	next = bp->b_freelist.tqe_next;
	3646
	3647	if (bp->b_vp == NULL \|\| bp->b_vp->v_mount != mp) {
	3648	continue;
	3649	}
	3650
	3651	if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
	3652
	3653	bremfree_locked(bp);
	3654	#ifdef JOE_DEBUG
	3655	bp->b_owner = current_thread();
	3656	bp->b_tag = 7;
	3657	#endif
	3658	SET(bp->b_lflags, BL_BUSY);
	3659	flush_table[buf_count] = bp;
	3660	buf_count++;
	3661	total_writes++;
	3662
	3663	if (buf_count >= NFLUSH) {
	3664	lck_mtx_unlock(buf_mtxp);
	3665
	3666	qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
	3667
	3668	for (i = 0; i < buf_count; i++) {
	3669	buf_bawrite(flush_table[i]);
	3670	}
	3671	goto restart;
	3672	}
	3673	}
	3674	}
	3675	lck_mtx_unlock(buf_mtxp);
	3676
	3677	if (buf_count > 0) {
	3678	qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
	3679
	3680	for (i = 0; i < buf_count; i++) {
	3681	buf_bawrite(flush_table[i]);
	3682	}
	3683	}
	3684
	3685	return (total_writes);
	3686	}
	3687	#endif
	3688
	3689
	3690	#if BALANCE_QUEUES
	3691
	3692	/* XXX move this to a separate file */
	3693
	3694	/*
	3695	* NOTE: THIS CODE HAS NOT BEEN UPDATED
	3696	* WITH RESPECT TO THE NEW LOCKING MODEL
	3697	*/
	3698
	3699
	3700	/*
	3701	* Dynamic Scaling of the Buffer Queues
	3702	*/
	3703
	3704	typedef long long blsize_t;
	3705
	3706	blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
	3707	/* Global tunable limits */
	3708	blsize_t nbufh; /* number of buffer headers */
	3709	blsize_t nbuflow; /* minimum number of buffer headers required */
	3710	blsize_t nbufhigh; /* maximum number of buffer headers allowed */
	3711	blsize_t nbuftarget; /* preferred number of buffer headers */
	3712
	3713	/*
	3714	* assertions:
	3715	*
	3716	* 1. 0 < nbuflow <= nbufh <= nbufhigh
	3717	* 2. nbufhigh <= MAXNBUF
	3718	* 3. 0 < nbuflow <= nbuftarget <= nbufhigh
	3719	* 4. nbufh can not be set by sysctl().
	3720	*/
	3721
	3722	/* Per queue tunable limits */
	3723
	3724	struct bufqlim {
	3725	blsize_t bl_nlow; /* minimum number of buffer headers required */
	3726	blsize_t bl_num; /* number of buffer headers on the queue */
	3727	blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
	3728	blsize_t bl_target; /* preferred number of buffer headers */
	3729	long bl_stale; /* Seconds after which a buffer is considered stale */
	3730	} bufqlim[BQUEUES];
	3731
/*
 * assertions:
 *
 * 1.  0 <= bl_nlow <= bl_num <= bl_nlhigh
 * 2.  bl_nlhigh <= MAXNBUF
 * 3.  bufqlim[BQ_META].bl_nlow != 0
 * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
 *                                  file system IO operations)
 * 5.  bl_num can not be set by sysctl().
 * 6.  bl_nlhigh <= nbufhigh
 */
	3743
/*
 * Rationale:
 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) of memory per queue.
 *
 * These limits are exported by means of sysctl().
 * It was decided to define blsize_t as a 64 bit quantity.
 * This will make sure that we will not be required to change it
 * as long as we do not exceed 64 bit address space for the kernel.
 *
 * The low and high parameters are initialized at compile time,
 * and boot arguments can be used to override them. sysctl()
 * would not change their values. sysctl() can get all the values
 * but can set only target. num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 * Keep enough bufs on BQ_EMPTY.
 *	getnewbuf() by default will always select a buffer from the BQ_EMPTY.
 *		getnewbuf() performs best if a buffer was found there.
 *		Also this minimizes the possibility of starting IO
 *		from getnewbuf(). That's a performance win, too.
 *
 *	Localize complex logic [balancing as well as time aging]
 *		to balancebufq().
 *
 *	Simplify getnewbuf() logic by elimination of time aging code.
 */
	3772
/*
 * Algorithm:
 * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
 * the size of the LRU close to bl_target. Buffers on a queue would
 * be time aged.
 *
 * There would be a thread which will be responsible for "balancing"
 * the buffer cache queues.
 *
 * The scan order would be:	AGE, LRU, META, EMPTY.
 */
	3785
	3786	long bufqscanwait = 0;
	3787
	3788	static void bufqscan_thread();
	3789	static int balancebufq(int q);
	3790	static int btrimempty(int n);
	3791	static __inline__ int initbufqscan(void);
	3792	static __inline__ int nextbufq(int q);
	3793	static void buqlimprt(int all);
	3794
	3795
	3796	static __inline__ void
	3797	bufqinc(int q)
	3798	{
	3799	if ((q < 0) \|\| (q >= BQUEUES))
	3800	return;
	3801
	3802	bufqlim[q].bl_num++;
	3803	return;
	3804	}
	3805
	3806	static __inline__ void
	3807	bufqdec(int q)
	3808	{
	3809	if ((q < 0) \|\| (q >= BQUEUES))
	3810	return;
	3811
	3812	bufqlim[q].bl_num--;
	3813	return;
	3814	}
	3815
	3816	static void
	3817	bufq_balance_thread_init(void)
	3818	{
	3819
	3820	if (bufqscanwait++ == 0) {
	3821
	3822	/* Initalize globals */
	3823	MAXNBUF = (sane_size / PAGE_SIZE);
	3824	nbufh = nbuf_headers;
	3825	nbuflow = min(nbufh, 100);
	3826	nbufhigh = min(MAXNBUF, max(nbufh, 2048));
	3827	nbuftarget = (sane_size >> 5) / PAGE_SIZE;
	3828	nbuftarget = max(nbuflow, nbuftarget);
	3829	nbuftarget = min(nbufhigh, nbuftarget);
	3830
	3831	/*
	3832	* Initialize the bufqlim
	3833	*/
	3834
	3835	/* LOCKED queue */
	3836	bufqlim[BQ_LOCKED].bl_nlow = 0;
	3837	bufqlim[BQ_LOCKED].bl_nlhigh = 32;
	3838	bufqlim[BQ_LOCKED].bl_target = 0;
	3839	bufqlim[BQ_LOCKED].bl_stale = 30;
	3840
	3841	/* LRU queue */
	3842	bufqlim[BQ_LRU].bl_nlow = 0;
	3843	bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
	3844	bufqlim[BQ_LRU].bl_target = nbuftarget/4;
	3845	bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
	3846
	3847	/* AGE queue */
	3848	bufqlim[BQ_AGE].bl_nlow = 0;
	3849	bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
	3850	bufqlim[BQ_AGE].bl_target = nbuftarget/4;
	3851	bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
	3852
	3853	/* EMPTY queue */
	3854	bufqlim[BQ_EMPTY].bl_nlow = 0;
	3855	bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
	3856	bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
	3857	bufqlim[BQ_EMPTY].bl_stale = 600000;
	3858
	3859	/* META queue */
	3860	bufqlim[BQ_META].bl_nlow = 0;
	3861	bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
	3862	bufqlim[BQ_META].bl_target = nbuftarget/4;
	3863	bufqlim[BQ_META].bl_stale = META_IS_STALE;
	3864
	3865	/* LAUNDRY queue */
	3866	bufqlim[BQ_LOCKED].bl_nlow = 0;
	3867	bufqlim[BQ_LOCKED].bl_nlhigh = 32;
	3868	bufqlim[BQ_LOCKED].bl_target = 0;
	3869	bufqlim[BQ_LOCKED].bl_stale = 30;
	3870
	3871	buqlimprt(1);
	3872	}
	3873
	3874	/* create worker thread */
	3875	kernel_thread(kernel_task, bufqscan_thread);
	3876	}
	3877
	3878	/* The workloop for the buffer balancing thread */
	3879	static void
	3880	bufqscan_thread()
	3881	{
	3882	int moretodo = 0;
	3883
	3884	for(;;) {
	3885	do {
	3886	int q; /* buffer queue to process */
	3887
	3888	q = initbufqscan();
	3889	for (; q; ) {
	3890	moretodo \|= balancebufq(q);
	3891	q = nextbufq(q);
	3892	}
	3893	} while (moretodo);
	3894
	3895	#if DIAGNOSTIC
	3896	vfs_bufstats();
	3897	buqlimprt(0);
	3898	#endif
	3899	(void)tsleep((void )&bufqscanwait, PRIBIO, "bufqscanwait", 60 hz);
	3900	moretodo = 0;
	3901	}
	3902	}
	3903
	3904	/* Seed for the buffer queue balancing */
	3905	static __inline__ int
	3906	initbufqscan()
	3907	{
	3908	/* Start with AGE queue */
	3909	return (BQ_AGE);
	3910	}
	3911
	3912	/* Pick next buffer queue to balance */
	3913	static __inline__ int
	3914	nextbufq(int q)
	3915	{
	3916	int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
	3917
	3918	q++;
	3919	q %= sizeof(order);
	3920	return (order[q]);
	3921	}
	3922
	3923	/* function to balance the buffer queues */
	3924	static int
	3925	balancebufq(int q)
	3926	{
	3927	int moretodo = 0;
	3928	int n, t;
	3929
	3930	/* reject invalid q */
	3931	if ((q < 0) \|\| (q >= BQUEUES))
	3932	goto out;
	3933
	3934	/* LOCKED or LAUNDRY queue MUST not be balanced */
	3935	if ((q == BQ_LOCKED) \|\| (q == BQ_LAUNDRY))
	3936	goto out;
	3937
	3938	n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
	3939
	3940	/* If queue has less than target nothing more to do */
	3941	if (n < 0)
	3942	goto out;
	3943
	3944	if ( n > 8 ) {
	3945	/* Balance only a small amount (12.5%) at a time */
	3946	n >>= 3;
	3947	}
	3948
	3949	/* EMPTY queue needs special handling */
	3950	if (q == BQ_EMPTY) {
	3951	moretodo \|= btrimempty(n);
	3952	goto out;
	3953	}
	3954
	3955	t = buf_timestamp():
	3956
	3957	for (; n > 0; n--) {
	3958	struct buf *bp = bufqueues[q].tqh_first;
	3959	if (!bp)
	3960	break;
	3961
	3962	/* check if it's stale */
	3963	if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
	3964	if (bcleanbuf(bp)) {
	3965	/* buf_bawrite() issued, bp not ready */
	3966	moretodo = 1;
	3967	} else {
	3968	/* release the cleaned buffer to BQ_EMPTY */
	3969	SET(bp->b_flags, B_INVAL);
	3970	buf_brelse(bp);
	3971	}
	3972	} else
	3973	break;
	3974	}
	3975
	3976	out:
	3977	return (moretodo);
	3978	}
	3979
	3980	static int
	3981	btrimempty(int n)
	3982	{
	3983	/*
	3984	* When struct buf are allocated dynamically, this would
	3985	* reclaim upto 'n' struct buf from the empty queue.
	3986	*/
	3987
	3988	return (0);
	3989	}
	3990
	3991	static void
	3992	buqlimprt(int all)
	3993	{
	3994	int i;
	3995	static char *bname[BQUEUES] =
	3996	{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
	3997
	3998	if (all)
	3999	for (i = 0; i < BQUEUES; i++) {
	4000	printf("%s : ", bname[i]);
	4001	printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
	4002	printf("cur = %ld, ", (long)bufqlim[i].bl_num);
	4003	printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
	4004	printf("target = %ld, ", (long)bufqlim[i].bl_target);
	4005	printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
	4006	}
	4007	else
	4008	for (i = 0; i < BQUEUES; i++) {
	4009	printf("%s : ", bname[i]);
	4010	printf("cur = %ld, ", (long)bufqlim[i].bl_num);
	4011	}
	4012	}
	4013
	4014	#endif
	4015
	4016