1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 * The Regents of the University of California. All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
68 */
69
70/*
71 * Some references:
72 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 * Leffler, et al.: The Design and Implementation of the 4.3BSD
 74 * UNIX Operating System (Addison-Wesley, 1989)
75 */
76
77#include <sys/param.h>
78#include <sys/systm.h>
79#include <sys/proc_internal.h>
80#include <sys/buf_internal.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/trace.h>
84#include <sys/malloc.h>
85#include <sys/resourcevar.h>
86#include <miscfs/specfs/specdev.h>
87#include <sys/ubc.h>
88#include <sys/kauth.h>
89#if DIAGNOSTIC
90#include <kern/assert.h>
91#endif /* DIAGNOSTIC */
92#include <kern/task.h>
93#include <kern/zalloc.h>
94#include <kern/lock.h>
95
96#include <sys/fslog.h> /* fslog_io_error() */
97
98#include <mach/mach_types.h>
99#include <mach/memory_object_types.h>
100#include <kern/sched_prim.h> /* thread_block() */
101
102#include <vm/vm_kern.h>
103#include <vm/vm_pageout.h>
104
105#include <sys/kdebug.h>
106
107#include <libkern/OSAtomic.h>
108#include <libkern/OSDebug.h>
109#include <sys/ubc_internal.h>
110
111#include <sys/sdt.h>
112#include <sys/cprotect.h>
113
114
115#if BALANCE_QUEUES
116static __inline__ void bufqinc(int q);
117static __inline__ void bufqdec(int q);
118#endif
119
120int bcleanbuf(buf_t bp, boolean_t discard);
121static int brecover_data(buf_t bp);
122static boolean_t incore(vnode_t vp, daddr64_t blkno);
123/* timeout is in msecs */
124static buf_t getnewbuf(int slpflag, int slptimeo, int *queue);
125static void bremfree_locked(buf_t bp);
126static void buf_reassign(buf_t bp, vnode_t newvp);
127static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
128static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
129static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
130static boolean_t buffer_cache_gc(int);
131static buf_t buf_brelse_shadow(buf_t bp);
132static void buf_free_meta_store(buf_t bp);
133
134static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
135 uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
136
137
138__private_extern__ int bdwrite_internal(buf_t, int);
139
140/* zone allocated buffer headers */
141static void bufzoneinit(void);
142static void bcleanbuf_thread_init(void);
143static void bcleanbuf_thread(void);
144
145static zone_t buf_hdr_zone;
146static int buf_hdr_count;
147
148
149/*
150 * Definitions for the buffer hash lists.
151 */
152#define BUFHASH(dvp, lbn) \
153 (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
154LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
155u_long bufhash;
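/*
 * Editor's note -- illustrative only, not part of the original source:
 * BUFHASH() folds the vnode pointer and the logical block number into a
 * table index and masks it with 'bufhash', which hashinit() sets to a
 * power of two minus one.  A lookup therefore amounts to:
 *
 *	struct bufhashhdr *dp = BUFHASH(vp, blkno);
 *	    which expands to
 *	&bufhashtbl[((long)vp / sizeof(*vp) + (int)blkno) & bufhash]
 */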
156
157static buf_t incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
158
159/* Definitions for the buffer stats. */
160struct bufstats bufstats;
161
162/* Number of delayed write buffers */
163long nbdwrite = 0;
164int blaundrycnt = 0;
165static int boot_nbuf_headers = 0;
166
167static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
168
169static TAILQ_HEAD(ioqueue, buf) iobufqueue;
170static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
171static int needbuffer;
172static int need_iobuffer;
173
174static lck_grp_t *buf_mtx_grp;
175static lck_attr_t *buf_mtx_attr;
176static lck_grp_attr_t *buf_mtx_grp_attr;
177static lck_mtx_t *iobuffer_mtxp;
178static lck_mtx_t *buf_mtxp;
179
180static int buf_busycount;
181
182static __inline__ int
183buf_timestamp(void)
184{
185 struct timeval t;
186 microuptime(&t);
187 return (t.tv_sec);
188}
189
190/*
191 * Insq/Remq for the buffer free lists.
192 */
193#if BALANCE_QUEUES
194#define binsheadfree(bp, dp, whichq) do { \
195 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
196 bufqinc((whichq)); \
197 } while (0)
198
199#define binstailfree(bp, dp, whichq) do { \
200 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
201 bufqinc((whichq)); \
202 } while (0)
203#else
204#define binsheadfree(bp, dp, whichq) do { \
205 TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
206 } while (0)
207
208#define binstailfree(bp, dp, whichq) do { \
209 TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
210 } while (0)
211#endif
212
213
214#define BHASHENTCHECK(bp) \
215 if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
216 panic("%p: b_hash.le_prev is not deadbeef", (bp));
217
218#define BLISTNONE(bp) \
219 (bp)->b_hash.le_next = (struct buf *)0; \
220 (bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
221
222/*
223 * Insq/Remq for the vnode usage lists.
224 */
225#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
226#define bufremvn(bp) { \
227 LIST_REMOVE(bp, b_vnbufs); \
228 (bp)->b_vnbufs.le_next = NOLIST; \
229}
230
231/*
232 * Time in seconds before a buffer on a list is
233 * considered as a stale buffer
234 */
235#define LRU_IS_STALE 120 /* default value for the LRU */
236#define AGE_IS_STALE 60 /* default value for the AGE */
237#define META_IS_STALE 180 /* default value for the BQ_META */
238
239int lru_is_stale = LRU_IS_STALE;
240int age_is_stale = AGE_IS_STALE;
241int meta_is_stale = META_IS_STALE;
242
243#define MAXLAUNDRY 10
244
245/* LIST_INSERT_HEAD() with assertions */
246static __inline__ void
247blistenterhead(struct bufhashhdr * head, buf_t bp)
248{
249 if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
250 (head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
251 (head)->lh_first = bp;
252 bp->b_hash.le_prev = &(head)->lh_first;
253 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
254 panic("blistenterhead: le_prev is deadbeef");
255}
256
257static __inline__ void
258binshash(buf_t bp, struct bufhashhdr *dp)
259{
260#if DIAGNOSTIC
261 buf_t nbp;
262#endif /* DIAGNOSTIC */
263
264 BHASHENTCHECK(bp);
265
266#if DIAGNOSTIC
267 nbp = dp->lh_first;
268 for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
269 if(nbp == bp)
270 panic("buf already in hashlist");
271 }
272#endif /* DIAGNOSTIC */
273
274 blistenterhead(dp, bp);
275}
276
277static __inline__ void
278bremhash(buf_t bp)
279{
280 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
281 panic("bremhash le_prev is deadbeef");
282 if (bp->b_hash.le_next == bp)
283 panic("bremhash: next points to self");
284
285 if (bp->b_hash.le_next != NULL)
286 bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
287 *bp->b_hash.le_prev = (bp)->b_hash.le_next;
288}
289
290/*
291 * buf_mtxp held.
292 */
293static __inline__ void
294bmovelaundry(buf_t bp)
295{
296 bp->b_whichq = BQ_LAUNDRY;
297 bp->b_timestamp = buf_timestamp();
298 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
299 blaundrycnt++;
300}
301
302static __inline__ void
303buf_release_credentials(buf_t bp)
304{
305 if (IS_VALID_CRED(bp->b_rcred)) {
306 kauth_cred_unref(&bp->b_rcred);
307 }
308 if (IS_VALID_CRED(bp->b_wcred)) {
309 kauth_cred_unref(&bp->b_wcred);
310 }
311}
312
313
314int
315buf_valid(buf_t bp) {
316
317 if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
318 return 1;
319 return 0;
320}
321
322int
323buf_fromcache(buf_t bp) {
324
325 if ( (bp->b_flags & B_CACHE) )
326 return 1;
327 return 0;
328}
329
330void
331buf_markinvalid(buf_t bp) {
332
333 SET(bp->b_flags, B_INVAL);
334}
335
336void
337buf_markdelayed(buf_t bp) {
338
339 if (!ISSET(bp->b_flags, B_DELWRI)) {
340 SET(bp->b_flags, B_DELWRI);
341
342 OSAddAtomicLong(1, &nbdwrite);
343 buf_reassign(bp, bp->b_vp);
344 }
345 SET(bp->b_flags, B_DONE);
346}
347
348void
349buf_markclean(buf_t bp) {
350
351 if (ISSET(bp->b_flags, B_DELWRI)) {
352 CLR(bp->b_flags, B_DELWRI);
353
354 OSAddAtomicLong(-1, &nbdwrite);
355 buf_reassign(bp, bp->b_vp);
356 }
357}
358
359void
360buf_markeintr(buf_t bp) {
361
362 SET(bp->b_flags, B_EINTR);
363}
364
365
366void
367buf_markaged(buf_t bp) {
368
369 SET(bp->b_flags, B_AGE);
370}
371
372int
373buf_fua(buf_t bp) {
374
375 if ((bp->b_flags & B_FUA) == B_FUA)
376 return 1;
377 return 0;
378}
379
380void
381buf_markfua(buf_t bp) {
382
383 SET(bp->b_flags, B_FUA);
384}
385
386#if CONFIG_PROTECT
387void
388buf_setcpaddr(buf_t bp, struct cprotect *entry) {
389 bp->b_attr.ba_cpentry = entry;
390}
391
392void
393buf_setcpoff (buf_t bp, uint64_t foffset) {
394 bp->b_attr.ba_cp_file_off = foffset;
395}
396
397void *
398bufattr_cpaddr(bufattr_t bap) {
399 return (bap->ba_cpentry);
400}
401
402uint64_t
403bufattr_cpoff(bufattr_t bap) {
404 return (bap->ba_cp_file_off);
405}
406
407void
408bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) {
409 bap->ba_cpentry = cp_entry_addr;
410}
411
412void
413bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
414 bap->ba_cp_file_off = foffset;
415}
416
417#else
418void *
419bufattr_cpaddr(bufattr_t bap __unused) {
420 return NULL;
421}
422
423uint64_t
424bufattr_cpoff(bufattr_t bap __unused) {
425 return 0;
426}
427
428void
429bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) {
430}
431
432void
433bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
434 return;
435}
436#endif /* CONFIG_PROTECT */
437
438bufattr_t
439bufattr_alloc() {
440 bufattr_t bap;
441 MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
442 if (bap == NULL)
443 return NULL;
444
445 bzero(bap, sizeof(struct bufattr));
446 return bap;
447}
448
449void
450bufattr_free(bufattr_t bap) {
451 if (bap)
452 FREE(bap, M_TEMP);
453}
454
455int
456bufattr_rawencrypted(bufattr_t bap) {
457 if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
458 return 1;
459 return 0;
460}
461
462int
463bufattr_throttled(bufattr_t bap) {
464 return (GET_BUFATTR_IO_TIER(bap));
465}
466
467int
468bufattr_nocache(bufattr_t bap) {
469 if ( (bap->ba_flags & BA_NOCACHE) )
470 return 1;
471 return 0;
472}
473
474int
475bufattr_meta(bufattr_t bap) {
476 if ( (bap->ba_flags & BA_META) )
477 return 1;
478 return 0;
479}
480
481int
482bufattr_delayidlesleep(bufattr_t bap)
483{
484 if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
485 return 1;
486 return 0;
487}
488
489bufattr_t
490buf_attr(buf_t bp) {
491 return &bp->b_attr;
492}
493
494void
 495buf_markstatic(buf_t bp) {
496 SET(bp->b_flags, B_STATICCONTENT);
497}
498
499int
500buf_static(buf_t bp) {
501 if ( (bp->b_flags & B_STATICCONTENT) )
502 return 1;
503 return 0;
504}
505
506void
507bufattr_markgreedymode(bufattr_t bap) {
508 SET(bap->ba_flags, BA_GREEDY_MODE);
509}
510
511int
512bufattr_greedymode(bufattr_t bap) {
513 if ( (bap->ba_flags & BA_GREEDY_MODE) )
514 return 1;
515 return 0;
516}
517
518void
519bufattr_markquickcomplete(bufattr_t bap) {
520 SET(bap->ba_flags, BA_QUICK_COMPLETE);
521}
522
523int
524bufattr_quickcomplete(bufattr_t bap) {
525 if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
526 return 1;
527 return 0;
528}
529
530errno_t
531buf_error(buf_t bp) {
532
533 return (bp->b_error);
534}
535
536void
537buf_seterror(buf_t bp, errno_t error) {
538
539 if ((bp->b_error = error))
540 SET(bp->b_flags, B_ERROR);
541 else
542 CLR(bp->b_flags, B_ERROR);
543}
544
545void
546buf_setflags(buf_t bp, int32_t flags) {
547
548 SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
549}
550
551void
552buf_clearflags(buf_t bp, int32_t flags) {
553
554 CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
555}
556
557int32_t
558buf_flags(buf_t bp) {
559
560 return ((bp->b_flags & BUF_X_RDFLAGS));
561}
562
563void
564buf_reset(buf_t bp, int32_t io_flags) {
565
566 CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
567 SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
568
569 bp->b_error = 0;
570}
571
572uint32_t
573buf_count(buf_t bp) {
574
575 return (bp->b_bcount);
576}
577
578void
579buf_setcount(buf_t bp, uint32_t bcount) {
580
581 bp->b_bcount = bcount;
582}
583
584uint32_t
585buf_size(buf_t bp) {
586
587 return (bp->b_bufsize);
588}
589
590void
591buf_setsize(buf_t bp, uint32_t bufsize) {
592
593 bp->b_bufsize = bufsize;
594}
595
596uint32_t
597buf_resid(buf_t bp) {
598
599 return (bp->b_resid);
600}
601
602void
603buf_setresid(buf_t bp, uint32_t resid) {
604
605 bp->b_resid = resid;
606}
607
608uint32_t
609buf_dirtyoff(buf_t bp) {
610
611 return (bp->b_dirtyoff);
612}
613
614uint32_t
615buf_dirtyend(buf_t bp) {
616
617 return (bp->b_dirtyend);
618}
619
620void
621buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
622
623 bp->b_dirtyoff = dirtyoff;
624}
625
626void
627buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
628
629 bp->b_dirtyend = dirtyend;
630}
631
632uintptr_t
633buf_dataptr(buf_t bp) {
634
635 return (bp->b_datap);
636}
637
638void
639buf_setdataptr(buf_t bp, uintptr_t data) {
640
641 bp->b_datap = data;
642}
643
644vnode_t
645buf_vnode(buf_t bp) {
646
647 return (bp->b_vp);
648}
649
650void
651buf_setvnode(buf_t bp, vnode_t vp) {
652
653 bp->b_vp = vp;
654}
655
656
657void *
658buf_callback(buf_t bp)
659{
660 if ( !(bp->b_flags & B_CALL) )
661 return ((void *) NULL);
662
663 return ((void *)bp->b_iodone);
664}
665
666
667errno_t
668buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
669{
670 if (callback)
671 bp->b_flags |= (B_CALL | B_ASYNC);
672 else
673 bp->b_flags &= ~B_CALL;
674 bp->b_transaction = transaction;
675 bp->b_iodone = callback;
676
677 return (0);
678}
679
680errno_t
681buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
682{
683
684 if ( !(bp->b_lflags & BL_IOBUF) )
685 return (EINVAL);
686
687 if (upl)
688 bp->b_flags |= B_CLUSTER;
689 else
690 bp->b_flags &= ~B_CLUSTER;
691 bp->b_upl = upl;
692 bp->b_uploffset = offset;
693
694 return (0);
695}
696
697buf_t
698buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
699{
700 buf_t io_bp;
701
702 if (io_offset < 0 || io_size < 0)
703 return (NULL);
704
705 if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
706 return (NULL);
707
708 if (bp->b_flags & B_CLUSTER) {
709 if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
710 return (NULL);
711
712 if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
713 return (NULL);
714 }
715 io_bp = alloc_io_buf(bp->b_vp, 0);
716
717 io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
718
719 if (iodone) {
720 io_bp->b_transaction = arg;
721 io_bp->b_iodone = iodone;
722 io_bp->b_flags |= B_CALL;
723 }
724 if (bp->b_flags & B_CLUSTER) {
725 io_bp->b_upl = bp->b_upl;
726 io_bp->b_uploffset = bp->b_uploffset + io_offset;
727 } else {
728 io_bp->b_datap = (uintptr_t)(((char *)bp->b_datap) + io_offset);
729 }
730 io_bp->b_bcount = io_size;
731
732 return (io_bp);
733}
734
735
736int
737buf_shadow(buf_t bp)
738{
739 if (bp->b_lflags & BL_SHADOW)
740 return 1;
741 return 0;
742}
743
744
745buf_t
746buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
747{
748 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
749}
750
751buf_t
752buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
753{
754 return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
755}
756
757
758static buf_t
759buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
760{
761 buf_t io_bp;
762
763 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
764
765 if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
766
767 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
768 return (NULL);
769 }
770#ifdef BUF_MAKE_PRIVATE
771 if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
772 panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
773#endif
774 io_bp = alloc_io_buf(bp->b_vp, priv);
775
776 io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
777 io_bp->b_blkno = bp->b_blkno;
778 io_bp->b_lblkno = bp->b_lblkno;
779
780 if (iodone) {
781 io_bp->b_transaction = arg;
782 io_bp->b_iodone = iodone;
783 io_bp->b_flags |= B_CALL;
784 }
785 if (force_copy == FALSE) {
786 io_bp->b_bcount = bp->b_bcount;
787 io_bp->b_bufsize = bp->b_bufsize;
788
789 if (external_storage) {
790 io_bp->b_datap = external_storage;
791#ifdef BUF_MAKE_PRIVATE
792 io_bp->b_data_store = NULL;
793#endif
794 } else {
795 io_bp->b_datap = bp->b_datap;
796#ifdef BUF_MAKE_PRIVATE
797 io_bp->b_data_store = bp;
798#endif
799 }
800 *(buf_t *)(&io_bp->b_orig) = bp;
801
802 lck_mtx_lock_spin(buf_mtxp);
803
804 io_bp->b_lflags |= BL_SHADOW;
805 io_bp->b_shadow = bp->b_shadow;
806 bp->b_shadow = io_bp;
807 bp->b_shadow_ref++;
808
809#ifdef BUF_MAKE_PRIVATE
810 if (external_storage)
811 io_bp->b_lflags |= BL_EXTERNAL;
812 else
813 bp->b_data_ref++;
814#endif
815 lck_mtx_unlock(buf_mtxp);
816 } else {
817 if (external_storage) {
818#ifdef BUF_MAKE_PRIVATE
819 io_bp->b_lflags |= BL_EXTERNAL;
820#endif
821 io_bp->b_bcount = bp->b_bcount;
822 io_bp->b_bufsize = bp->b_bufsize;
823 io_bp->b_datap = external_storage;
824 } else {
825 allocbuf(io_bp, bp->b_bcount);
826
827 io_bp->b_lflags |= BL_IOBUF_ALLOC;
828 }
829 bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
830
831#ifdef BUF_MAKE_PRIVATE
832 io_bp->b_data_store = NULL;
833#endif
834 }
835 KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
836
837 return (io_bp);
838}
839
840
841#ifdef BUF_MAKE_PRIVATE
842errno_t
843buf_make_private(buf_t bp)
844{
845 buf_t ds_bp;
846 buf_t t_bp;
847 struct buf my_buf;
848
849 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
850
851 if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
852
853 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
854 return (EINVAL);
855 }
856 my_buf.b_flags = B_META;
857 my_buf.b_datap = (uintptr_t)NULL;
858 allocbuf(&my_buf, bp->b_bcount);
859
860 bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
861
862 lck_mtx_lock_spin(buf_mtxp);
863
864 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
865 if ( !ISSET(bp->b_lflags, BL_EXTERNAL))
866 break;
867 }
868 ds_bp = t_bp;
869
870 if (ds_bp == NULL && bp->b_data_ref)
871 panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
872
873 if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
874 panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
875
876 if (ds_bp == NULL) {
877 lck_mtx_unlock(buf_mtxp);
878
879 buf_free_meta_store(&my_buf);
880
881 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
882 return (EINVAL);
883 }
884 for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
885 if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
886 t_bp->b_data_store = ds_bp;
887 }
888 ds_bp->b_data_ref = bp->b_data_ref;
889
890 bp->b_data_ref = 0;
891 bp->b_datap = my_buf.b_datap;
892
893 lck_mtx_unlock(buf_mtxp);
894
895 KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
896 return (0);
897}
898#endif
899
900
901void
902buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
903 void (**old_iodone)(buf_t, void *), void **old_transaction)
904{
905 if (old_iodone)
906 *old_iodone = bp->b_iodone;
907 if (old_transaction)
908 *old_transaction = bp->b_transaction;
909
910 bp->b_transaction = transaction;
911 bp->b_iodone = filter;
912 if (filter)
913 bp->b_flags |= B_FILTER;
914 else
915 bp->b_flags &= ~B_FILTER;
916}
917
918
919daddr64_t
920buf_blkno(buf_t bp) {
921
922 return (bp->b_blkno);
923}
924
925daddr64_t
926buf_lblkno(buf_t bp) {
927
928 return (bp->b_lblkno);
929}
930
931void
932buf_setblkno(buf_t bp, daddr64_t blkno) {
933
934 bp->b_blkno = blkno;
935}
936
937void
938buf_setlblkno(buf_t bp, daddr64_t lblkno) {
939
940 bp->b_lblkno = lblkno;
941}
942
943dev_t
944buf_device(buf_t bp) {
945
946 return (bp->b_dev);
947}
948
949errno_t
950buf_setdevice(buf_t bp, vnode_t vp) {
951
952 if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
953 return EINVAL;
954 bp->b_dev = vp->v_rdev;
955
956 return 0;
957}
958
959
960void *
961buf_drvdata(buf_t bp) {
962
963 return (bp->b_drvdata);
964}
965
966void
967buf_setdrvdata(buf_t bp, void *drvdata) {
968
969 bp->b_drvdata = drvdata;
970}
971
972void *
973buf_fsprivate(buf_t bp) {
974
975 return (bp->b_fsprivate);
976}
977
978void
979buf_setfsprivate(buf_t bp, void *fsprivate) {
980
981 bp->b_fsprivate = fsprivate;
982}
983
984kauth_cred_t
985buf_rcred(buf_t bp) {
986
987 return (bp->b_rcred);
988}
989
990kauth_cred_t
991buf_wcred(buf_t bp) {
992
993 return (bp->b_wcred);
994}
995
996void *
997buf_upl(buf_t bp) {
998
999 return (bp->b_upl);
1000}
1001
1002uint32_t
1003buf_uploffset(buf_t bp) {
1004
1005 return ((uint32_t)(bp->b_uploffset));
1006}
1007
1008proc_t
1009buf_proc(buf_t bp) {
1010
1011 return (bp->b_proc);
1012}
1013
1014
1015errno_t
1016buf_map(buf_t bp, caddr_t *io_addr)
1017{
1018 buf_t real_bp;
1019 vm_offset_t vaddr;
1020 kern_return_t kret;
1021
1022 if ( !(bp->b_flags & B_CLUSTER)) {
1023 *io_addr = (caddr_t)bp->b_datap;
1024 return (0);
1025 }
1026 real_bp = (buf_t)(bp->b_real_bp);
1027
1028 if (real_bp && real_bp->b_datap) {
1029 /*
1030 * b_real_bp is only valid if B_CLUSTER is SET
 1031 * if it's non-zero, then someone did a cluster_bp call;
1032 * if the backing physical pages were already mapped
1033 * in before the call to cluster_bp (non-zero b_datap),
 1034 * then we just use that mapping
1035 */
1036 *io_addr = (caddr_t)real_bp->b_datap;
1037 return (0);
1038 }
1039 kret = ubc_upl_map(bp->b_upl, &vaddr); /* Map it in */
1040
1041 if (kret != KERN_SUCCESS) {
1042 *io_addr = NULL;
1043
1044 return(ENOMEM);
1045 }
1046 vaddr += bp->b_uploffset;
1047
1048 *io_addr = (caddr_t)vaddr;
1049
1050 return (0);
1051}
1052
1053errno_t
1054buf_unmap(buf_t bp)
1055{
1056 buf_t real_bp;
1057 kern_return_t kret;
1058
1059 if ( !(bp->b_flags & B_CLUSTER))
1060 return (0);
1061 /*
1062 * see buf_map for the explanation
1063 */
1064 real_bp = (buf_t)(bp->b_real_bp);
1065
1066 if (real_bp && real_bp->b_datap)
1067 return (0);
1068
1069 if ((bp->b_lflags & BL_IOBUF) &&
1070 ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1071 /*
1072 * ignore pageins... the 'right' thing will
1073 * happen due to the way we handle speculative
1074 * clusters...
1075 *
1076 * when we commit these pages, we'll hit
1077 * it with UPL_COMMIT_INACTIVE which
1078 * will clear the reference bit that got
1079 * turned on when we touched the mapping
1080 */
1081 bp->b_flags |= B_AGE;
1082 }
1083 kret = ubc_upl_unmap(bp->b_upl);
1084
1085 if (kret != KERN_SUCCESS)
1086 return (EINVAL);
1087 return (0);
1088}
1089
1090
1091void
1092buf_clear(buf_t bp) {
1093 caddr_t baddr;
1094
1095 if (buf_map(bp, &baddr) == 0) {
1096 bzero(baddr, bp->b_bcount);
1097 buf_unmap(bp);
1098 }
1099 bp->b_resid = 0;
1100}
1101
1102/*
1103 * Read or write a buffer that is not contiguous on disk.
1104 * buffer is marked done/error at the conclusion
1105 */
1106static int
1107buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1108{
1109 vnode_t vp = buf_vnode(bp);
1110 buf_t io_bp; /* For reading or writing a single block */
1111 int io_direction;
1112 int io_resid;
1113 size_t io_contig_bytes;
1114 daddr64_t io_blkno;
1115 int error = 0;
1116 int bmap_flags;
1117
1118 /*
1119 * save our starting point... the bp was already mapped
1120 * in buf_strategy before we got called
1121 * no sense doing it again.
1122 */
1123 io_blkno = bp->b_blkno;
1124 /*
1125 * Make sure we redo this mapping for the next I/O
1126 * i.e. this can never be a 'permanent' mapping
1127 */
1128 bp->b_blkno = bp->b_lblkno;
1129
1130 /*
1131 * Get an io buffer to do the deblocking
1132 */
1133 io_bp = alloc_io_buf(devvp, 0);
1134
1135 io_bp->b_lblkno = bp->b_lblkno;
1136 io_bp->b_datap = bp->b_datap;
1137 io_resid = bp->b_bcount;
1138 io_direction = bp->b_flags & B_READ;
1139 io_contig_bytes = contig_bytes;
1140
1141 if (bp->b_flags & B_READ)
1142 bmap_flags = VNODE_READ;
1143 else
1144 bmap_flags = VNODE_WRITE;
1145
1146 for (;;) {
1147 if (io_blkno == -1)
1148 /*
 1149 * this is unexpected, but we'll allow for it
1150 */
1151 bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1152 else {
1153 io_bp->b_bcount = io_contig_bytes;
1154 io_bp->b_bufsize = io_contig_bytes;
1155 io_bp->b_resid = io_contig_bytes;
1156 io_bp->b_blkno = io_blkno;
1157
1158 buf_reset(io_bp, io_direction);
1159
1160 /*
 1161 * Call the device to do the I/O and wait for it. Make sure the appropriate party is charged for the write
1162 */
1163
1164 if (!ISSET(bp->b_flags, B_READ))
1165 OSAddAtomic(1, &devvp->v_numoutput);
1166
1167 if ((error = VNOP_STRATEGY(io_bp)))
1168 break;
1169 if ((error = (int)buf_biowait(io_bp)))
1170 break;
1171 if (io_bp->b_resid) {
1172 io_resid -= (io_contig_bytes - io_bp->b_resid);
1173 break;
1174 }
1175 }
1176 if ((io_resid -= io_contig_bytes) == 0)
1177 break;
1178 f_offset += io_contig_bytes;
1179 io_bp->b_datap += io_contig_bytes;
1180
1181 /*
1182 * Map the current position to a physical block number
1183 */
1184 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
1185 break;
1186 }
1187 buf_free(io_bp);
1188
1189 if (error)
1190 buf_seterror(bp, error);
1191 bp->b_resid = io_resid;
1192 /*
1193 * This I/O is now complete
1194 */
1195 buf_biodone(bp);
1196
1197 return error;
1198}
1199
1200
1201/*
1202 * struct vnop_strategy_args {
1203 * struct buf *a_bp;
1204 * } *ap;
1205 */
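/*
 * Editor's note: a minimal sketch, with hypothetical filesystem names, of
 * how a filesystem's own strategy entry point typically just forwards to
 * buf_strategy() with the device vnode recorded at mount time (this example
 * is not part of the original source):
 *
 *	static int
 *	examplefs_vnop_strategy(struct vnop_strategy_args *ap)
 *	{
 *		buf_t bp = ap->a_bp;
 *		struct examplefs_mount *emp = EXAMPLEFS_MOUNT(buf_vnode(bp));
 *
 *		return (buf_strategy(emp->em_devvp, ap));
 *	}
 */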
1206errno_t
1207buf_strategy(vnode_t devvp, void *ap)
1208{
1209 buf_t bp = ((struct vnop_strategy_args *)ap)->a_bp;
1210 vnode_t vp = bp->b_vp;
1211 int bmap_flags;
1212 errno_t error;
1213#if CONFIG_DTRACE
1214 int dtrace_io_start_flag = 0; /* We only want to trip the io:::start
1215 * probe once, with the true physical
1216 * block in place (b_blkno)
1217 */
1218
1219#endif
1220
1221 if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
1222 panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1223 /*
 1224 * associate the physical device
 1225 * with this buf_t even if we don't
1226 * end up issuing the I/O...
1227 */
1228 bp->b_dev = devvp->v_rdev;
1229
1230 if (bp->b_flags & B_READ)
1231 bmap_flags = VNODE_READ;
1232 else
1233 bmap_flags = VNODE_WRITE;
1234
1235 if ( !(bp->b_flags & B_CLUSTER)) {
1236
1237 if ( (bp->b_upl) ) {
1238 /*
1239 * we have a UPL associated with this bp
1240 * go through cluster_bp which knows how
1241 * to deal with filesystem block sizes
1242 * that aren't equal to the page size
1243 */
1244 DTRACE_IO1(start, buf_t, bp);
1245 return (cluster_bp(bp));
1246 }
1247 if (bp->b_blkno == bp->b_lblkno) {
1248 off_t f_offset;
1249 size_t contig_bytes;
1250
1251 if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1252 DTRACE_IO1(start, buf_t, bp);
1253 buf_seterror(bp, error);
1254 buf_biodone(bp);
1255
1256 return (error);
1257 }
1258
1259 if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1260 DTRACE_IO1(start, buf_t, bp);
1261 buf_seterror(bp, error);
1262 buf_biodone(bp);
1263
1264 return (error);
1265 }
1266
1267 DTRACE_IO1(start, buf_t, bp);
1268#if CONFIG_DTRACE
1269 dtrace_io_start_flag = 1;
1270#endif /* CONFIG_DTRACE */
1271
1272 if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1273 /* Set block number to force biodone later */
1274 bp->b_blkno = -1;
1275 buf_clear(bp);
1276 }
1277 else if ((long)contig_bytes < bp->b_bcount) {
1278 return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
1279 }
1280 }
1281
1282#if CONFIG_DTRACE
1283 if (dtrace_io_start_flag == 0) {
1284 DTRACE_IO1(start, buf_t, bp);
1285 dtrace_io_start_flag = 1;
1286 }
1287#endif /* CONFIG_DTRACE */
1288
1289 if (bp->b_blkno == -1) {
1290 buf_biodone(bp);
1291 return (0);
1292 }
1293 }
1294
1295#if CONFIG_DTRACE
1296 if (dtrace_io_start_flag == 0)
1297 DTRACE_IO1(start, buf_t, bp);
1298#endif /* CONFIG_DTRACE */
1299
1300#if CONFIG_PROTECT
1301 /* Capture f_offset in the bufattr*/
1302 if (bp->b_attr.ba_cpentry != 0) {
1303 /* No need to go here for older EAs */
1304 if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
1305 off_t f_offset;
1306 if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
1307 return error;
1308
1309 /*
1310 * Attach the file offset to this buffer. The
1311 * bufattr attributes will be passed down the stack
1312 * until they reach IOFlashStorage. IOFlashStorage
1313 * will retain the offset in a local variable when it
1314 * issues its I/Os to the NAND controller.
1315 *
1316 * Note that LwVM may end up splitting this I/O
1317 * into sub-I/Os if it crosses a chunk boundary. In this
1318 * case, LwVM will update this field when it dispatches
1319 * each I/O to IOFlashStorage. But from our perspective
1320 * we have only issued a single I/O.
1321 */
1322 bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset);
1323 }
1324 }
1325#endif
1326
1327 /*
1328 * we can issue the I/O because...
1329 * either B_CLUSTER is set which
1330 * means that the I/O is properly set
1331 * up to be a multiple of the page size, or
1332 * we were able to successfully set up the
1333 * physical block mapping
1334 */
1335 error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1336 DTRACE_FSINFO(strategy, vnode_t, vp);
1337 return (error);
1338}
1339
1340
1341
1342buf_t
1343buf_alloc(vnode_t vp)
1344{
1345 return(alloc_io_buf(vp, 0));
1346}
1347
1348void
1349buf_free(buf_t bp) {
1350
1351 free_io_buf(bp);
1352}
1353
1354
1355/*
1356 * iterate buffers for the specified vp.
1357 * if BUF_SCAN_DIRTY is set, do the dirty list
1358 * if BUF_SCAN_CLEAN is set, do the clean list
1359 * if neither flag is set, default to BUF_SCAN_DIRTY
1360 * if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1361 */
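/*
 * Editor's note: a hedged usage sketch (not part of the original source).
 * The callout returns BUF_RETURNED to have buf_iterate release the buffer,
 * BUF_CLAIMED if it disposed of the buffer itself, or one of the *_DONE
 * variants to stop the iteration early.  Pushing every dirty buffer of a
 * vnode to disk might look like:
 *
 *	static int
 *	example_flush_callout(buf_t bp, __unused void *arg)
 *	{
 *		buf_bawrite(bp);	(the async write consumes the buffer)
 *		return (BUF_CLAIMED);
 *	}
 *
 *	buf_iterate(vp, example_flush_callout, BUF_SCAN_DIRTY | BUF_SKIP_LOCKED, NULL);
 */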
1362
1363struct buf_iterate_info_t {
1364 int flag;
1365 struct buflists *listhead;
1366};
1367
1368void
1369buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1370{
1371 buf_t bp;
1372 int retval;
1373 struct buflists local_iterblkhd;
1374 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1375 int notify_busy = flags & BUF_NOTIFY_BUSY;
1376 struct buf_iterate_info_t list[2];
1377 int num_lists, i;
1378
1379 if (flags & BUF_SKIP_LOCKED)
1380 lock_flags |= BAC_SKIP_LOCKED;
1381 if (flags & BUF_SKIP_NONLOCKED)
1382 lock_flags |= BAC_SKIP_NONLOCKED;
1383
1384 if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
1385 flags |= BUF_SCAN_DIRTY;
1386
1387 num_lists = 0;
1388
1389 if (flags & BUF_SCAN_DIRTY) {
1390 list[num_lists].flag = VBI_DIRTY;
1391 list[num_lists].listhead = &vp->v_dirtyblkhd;
1392 num_lists++;
1393 }
1394 if (flags & BUF_SCAN_CLEAN) {
1395 list[num_lists].flag = VBI_CLEAN;
1396 list[num_lists].listhead = &vp->v_cleanblkhd;
1397 num_lists++;
1398 }
1399
1400 for (i = 0; i < num_lists; i++) {
1401 lck_mtx_lock(buf_mtxp);
1402
1403 if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag)) {
1404 lck_mtx_unlock(buf_mtxp);
1405 continue;
1406 }
1407 while (!LIST_EMPTY(&local_iterblkhd)) {
1408 bp = LIST_FIRST(&local_iterblkhd);
1409 LIST_REMOVE(bp, b_vnbufs);
1410 LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1411
1412 if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1413 if (notify_busy) {
1414 bp = NULL;
1415 } else {
1416 continue;
1417 }
1418 }
1419
1420 lck_mtx_unlock(buf_mtxp);
1421
1422 retval = callout(bp, arg);
1423
1424 switch (retval) {
1425 case BUF_RETURNED:
1426 if (bp)
1427 buf_brelse(bp);
1428 break;
1429 case BUF_CLAIMED:
1430 break;
1431 case BUF_RETURNED_DONE:
1432 if (bp)
1433 buf_brelse(bp);
1434 lck_mtx_lock(buf_mtxp);
1435 goto out;
1436 case BUF_CLAIMED_DONE:
1437 lck_mtx_lock(buf_mtxp);
1438 goto out;
1439 }
1440 lck_mtx_lock(buf_mtxp);
1441 } /* while list has more nodes */
1442 out:
1443 buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1444 lck_mtx_unlock(buf_mtxp);
1445 } /* for each list */
1446} /* buf_iterate */
1447
1448
1449/*
1450 * Flush out and invalidate all buffers associated with a vnode.
1451 */
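/*
 * Editor's note (hedged usage sketch, not part of the original source):
 * a filesystem typically calls this when tearing down or truncating a
 * vnode; passing BUF_WRITE_DATA flushes delayed writes before the buffers
 * are invalidated:
 *
 *	if ((error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)))
 *		return (error);
 */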
1452int
1453buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1454{
1455 buf_t bp;
1456 int aflags;
1457 int error = 0;
1458 int must_rescan = 1;
1459 struct buflists local_iterblkhd;
1460
1461
1462 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1463 return (0);
1464
1465 lck_mtx_lock(buf_mtxp);
1466
1467 for (;;) {
1468 if (must_rescan == 0)
1469 /*
1470 * the lists may not be empty, but all that's left at this
1471 * point are metadata or B_LOCKED buffers which are being
1472 * skipped... we know this because we made it through both
1473 * the clean and dirty lists without dropping buf_mtxp...
1474 * each time we drop buf_mtxp we bump "must_rescan"
1475 */
1476 break;
1477 if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1478 break;
1479 must_rescan = 0;
1480 /*
1481 * iterate the clean list
1482 */
1483 if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1484 goto try_dirty_list;
1485 }
1486 while (!LIST_EMPTY(&local_iterblkhd)) {
1487
1488 bp = LIST_FIRST(&local_iterblkhd);
1489
1490 LIST_REMOVE(bp, b_vnbufs);
1491 LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1492
1493 /*
1494 * some filesystems distinguish meta data blocks with a negative logical block #
1495 */
1496 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1497 continue;
1498
1499 aflags = BAC_REMOVE;
1500
1501 if ( !(flags & BUF_INVALIDATE_LOCKED) )
1502 aflags |= BAC_SKIP_LOCKED;
1503
1504 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1505 if (error == EDEADLK)
1506 /*
1507 * this buffer was marked B_LOCKED...
 1508 * we didn't drop buf_mtxp, so
 1509 * we don't need to rescan
1510 */
1511 continue;
1512 if (error == EAGAIN) {
1513 /*
1514 * found a busy buffer... we blocked and
1515 * dropped buf_mtxp, so we're going to
1516 * need to rescan after this pass is completed
1517 */
1518 must_rescan++;
1519 continue;
1520 }
1521 /*
1522 * got some kind of 'real' error out of the msleep
1523 * in buf_acquire_locked, terminate the scan and return the error
1524 */
1525 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1526
1527 lck_mtx_unlock(buf_mtxp);
1528 return (error);
1529 }
1530 lck_mtx_unlock(buf_mtxp);
1531
1532 if (bp->b_flags & B_LOCKED)
1533 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1534
1535 CLR(bp->b_flags, B_LOCKED);
1536 SET(bp->b_flags, B_INVAL);
1537 buf_brelse(bp);
1538
1539 lck_mtx_lock(buf_mtxp);
1540
1541 /*
1542 * by dropping buf_mtxp, we allow new
1543 * buffers to be added to the vnode list(s)
1544 * we'll have to rescan at least once more
1545 * if the queues aren't empty
1546 */
1547 must_rescan++;
1548 }
1549 buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1550
1551try_dirty_list:
1552 /*
1553 * Now iterate on dirty blks
1554 */
1555 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1556 continue;
1557 }
1558 while (!LIST_EMPTY(&local_iterblkhd)) {
1559 bp = LIST_FIRST(&local_iterblkhd);
1560
1561 LIST_REMOVE(bp, b_vnbufs);
1562 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1563
1564 /*
1565 * some filesystems distinguish meta data blocks with a negative logical block #
1566 */
1567 if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1568 continue;
1569
1570 aflags = BAC_REMOVE;
1571
1572 if ( !(flags & BUF_INVALIDATE_LOCKED) )
1573 aflags |= BAC_SKIP_LOCKED;
1574
1575 if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1576 if (error == EDEADLK)
1577 /*
1578 * this buffer was marked B_LOCKED...
 1579 * we didn't drop buf_mtxp, so
 1580 * we don't need to rescan
1581 */
1582 continue;
1583 if (error == EAGAIN) {
1584 /*
1585 * found a busy buffer... we blocked and
1586 * dropped buf_mtxp, so we're going to
1587 * need to rescan after this pass is completed
1588 */
1589 must_rescan++;
1590 continue;
1591 }
1592 /*
1593 * got some kind of 'real' error out of the msleep
1594 * in buf_acquire_locked, terminate the scan and return the error
1595 */
1596 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1597
1598 lck_mtx_unlock(buf_mtxp);
1599 return (error);
1600 }
1601 lck_mtx_unlock(buf_mtxp);
1602
1603 if (bp->b_flags & B_LOCKED)
1604 KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1605
1606 CLR(bp->b_flags, B_LOCKED);
1607 SET(bp->b_flags, B_INVAL);
1608
1609 if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1610 (void) VNOP_BWRITE(bp);
1611 else
1612 buf_brelse(bp);
1613
1614 lck_mtx_lock(buf_mtxp);
1615 /*
1616 * by dropping buf_mtxp, we allow new
1617 * buffers to be added to the vnode list(s)
1618 * we'll have to rescan at least once more
1619 * if the queues aren't empty
1620 */
1621 must_rescan++;
1622 }
1623 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1624 }
1625 lck_mtx_unlock(buf_mtxp);
1626
1627 return (0);
1628}
1629
1630void
1631buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1632
1633 (void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1634 return;
1635}
1636
1637int
1638buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
1639 buf_t bp;
1640 int writes_issued = 0;
1641 errno_t error;
1642 int busy = 0;
1643 struct buflists local_iterblkhd;
1644 int lock_flags = BAC_NOWAIT | BAC_REMOVE;
1645 int any_locked = 0;
1646
1647 if (flags & BUF_SKIP_LOCKED)
1648 lock_flags |= BAC_SKIP_LOCKED;
1649 if (flags & BUF_SKIP_NONLOCKED)
1650 lock_flags |= BAC_SKIP_NONLOCKED;
1651loop:
1652 lck_mtx_lock(buf_mtxp);
1653
1654 if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0) {
1655 while (!LIST_EMPTY(&local_iterblkhd)) {
1656 bp = LIST_FIRST(&local_iterblkhd);
1657 LIST_REMOVE(bp, b_vnbufs);
1658 LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1659
1660 if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1661 busy++;
1662 }
1663 if (error) {
1664 /*
1665 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
 1666 * we may want to do something differently if a locked or unlocked
1667 * buffer was encountered (depending on the arg specified).
1668 * In this case, we know that one of those two was set, and the
1669 * buf acquisition failed above.
1670 *
1671 * If it failed with EDEADLK, then save state which can be emitted
1672 * later on to the caller. Most callers should not care.
1673 */
1674 if (error == EDEADLK) {
1675 any_locked++;
1676 }
1677 continue;
1678 }
1679 lck_mtx_unlock(buf_mtxp);
1680
1681 bp->b_flags &= ~B_LOCKED;
1682
1683 /*
1684 * Wait for I/O associated with indirect blocks to complete,
1685 * since there is no way to quickly wait for them below.
1686 */
1687 if ((bp->b_vp == vp) || (wait == 0))
1688 (void) buf_bawrite(bp);
1689 else
1690 (void) VNOP_BWRITE(bp);
1691 writes_issued++;
1692
1693 lck_mtx_lock(buf_mtxp);
1694 }
1695 buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1696 }
1697 lck_mtx_unlock(buf_mtxp);
1698
1699 if (wait) {
1700 (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1701
1702 if (vp->v_dirtyblkhd.lh_first && busy) {
1703 /*
1704 * we had one or more BUSY buffers on
1705 * the dirtyblock list... most likely
1706 * these are due to delayed writes that
1707 * were moved to the bclean queue but
1708 * have not yet been 'written'.
1709 * if we issued some writes on the
1710 * previous pass, we try again immediately
1711 * if we didn't, we'll sleep for some time
1712 * to allow the state to change...
1713 */
1714 if (writes_issued == 0) {
1715 (void)tsleep((caddr_t)&vp->v_numoutput,
1716 PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1717 }
1718 writes_issued = 0;
1719 busy = 0;
1720
1721 goto loop;
1722 }
1723 }
1724
1725 return any_locked;
1726}
1727
1728
1729/*
1730 * called with buf_mtxp held...
1731 * this lock protects the queue manipulation
1732 */
1733static int
1734buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1735{
1736 struct buflists * listheadp;
1737
1738 if (flags & VBI_DIRTY)
1739 listheadp = &vp->v_dirtyblkhd;
1740 else
1741 listheadp = &vp->v_cleanblkhd;
1742
1743 while (vp->v_iterblkflags & VBI_ITER) {
1744 vp->v_iterblkflags |= VBI_ITERWANT;
1745 msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1746 }
1747 if (LIST_EMPTY(listheadp)) {
1748 LIST_INIT(iterheadp);
1749 return(EINVAL);
1750 }
1751 vp->v_iterblkflags |= VBI_ITER;
1752
1753 iterheadp->lh_first = listheadp->lh_first;
1754 listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1755 LIST_INIT(listheadp);
1756
1757 return(0);
1758}
1759
1760/*
1761 * called with buf_mtxp held...
1762 * this lock protects the queue manipulation
1763 */
1764static void
1765buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1766{
1767 struct buflists * listheadp;
1768 buf_t bp;
1769
1770 if (flags & VBI_DIRTY)
1771 listheadp = &vp->v_dirtyblkhd;
1772 else
1773 listheadp = &vp->v_cleanblkhd;
1774
1775 while (!LIST_EMPTY(iterheadp)) {
1776 bp = LIST_FIRST(iterheadp);
1777 LIST_REMOVE(bp, b_vnbufs);
1778 LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1779 }
1780 vp->v_iterblkflags &= ~VBI_ITER;
1781
1782 if (vp->v_iterblkflags & VBI_ITERWANT) {
1783 vp->v_iterblkflags &= ~VBI_ITERWANT;
1784 wakeup(&vp->v_iterblkflags);
1785 }
1786}
1787
1788
1789static void
1790bremfree_locked(buf_t bp)
1791{
1792 struct bqueues *dp = NULL;
1793 int whichq;
1794
1795 whichq = bp->b_whichq;
1796
1797 if (whichq == -1) {
1798 if (bp->b_shadow_ref == 0)
1799 panic("bremfree_locked: %p not on freelist", bp);
1800 /*
1801 * there are clones pointing to 'bp'...
1802 * therefore, it was not put on a freelist
1803 * when buf_brelse was last called on 'bp'
1804 */
1805 return;
1806 }
1807 /*
1808 * We only calculate the head of the freelist when removing
1809 * the last element of the list as that is the only time that
1810 * it is needed (e.g. to reset the tail pointer).
1811 *
1812 * NB: This makes an assumption about how tailq's are implemented.
1813 */
1814 if (bp->b_freelist.tqe_next == NULL) {
1815 dp = &bufqueues[whichq];
1816
1817 if (dp->tqh_last != &bp->b_freelist.tqe_next)
1818 panic("bremfree: lost tail");
1819 }
1820 TAILQ_REMOVE(dp, bp, b_freelist);
1821
1822#if BALANCE_QUEUES
1823 bufqdec(whichq);
1824#endif
1825 if (whichq == BQ_LAUNDRY)
1826 blaundrycnt--;
1827
1828 bp->b_whichq = -1;
1829 bp->b_timestamp = 0;
1830 bp->b_shadow = 0;
1831}
1832
1833/*
1834 * Associate a buffer with a vnode.
1835 * buf_mtxp must be locked on entry
1836 */
1837static void
1838bgetvp_locked(vnode_t vp, buf_t bp)
1839{
1840
1841 if (bp->b_vp != vp)
1842 panic("bgetvp_locked: not free");
1843
1844 if (vp->v_type == VBLK || vp->v_type == VCHR)
1845 bp->b_dev = vp->v_rdev;
1846 else
1847 bp->b_dev = NODEV;
1848 /*
1849 * Insert onto list for new vnode.
1850 */
1851 bufinsvn(bp, &vp->v_cleanblkhd);
1852}
1853
1854/*
1855 * Disassociate a buffer from a vnode.
1856 * buf_mtxp must be locked on entry
1857 */
1858static void
1859brelvp_locked(buf_t bp)
1860{
1861 /*
1862 * Delete from old vnode list, if on one.
1863 */
1864 if (bp->b_vnbufs.le_next != NOLIST)
1865 bufremvn(bp);
1866
1867 bp->b_vp = (vnode_t)NULL;
1868}
1869
1870/*
1871 * Reassign a buffer from one vnode to another.
1872 * Used to assign file specific control information
1873 * (indirect blocks) to the vnode to which they belong.
1874 */
1875static void
1876buf_reassign(buf_t bp, vnode_t newvp)
1877{
1878 struct buflists *listheadp;
1879
1880 if (newvp == NULL) {
1881 printf("buf_reassign: NULL");
1882 return;
1883 }
1884 lck_mtx_lock_spin(buf_mtxp);
1885
1886 /*
1887 * Delete from old vnode list, if on one.
1888 */
1889 if (bp->b_vnbufs.le_next != NOLIST)
1890 bufremvn(bp);
1891 /*
1892 * If dirty, put on list of dirty buffers;
1893 * otherwise insert onto list of clean buffers.
1894 */
1895 if (ISSET(bp->b_flags, B_DELWRI))
1896 listheadp = &newvp->v_dirtyblkhd;
1897 else
1898 listheadp = &newvp->v_cleanblkhd;
1899 bufinsvn(bp, listheadp);
1900
1901 lck_mtx_unlock(buf_mtxp);
1902}
1903
1904static __inline__ void
1905bufhdrinit(buf_t bp)
1906{
1907 bzero((char *)bp, sizeof *bp);
1908 bp->b_dev = NODEV;
1909 bp->b_rcred = NOCRED;
1910 bp->b_wcred = NOCRED;
1911 bp->b_vnbufs.le_next = NOLIST;
1912 bp->b_flags = B_INVAL;
1913
1914 return;
1915}
1916
1917/*
1918 * Initialize buffers and hash links for buffers.
1919 */
1920__private_extern__ void
1921bufinit(void)
1922{
1923 buf_t bp;
1924 struct bqueues *dp;
1925 int i;
1926
1927 nbuf_headers = 0;
1928 /* Initialize the buffer queues ('freelists') and the hash table */
1929 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1930 TAILQ_INIT(dp);
1931 bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1932
1933 buf_busycount = 0;
1934
1935 /* Initialize the buffer headers */
1936 for (i = 0; i < max_nbuf_headers; i++) {
1937 nbuf_headers++;
1938 bp = &buf_headers[i];
1939 bufhdrinit(bp);
1940
1941 BLISTNONE(bp);
1942 dp = &bufqueues[BQ_EMPTY];
1943 bp->b_whichq = BQ_EMPTY;
1944 bp->b_timestamp = buf_timestamp();
1945 binsheadfree(bp, dp, BQ_EMPTY);
1946 binshash(bp, &invalhash);
1947 }
1948 boot_nbuf_headers = nbuf_headers;
1949
1950 TAILQ_INIT(&iobufqueue);
1951 TAILQ_INIT(&delaybufqueue);
1952
1953 for (; i < nbuf_headers + niobuf_headers; i++) {
1954 bp = &buf_headers[i];
1955 bufhdrinit(bp);
1956 bp->b_whichq = -1;
1957 binsheadfree(bp, &iobufqueue, -1);
1958 }
1959
1960 /*
1961 * allocate lock group attribute and group
1962 */
1963 buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1964 buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1965
1966 /*
1967 * allocate the lock attribute
1968 */
1969 buf_mtx_attr = lck_attr_alloc_init();
1970
1971 /*
1972 * allocate and initialize mutex's for the buffer and iobuffer pools
1973 */
1974 buf_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1975 iobuffer_mtxp = lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1976
1977 if (iobuffer_mtxp == NULL)
1978 panic("couldn't create iobuffer mutex");
1979
1980 if (buf_mtxp == NULL)
1981 panic("couldn't create buf mutex");
1982
1983 /*
1984 * allocate and initialize cluster specific global locks...
1985 */
1986 cluster_init();
1987
1988 printf("using %d buffer headers and %d cluster IO buffer headers\n",
1989 nbuf_headers, niobuf_headers);
1990
1991 /* Set up zones used by the buffer cache */
1992 bufzoneinit();
1993
1994 /* start the bcleanbuf() thread */
1995 bcleanbuf_thread_init();
1996
1997 /* Register a callout for relieving vm pressure */
1998 if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
1999 panic("Couldn't register buffer cache callout for vm pressure!\n");
2000 }
2001
2002#if BALANCE_QUEUES
2003 {
2004 static void bufq_balance_thread_init(void);
2005 /* create a thread to do dynamic buffer queue balancing */
2006 bufq_balance_thread_init();
2007 }
 2008#endif /* BALANCE_QUEUES */
2009}
2010
2011
2012
2013/*
2014 * Zones for the meta data buffers
2015 */
2016
2017#define MINMETA 512
2018#define MAXMETA 8192
2019
2020struct meta_zone_entry {
2021 zone_t mz_zone;
2022 vm_size_t mz_size;
2023 vm_size_t mz_max;
2024 const char *mz_name;
2025};
2026
2027struct meta_zone_entry meta_zones[] = {
2028 {NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2029 {NULL, (MINMETA * 2), 64 * (MINMETA * 2), "buf.1024" },
2030 {NULL, (MINMETA * 4), 16 * (MINMETA * 4), "buf.2048" },
2031 {NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2032 {NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2033 {NULL, 0, 0, "" } /* End */
2034};
2035
2036/*
2037 * Initialize the meta data zones
2038 */
2039static void
2040bufzoneinit(void)
2041{
2042 int i;
2043
2044 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2045 meta_zones[i].mz_zone =
2046 zinit(meta_zones[i].mz_size,
2047 meta_zones[i].mz_max,
2048 PAGE_SIZE,
2049 meta_zones[i].mz_name);
2050 zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2051 }
2052 buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2053 zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2054}
2055
2056static __inline__ zone_t
2057getbufzone(size_t size)
2058{
2059 int i;
2060
2061 if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2062 panic("getbufzone: incorect size = %lu", size);
2063
2064 for (i = 0; meta_zones[i].mz_size != 0; i++) {
2065 if (meta_zones[i].mz_size >= size)
2066 break;
2067 }
2068
2069 return (meta_zones[i].mz_zone);
2070}
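/*
 * Editor's note (illustrative, not part of the original source):
 * getbufzone() hands back the first zone whose element size covers the
 * request, so a 1024-byte metadata buffer comes from "buf.1024" while a
 * 3072-byte request rounds up to "buf.4096".  Sizes must be 512-byte
 * multiples in the range [MINMETA, MAXMETA]:
 *
 *	zone_t z = getbufzone(3072);	(selects the "buf.4096" zone)
 */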
2071
2072
2073
2074static struct buf *
2075bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2076{
2077 buf_t bp;
2078
2079 bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2080
2081 /*
 2082 * If the buffer does not have valid data, start a read.
 2083 * Note that if the buffer is B_INVAL, buf_getblk() won't return it.
 2084 * Therefore, it's valid if its I/O has completed or been delayed.
2085 */
2086 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2087 struct proc *p;
2088
2089 p = current_proc();
2090
2091 /* Start I/O for the buffer (keeping credentials). */
2092 SET(bp->b_flags, B_READ | async);
2093 if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2094 kauth_cred_ref(cred);
2095 bp->b_rcred = cred;
2096 }
2097
2098 VNOP_STRATEGY(bp);
2099
2100 trace(TR_BREADMISS, pack(vp, size), blkno);
2101
2102 /* Pay for the read. */
2103 if (p && p->p_stats) {
2104 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock); /* XXX */
2105 OSAddAtomic64(size, &p->p_stats->ri_diskiobytes.ri_bytesread);
2106 }
2107
2108 if (async) {
2109 /*
2110 * since we asked for an ASYNC I/O
2111 * the biodone will do the brelse
2112 * we don't want to pass back a bp
2113 * that we don't 'own'
2114 */
2115 bp = NULL;
2116 }
2117 } else if (async) {
2118 buf_brelse(bp);
2119 bp = NULL;
2120 }
2121
2122 trace(TR_BREADHIT, pack(vp, size), blkno);
2123
2124 return (bp);
2125}
2126
2127/*
2128 * Perform the reads for buf_breadn() and buf_meta_breadn().
2129 * Trivial modification to the breada algorithm presented in Bach (p.55).
2130 */
2131static errno_t
2132do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2133 int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2134{
2135 buf_t bp;
2136 int i;
2137
2138 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2139
2140 /*
2141 * For each of the read-ahead blocks, start a read, if necessary.
2142 */
2143 for (i = 0; i < nrablks; i++) {
2144 /* If it's in the cache, just go on to next one. */
2145 if (incore(vp, rablks[i]))
2146 continue;
2147
2148 /* Get a buffer for the read-ahead block */
2149 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2150 }
2151
2152 /* Otherwise, we had to start a read for it; wait until it's valid. */
2153 return (buf_biowait(bp));
2154}
2155
2156
2157/*
2158 * Read a disk block.
 2159 * This algorithm is described in Bach (p.54).
2160 */
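/*
 * Editor's note: a minimal, hedged usage sketch of the classic bread/brelse
 * pairing (not part of the original source; vp, blkno and blksize are
 * assumed to come from the caller):
 *
 *	buf_t	bp;
 *	char	*datap;
 *	errno_t	error;
 *
 *	if ((error = buf_bread(vp, blkno, blksize, NOCRED, &bp))) {
 *		buf_brelse(bp);		(a buffer is returned even on error)
 *		return (error);
 *	}
 *	datap = (char *)buf_dataptr(bp);
 *	... use 'datap' for up to buf_count(bp) bytes ...
 *	buf_brelse(bp);
 */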
2161errno_t
2162buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2163{
2164 buf_t bp;
2165
2166 /* Get buffer for block. */
2167 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2168
2169 /* Wait for the read to complete, and return result. */
2170 return (buf_biowait(bp));
2171}
2172
2173/*
2174 * Read a disk block. [bread() for meta-data]
 2175 * This algorithm is described in Bach (p.54).
2176 */
2177errno_t
2178buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2179{
2180 buf_t bp;
2181
2182 /* Get buffer for block. */
2183 bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2184
2185 /* Wait for the read to complete, and return result. */
2186 return (buf_biowait(bp));
2187}
2188
2189/*
2190 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2191 */
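/*
 * Editor's note: hedged sketch (not part of the original source).  The
 * caller passes parallel arrays of read-ahead block numbers and sizes;
 * only the first block is read synchronously and waited for:
 *
 *	daddr64_t rablks[2] = { blkno + 1, blkno + 2 };
 *	int	  rasizes[2] = { blksize, blksize };
 *	buf_t	  bp;
 *
 *	error = buf_breadn(vp, blkno, blksize, rablks, rasizes, 2, NOCRED, &bp);
 */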
2192errno_t
2193buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2194{
2195 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2196}
2197
2198/*
2199 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2200 * [buf_breadn() for meta-data]
2201 */
2202errno_t
2203buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2204{
2205 return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
2206}
2207
2208/*
2209 * Block write. Described in Bach (p.56)
2210 */
2211errno_t
2212buf_bwrite(buf_t bp)
2213{
2214 int sync, wasdelayed;
2215 errno_t rv;
2216 proc_t p = current_proc();
2217 vnode_t vp = bp->b_vp;
2218
2219 if (bp->b_datap == 0) {
2220 if (brecover_data(bp) == 0)
2221 return (0);
2222 }
2223 /* Remember buffer type, to switch on it later. */
2224 sync = !ISSET(bp->b_flags, B_ASYNC);
2225 wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2226 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2227
2228 if (wasdelayed)
2229 OSAddAtomicLong(-1, &nbdwrite);
2230
2231 if (!sync) {
2232 /*
2233 * If not synchronous, pay for the I/O operation and make
2234 * sure the buf is on the correct vnode queue. We have
2235 * to do this now, because if we don't, the vnode may not
2236 * be properly notified that its I/O has completed.
2237 */
2238 if (wasdelayed)
2239 buf_reassign(bp, vp);
2240 else
2241 if (p && p->p_stats) {
2242 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2243 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2244 }
2245 }
2246 trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2247
2248 /* Initiate disk write. Make sure the appropriate party is charged. */
2249
2250 OSAddAtomic(1, &vp->v_numoutput);
2251
2252 VNOP_STRATEGY(bp);
2253
2254 if (sync) {
2255 /*
2256 * If I/O was synchronous, wait for it to complete.
2257 */
2258 rv = buf_biowait(bp);
2259
2260 /*
2261 * Pay for the I/O operation, if it's not been paid for, and
2262 * make sure it's on the correct vnode queue. (async operations
2263 * were paid for above.)
2264 */
2265 if (wasdelayed)
2266 buf_reassign(bp, vp);
2267 else
2268 if (p && p->p_stats) {
2269 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2270 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2271 }
2272
2273 /* Release the buffer. */
2274 // XXXdbg - only if the unused bit is set
2275 if (!ISSET(bp->b_flags, B_NORELSE)) {
2276 buf_brelse(bp);
2277 } else {
2278 CLR(bp->b_flags, B_NORELSE);
2279 }
2280
2281 return (rv);
2282 } else {
2283 return (0);
2284 }
2285}
2286
2287int
2288vn_bwrite(struct vnop_bwrite_args *ap)
2289{
2290 return (buf_bwrite(ap->a_bp));
2291}
2292
2293/*
2294 * Delayed write.
2295 *
2296 * The buffer is marked dirty, but is not queued for I/O.
2297 * This routine should be used when the buffer is expected
2298 * to be modified again soon, typically a small write that
2299 * partially fills a buffer.
2300 *
2301 * NB: magnetic tapes cannot be delayed; they must be
2302 * written in the order that the writes are requested.
2303 *
2304 * Described in Leffler, et al. (pp. 208-213).
2305 *
2306 * Note: With the ability to allocate additional buffer
2307 * headers, we can get into a situation where "too many"
2308 * buf_bdwrite()s let the kernel create buffers faster than the
2309 * disks can service them. Doing a buf_bawrite() in
2310 * cases where we have "too many" outstanding buf_bdwrite()s avoids that.
2311 */
2312__private_extern__ int
2313bdwrite_internal(buf_t bp, int return_error)
2314{
2315 proc_t p = current_proc();
2316 vnode_t vp = bp->b_vp;
2317
2318 /*
2319 * If the block hasn't been seen before:
2320 * (1) Mark it as having been seen,
2321 * (2) Charge for the write.
2322 * (3) Make sure it's on its vnode's correct block list,
2323 */
2324 if (!ISSET(bp->b_flags, B_DELWRI)) {
2325 SET(bp->b_flags, B_DELWRI);
2326 if (p && p->p_stats) {
2327 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock); /* XXX */
2328 OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2329 }
2330 OSAddAtomicLong(1, &nbdwrite);
2331 buf_reassign(bp, vp);
2332 }
2333
2334 /*
2335 * If we're not LOCKED, but the total number of delayed writes
2336 * has climbed above 75% of the total buffers in the system,
2337 * return an error if the caller has indicated that it can
2338 * handle one in this case; otherwise schedule the I/O now.
2339 * This is done to prevent us from allocating tons of extra
2340 * buffers when dealing with virtual disks (e.g. DiskImages),
2341 * because additional buffers are dynamically allocated to prevent
2342 * deadlocks from occurring.
2343 *
2344 * However, we can't do a buf_bawrite() if the LOCKED bit is set, because
2345 * the buffer is part of a transaction and can't go to disk until
2346 * the LOCKED bit is cleared.
2347 */
2348 if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
2349 if (return_error)
2350 return (EAGAIN);
2351 /*
2352 * If the vnode has "too many" write operations in progress
2353 * wait for them to finish the IO
2354 */
2355 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2356
2357 return (buf_bawrite(bp));
2358 }
2359
2360 /* Otherwise, the "write" is done, so mark and release the buffer. */
2361 SET(bp->b_flags, B_DONE);
2362 buf_brelse(bp);
2363 return (0);
2364}
2365
2366errno_t
2367buf_bdwrite(buf_t bp)
2368{
2369 return (bdwrite_internal(bp, 0));
2370}
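
/*
 * Illustrative sketch (the vnode, block number, 4096-byte block size and
 * the byte being modified are assumptions for the example): a hypothetical
 * small, partial update that uses buf_bdwrite(), above, to defer the disk
 * I/O instead of writing synchronously.
 */
#if 0	/* example only */
static errno_t
example_delayed_write(vnode_t vp, daddr64_t blkno)
{
	buf_t bp;
	char *datap;

	/* gather the block for writing; bp is returned busy */
	bp = buf_getblk(vp, blkno, 4096, 0, 0, BLK_WRITE);
	datap = (char *)buf_dataptr(bp);

	datap[0] = 0x7f;	/* small change that only partially fills the buffer */

	/* mark dirty and release; the actual write happens later */
	return (buf_bdwrite(bp));
}
#endif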
2371
2372
2373/*
2374 * Asynchronous block write; just an asynchronous buf_bwrite().
2375 *
2376 * Note: With the ability to allocate additional buffer
2377 * headers, we can get into a situation where "too many"
2378 * buf_bawrite()s let the kernel create buffers faster than
2379 * the disks can service them.
2380 * We limit the number of "in flight" writes a vnode can have to
2381 * avoid this.
2382 */
2383static int
2384bawrite_internal(buf_t bp, int throttle)
2385{
2386 vnode_t vp = bp->b_vp;
2387
2388 if (vp) {
2389 if (throttle)
2390 /*
2391 * If the vnode has "too many" write operations in progress
2392 * wait for them to finish the IO
2393 */
2394 (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2395 else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2396 /*
2397 * return to the caller and
2398 * let him decide what to do
2399 */
2400 return (EWOULDBLOCK);
2401 }
2402 SET(bp->b_flags, B_ASYNC);
2403
2404 return (VNOP_BWRITE(bp));
2405}
2406
2407errno_t
2408buf_bawrite(buf_t bp)
2409{
2410 return (bawrite_internal(bp, 1));
2411}
2412
2413
2414
2415static void
2416buf_free_meta_store(buf_t bp)
2417{
2418 if (bp->b_bufsize) {
2419 if (ISSET(bp->b_flags, B_ZALLOC)) {
2420 zone_t z;
2421
2422 z = getbufzone(bp->b_bufsize);
2423 zfree(z, (void *)bp->b_datap);
2424 } else
2425 kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2426
2427 bp->b_datap = (uintptr_t)NULL;
2428 bp->b_bufsize = 0;
2429 }
2430}
2431
2432
2433static buf_t
2434buf_brelse_shadow(buf_t bp)
2435{
2436 buf_t bp_head;
2437 buf_t bp_temp;
2438 buf_t bp_return = NULL;
2439#ifdef BUF_MAKE_PRIVATE
2440 buf_t bp_data;
2441 int data_ref = 0;
2442#endif
2443 int need_wakeup = 0;
2444
2445 lck_mtx_lock_spin(buf_mtxp);
2446
2447 bp_head = (buf_t)bp->b_orig;
2448
2449 if (bp_head->b_whichq != -1)
2450 panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2451
2452#ifdef BUF_MAKE_PRIVATE
2453 if (bp_data = bp->b_data_store) {
2454 bp_data->b_data_ref--;
2455 /*
2456 * snapshot the ref count so that we can check it
2457 * outside of the lock... we only want the guy going
2458 * from 1 -> 0 to try and release the storage
2459 */
2460 data_ref = bp_data->b_data_ref;
2461 }
2462#endif
2463 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2464
2465 bp_head->b_shadow_ref--;
2466
2467 for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2468
2469 if (bp_temp == NULL)
2470 panic("buf_brelse_shadow: bp not on list %p", bp_head);
2471
2472 bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2473
2474#ifdef BUF_MAKE_PRIVATE
2475 /*
2476 * we're about to free the current 'owner' of the data buffer and
2477 * there is at least one other shadow buf_t still pointing at it
2478 * so transfer it to the first shadow buf left in the chain
2479 */
2480 if (bp == bp_data && data_ref) {
2481 if ((bp_data = bp_head->b_shadow) == NULL)
2482 panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2483
2484 for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2485 bp_temp->b_data_store = bp_data;
2486 bp_data->b_data_ref = data_ref;
2487 }
2488#endif
2489 if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
2490 panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp);
2491 if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
2492 panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp);
2493
2494 if (bp_head->b_shadow_ref == 0) {
2495 if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2496
2497 CLR(bp_head->b_flags, B_AGE);
2498 bp_head->b_timestamp = buf_timestamp();
2499
2500 if (ISSET(bp_head->b_flags, B_LOCKED)) {
2501 bp_head->b_whichq = BQ_LOCKED;
2502 binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2503 } else {
2504 bp_head->b_whichq = BQ_META;
2505 binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2506 }
2507 } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2508 CLR(bp_head->b_lflags, BL_WAITSHADOW);
2509
2510 bp_return = bp_head;
2511 }
2512 if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2513 CLR(bp_head->b_lflags, BL_WANTED_REF);
2514 need_wakeup = 1;
2515 }
2516 }
2517 lck_mtx_unlock(buf_mtxp);
2518
2519 if (need_wakeup)
2520 wakeup(bp_head);
2521
2522#ifdef BUF_MAKE_PRIVATE
2523 if (bp == bp_data && data_ref == 0)
2524 buf_free_meta_store(bp);
2525
2526 bp->b_data_store = NULL;
2527#endif
2528 KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2529
2530 return (bp_return);
2531}
2532
2533
2534/*
2535 * Release a buffer on to the free lists.
2536 * Described in Bach (p. 46).
2537 */
2538void
2539buf_brelse(buf_t bp)
2540{
2541 struct bqueues *bufq;
2542 long whichq;
2543 upl_t upl;
2544 int need_wakeup = 0;
2545 int need_bp_wakeup = 0;
2546
2547
2548 if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2549 panic("buf_brelse: bad buffer = %p\n", bp);
2550
2551#ifdef JOE_DEBUG
2552 (void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2553
2554 bp->b_lastbrelse = current_thread();
2555 bp->b_tag = 0;
2556#endif
2557 if (bp->b_lflags & BL_IOBUF) {
2558 buf_t shadow_master_bp = NULL;
2559
2560 if (ISSET(bp->b_lflags, BL_SHADOW))
2561 shadow_master_bp = buf_brelse_shadow(bp);
2562 else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2563 buf_free_meta_store(bp);
2564 free_io_buf(bp);
2565
2566 if (shadow_master_bp) {
2567 bp = shadow_master_bp;
2568 goto finish_shadow_master;
2569 }
2570 return;
2571 }
2572
2573 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2574 bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2575 bp->b_flags, 0);
2576
2577 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2578
2579 /*
2580 * if we're invalidating a buffer that has the B_FILTER bit
2581 * set then call the b_iodone function so it gets cleaned
2582 * up properly.
2583 *
2584 * the HFS journal code depends on this
2585 */
2586 if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2587 if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */
2588 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
2589 void *arg = bp->b_transaction;
2590
2591 CLR(bp->b_flags, B_FILTER); /* but note callout done */
2592 bp->b_iodone = NULL;
2593 bp->b_transaction = NULL;
2594
2595 if (iodone_func == NULL) {
2596 panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2597 }
2598 (*iodone_func)(bp, arg);
2599 }
2600 }
2601 /*
2602 * I/O is done. Cleanup the UPL state
2603 */
2604 upl = bp->b_upl;
2605
2606 if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2607 kern_return_t kret;
2608 int upl_flags;
2609
2610 if (upl == NULL) {
2611 if ( !ISSET(bp->b_flags, B_INVAL)) {
2612 kret = ubc_create_upl(bp->b_vp,
2613 ubc_blktooff(bp->b_vp, bp->b_lblkno),
2614 bp->b_bufsize,
2615 &upl,
2616 NULL,
2617 UPL_PRECIOUS);
2618
2619 if (kret != KERN_SUCCESS)
2620 panic("brelse: Failed to create UPL");
2621#if UPL_DEBUG
2622 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2623#endif /* UPL_DEBUG */
2624 }
2625 } else {
2626 if (bp->b_datap) {
2627 kret = ubc_upl_unmap(upl);
2628
2629 if (kret != KERN_SUCCESS)
2630 panic("ubc_upl_unmap failed");
2631 bp->b_datap = (uintptr_t)NULL;
2632 }
2633 }
2634 if (upl) {
2635 if (bp->b_flags & (B_ERROR | B_INVAL)) {
2636 if (bp->b_flags & (B_READ | B_INVAL))
2637 upl_flags = UPL_ABORT_DUMP_PAGES;
2638 else
2639 upl_flags = 0;
2640
2641 ubc_upl_abort(upl, upl_flags);
2642 } else {
2643 if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2644 upl_flags = UPL_COMMIT_SET_DIRTY ;
2645 else
2646 upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2647
2648 ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2649 UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2650 }
2651 bp->b_upl = NULL;
2652 }
2653 } else {
2654 if ( (upl) )
2655 panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2656 }
2657
2658 /*
2659 * If it's locked, don't report an error; try again later.
2660 */
2661 if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2662 CLR(bp->b_flags, B_ERROR);
2663 /*
2664 * If it's not cacheable, or an error, mark it invalid.
2665 */
2666 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2667 SET(bp->b_flags, B_INVAL);
2668
2669 if ((bp->b_bufsize <= 0) ||
2670 ISSET(bp->b_flags, B_INVAL) ||
2671 (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2672
2673 boolean_t delayed_buf_free_meta_store = FALSE;
2674
2675 /*
2676 * If it's invalid or empty, dissociate it from its vnode,
2677 * release its storage if B_META, clean it up a bit,
2678 * and put it on the EMPTY queue
2679 */
2680 if (ISSET(bp->b_flags, B_DELWRI))
2681 OSAddAtomicLong(-1, &nbdwrite);
2682
2683 if (ISSET(bp->b_flags, B_META)) {
2684 if (bp->b_shadow_ref)
2685 delayed_buf_free_meta_store = TRUE;
2686 else
2687 buf_free_meta_store(bp);
2688 }
2689 /*
2690 * nuke any credentials we were holding
2691 */
2692 buf_release_credentials(bp);
2693
2694 lck_mtx_lock_spin(buf_mtxp);
2695
2696 if (bp->b_shadow_ref) {
2697 SET(bp->b_lflags, BL_WAITSHADOW);
2698
2699 lck_mtx_unlock(buf_mtxp);
2700
2701 return;
2702 }
2703 if (delayed_buf_free_meta_store == TRUE) {
2704
2705 lck_mtx_unlock(buf_mtxp);
2706finish_shadow_master:
2707 buf_free_meta_store(bp);
2708
2709 lck_mtx_lock_spin(buf_mtxp);
2710 }
2711 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2712
2713 if (bp->b_vp)
2714 brelvp_locked(bp);
2715
2716 bremhash(bp);
2717 BLISTNONE(bp);
2718 binshash(bp, &invalhash);
2719
2720 bp->b_whichq = BQ_EMPTY;
2721 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2722 } else {
2723
2724 /*
2725 * It has valid data. Put it on the end of the appropriate
2726 * queue, so that it'll stick around for as long as possible.
2727 */
2728 if (ISSET(bp->b_flags, B_LOCKED))
2729 whichq = BQ_LOCKED; /* locked in core */
2730 else if (ISSET(bp->b_flags, B_META))
2731 whichq = BQ_META; /* meta-data */
2732 else if (ISSET(bp->b_flags, B_AGE))
2733 whichq = BQ_AGE; /* stale but valid data */
2734 else
2735 whichq = BQ_LRU; /* valid data */
2736 bufq = &bufqueues[whichq];
2737
2738 bp->b_timestamp = buf_timestamp();
2739
2740 lck_mtx_lock_spin(buf_mtxp);
2741
2742 /*
2743 * the buf_brelse_shadow routine doesn't take 'ownership'
2744 * of the parent buf_t... it updates state that is protected by
2745 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2746 * put the buf_t back on a free list. b_shadow_ref is protected
2747 * by the lock, and since we have not yet cleared B_BUSY, we need
2748 * to check it while holding the lock to insure that one of us
2749 * puts this buf_t back on a free list when it is safe to do so
2750 */
2751 if (bp->b_shadow_ref == 0) {
2752 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2753 bp->b_whichq = whichq;
2754 binstailfree(bp, bufq, whichq);
2755 } else {
2756 /*
2757 * there are still cloned buf_t's pointing
2758 * at this guy... need to keep it off the
2759 * freelists until a buf_brelse is done on
2760 * the last clone
2761 */
2762 CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2763 }
2764 }
2765 if (needbuffer) {
2766 /*
2767 * needbuffer is a global
2768 * we're currently using buf_mtxp to protect it
2769 * delay doing the actual wakeup until after
2770 * we drop buf_mtxp
2771 */
2772 needbuffer = 0;
2773 need_wakeup = 1;
2774 }
2775 if (ISSET(bp->b_lflags, BL_WANTED)) {
2776 /*
2777 * delay the actual wakeup until after we
2778 * clear BL_BUSY and we've dropped buf_mtxp
2779 */
2780 need_bp_wakeup = 1;
2781 }
2782 /*
2783 * Unlock the buffer.
2784 */
2785 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2786 buf_busycount--;
2787
2788 lck_mtx_unlock(buf_mtxp);
2789
2790 if (need_wakeup) {
2791 /*
2792 * Wake up any processes waiting for any buffer to become free.
2793 */
2794 wakeup(&needbuffer);
2795 }
2796 if (need_bp_wakeup) {
2797 /*
2798 * Wake up any processes waiting for _this_ buffer to become free.
2799 */
2800 wakeup(bp);
2801 }
2802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2803 bp, bp->b_datap, bp->b_flags, 0, 0);
2804}
2805
2806/*
2807 * Determine if a block is in the cache.
2808 * Just look on what would be its hash chain. If it's there, return
2809 * a pointer to it, unless it's marked invalid. If it's marked invalid,
2810 * we normally don't return the buffer, unless the caller explicitly
2811 * wants us to.
2812 */
2813static boolean_t
2814incore(vnode_t vp, daddr64_t blkno)
2815{
2816 boolean_t retval;
2817 struct bufhashhdr *dp;
2818
2819 dp = BUFHASH(vp, blkno);
2820
2821 lck_mtx_lock_spin(buf_mtxp);
2822
2823 if (incore_locked(vp, blkno, dp))
2824 retval = TRUE;
2825 else
2826 retval = FALSE;
2827 lck_mtx_unlock(buf_mtxp);
2828
2829 return (retval);
2830}
2831
2832
2833static buf_t
2834incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2835{
2836 struct buf *bp;
2837
2838 /* Search hash chain */
2839 for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2840 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2841 !ISSET(bp->b_flags, B_INVAL)) {
2842 return (bp);
2843 }
2844 }
2845 return (NULL);
2846}
2847
2848
2849void
2850buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2851{
2852 buf_t bp;
2853 struct bufhashhdr *dp;
2854
2855 dp = BUFHASH(vp, blkno);
2856
2857 lck_mtx_lock_spin(buf_mtxp);
2858
2859 for (;;) {
2860 if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2861 break;
2862
2863 if (bp->b_shadow_ref == 0)
2864 break;
2865
2866 SET(bp->b_lflags, BL_WANTED_REF);
2867
2868 (void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2869 }
2870 lck_mtx_unlock(buf_mtxp);
2871}
2872
2873/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2874/*
2875 * Get a block of requested size that is associated with
2876 * a given vnode and block offset. If it is found in the
2877 * block cache, mark it as having been found, make it busy
2878 * and return it. Otherwise, return an empty block of the
2879 * correct size. It is up to the caller to ensure that the
2880 * cached blocks are of the correct size.
2881 */
2882buf_t
2883buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2884{
2885 buf_t bp;
2886 int err;
2887 upl_t upl;
2888 upl_page_info_t *pl;
2889 kern_return_t kret;
2890 int ret_only_valid;
2891 struct timespec ts;
2892 int upl_flags;
2893 struct bufhashhdr *dp;
2894
2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2896 (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2897
2898 ret_only_valid = operation & BLK_ONLYVALID;
2899 operation &= ~BLK_ONLYVALID;
2900 dp = BUFHASH(vp, blkno);
2901start:
2902 lck_mtx_lock_spin(buf_mtxp);
2903
2904 if ((bp = incore_locked(vp, blkno, dp))) {
2905 /*
2906 * Found in the Buffer Cache
2907 */
2908 if (ISSET(bp->b_lflags, BL_BUSY)) {
2909 /*
2910 * but is busy
2911 */
2912 switch (operation) {
2913 case BLK_READ:
2914 case BLK_WRITE:
2915 case BLK_META:
2916 SET(bp->b_lflags, BL_WANTED);
2917 bufstats.bufs_busyincore++;
2918
2919 /*
2920 * don't retake the mutex after being awakened...
2921 * the time out is in msecs
2922 */
2923 ts.tv_sec = (slptimeo/1000);
2924 ts.tv_nsec = (slptimeo % 1000) * 10 * NSEC_PER_USEC * 1000;
2925
2926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2927 (uintptr_t)blkno, size, operation, 0, 0);
2928
2929 err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2930
2931 /*
2932 * Callers who call with PCATCH or timeout are
2933 * willing to deal with the NULL pointer
2934 */
2935 if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2936 return (NULL);
2937 goto start;
2938 /*NOTREACHED*/
2939 break;
2940
2941 default:
2942 /*
2943 * unknown operation requested
2944 */
2945 panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2946 /*NOTREACHED*/
2947 break;
2948 }
2949 } else {
2950 /*
2951 * buffer in core and not busy
2952 */
2953 SET(bp->b_lflags, BL_BUSY);
2954 SET(bp->b_flags, B_CACHE);
2955 buf_busycount++;
2956
2957 bremfree_locked(bp);
2958 bufstats.bufs_incore++;
2959
2960 lck_mtx_unlock(buf_mtxp);
2961#ifdef JOE_DEBUG
2962 bp->b_owner = current_thread();
2963 bp->b_tag = 1;
2964#endif
2965 if ( (bp->b_upl) )
2966 panic("buffer has UPL, but not marked BUSY: %p", bp);
2967
2968 if ( !ret_only_valid && bp->b_bufsize != size)
2969 allocbuf(bp, size);
2970
2971 upl_flags = 0;
2972 switch (operation) {
2973 case BLK_WRITE:
2974 /*
2975 * "write" operation: let the UPL subsystem
2976 * know that we intend to modify the buffer
2977 * cache pages we're gathering.
2978 */
2979 upl_flags |= UPL_WILL_MODIFY;
2980 case BLK_READ:
2981 upl_flags |= UPL_PRECIOUS;
2982 if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2983 kret = ubc_create_upl(vp,
2984 ubc_blktooff(vp, bp->b_lblkno),
2985 bp->b_bufsize,
2986 &upl,
2987 &pl,
2988 upl_flags);
2989 if (kret != KERN_SUCCESS)
2990 panic("Failed to create UPL");
2991
2992 bp->b_upl = upl;
2993
2994 if (upl_valid_page(pl, 0)) {
2995 if (upl_dirty_page(pl, 0))
2996 SET(bp->b_flags, B_WASDIRTY);
2997 else
2998 CLR(bp->b_flags, B_WASDIRTY);
2999 } else
3000 CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3001
3002 kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3003
3004 if (kret != KERN_SUCCESS)
3005 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3006 }
3007 break;
3008
3009 case BLK_META:
3010 /*
3011 * VM is not involved in I/O for the meta data buffer;
3012 * it already has valid data
3013 */
3014 break;
3015
3016 default:
3017 panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3018 /*NOTREACHED*/
3019 break;
3020 }
3021 }
3022 } else { /* not incore() */
3023 int queue = BQ_EMPTY; /* Start with no preference */
3024
3025 if (ret_only_valid) {
3026 lck_mtx_unlock(buf_mtxp);
3027 return (NULL);
3028 }
3029 if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
3030 operation = BLK_META;
3031
3032 if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
3033 goto start;
3034
3035 /*
3036 * getnewbuf may block for a number of different reasons...
3037 * if it does, it's then possible for someone else to
3038 * create a buffer for the same block and insert it into
3039 * the hash... if we see it incore at this point we dump
3040 * the buffer we were working on and start over
3041 */
3042 if (incore_locked(vp, blkno, dp)) {
3043 SET(bp->b_flags, B_INVAL);
3044 binshash(bp, &invalhash);
3045
3046 lck_mtx_unlock(buf_mtxp);
3047
3048 buf_brelse(bp);
3049 goto start;
3050 }
3051 /*
3052 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3053 * CALLED! BE CAREFUL.
3054 */
3055
3056 /*
3057 * mark the buffer as B_META if indicated
3058 * so that when the buffer is released it will go to the META queue
3059 */
3060 if (operation == BLK_META)
3061 SET(bp->b_flags, B_META);
3062
3063 bp->b_blkno = bp->b_lblkno = blkno;
3064 bp->b_vp = vp;
3065
3066 /*
3067 * Insert in the hash so that incore() can find it
3068 */
3069 binshash(bp, BUFHASH(vp, blkno));
3070
3071 bgetvp_locked(vp, bp);
3072
3073 lck_mtx_unlock(buf_mtxp);
3074
3075 allocbuf(bp, size);
3076
3077 upl_flags = 0;
3078 switch (operation) {
3079 case BLK_META:
3080 /*
3081 * buffer data is invalid...
3082 *
3083 * I don't want to have to retake buf_mtxp,
3084 * so the miss and vmhits counters are done
3085 * with Atomic updates... all other counters
3086 * in bufstats are protected with either
3087 * buf_mtxp or iobuffer_mtxp
3088 */
3089 OSAddAtomicLong(1, &bufstats.bufs_miss);
3090 break;
3091
3092 case BLK_WRITE:
3093 /*
3094 * "write" operation: let the UPL subsystem know
3095 * that we intend to modify the buffer cache pages
3096 * we're gathering.
3097 */
3098 upl_flags |= UPL_WILL_MODIFY;
3099 case BLK_READ:
3100 { off_t f_offset;
3101 size_t contig_bytes;
3102 int bmap_flags;
3103
3104 if ( (bp->b_upl) )
3105 panic("bp already has UPL: %p",bp);
3106
3107 f_offset = ubc_blktooff(vp, blkno);
3108
3109 upl_flags |= UPL_PRECIOUS;
3110 kret = ubc_create_upl(vp,
3111 f_offset,
3112 bp->b_bufsize,
3113 &upl,
3114 &pl,
3115 upl_flags);
3116
3117 if (kret != KERN_SUCCESS)
3118 panic("Failed to create UPL");
3119#if UPL_DEBUG
3120 upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3121#endif /* UPL_DEBUG */
3122 bp->b_upl = upl;
3123
3124 if (upl_valid_page(pl, 0)) {
3125
3126 if (operation == BLK_READ)
3127 bmap_flags = VNODE_READ;
3128 else
3129 bmap_flags = VNODE_WRITE;
3130
3131 SET(bp->b_flags, B_CACHE | B_DONE);
3132
3133 OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3134
3135 bp->b_validoff = 0;
3136 bp->b_dirtyoff = 0;
3137
3138 if (upl_dirty_page(pl, 0)) {
3139 /* page is dirty */
3140 SET(bp->b_flags, B_WASDIRTY);
3141
3142 bp->b_validend = bp->b_bcount;
3143 bp->b_dirtyend = bp->b_bcount;
3144 } else {
3145 /* page is clean */
3146 bp->b_validend = bp->b_bcount;
3147 bp->b_dirtyend = 0;
3148 }
3149 /*
3150 * try to recreate the physical block number associated with
3151 * this buffer...
3152 */
3153 if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3154 panic("getblk: VNOP_BLOCKMAP failed");
3155 /*
3156 * if the extent represented by this buffer
3157 * is not completely physically contiguous on
3158 * disk, then we can't cache the physical mapping
3159 * in the buffer header
3160 */
3161 if ((long)contig_bytes < bp->b_bcount)
3162 bp->b_blkno = bp->b_lblkno;
3163 } else {
3164 OSAddAtomicLong(1, &bufstats.bufs_miss);
3165 }
3166 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3167
3168 if (kret != KERN_SUCCESS)
3169 panic("getblk: ubc_upl_map() failed with (%d)", kret);
3170 break;
3171 }
3172 default:
3173 panic("getblk: paging or unknown operation - %x", operation);
3174 /*NOTREACHED*/
3175 break;
3176 }
3177 }
3178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3179 bp, bp->b_datap, bp->b_flags, 3, 0);
3180
3181#ifdef JOE_DEBUG
3182 (void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3183#endif
3184 return (bp);
3185}
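
/*
 * Illustrative sketch (the helper name is an assumption): BLK_ONLYVALID
 * turns buf_getblk(), above, into a pure cache lookup -- when the block is
 * not already incore it returns NULL instead of allocating a new buffer.
 */
#if 0	/* example only */
static buf_t
example_lookup_cached_block(vnode_t vp, daddr64_t blkno, int size)
{
	/* returns the buffer busy if it is cached and valid, NULL otherwise */
	return (buf_getblk(vp, blkno, size, 0, 0, BLK_META | BLK_ONLYVALID));
}
#endif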
3186
3187/*
3188 * Get an empty, disassociated buffer of given size.
3189 */
3190buf_t
3191buf_geteblk(int size)
3192{
3193 buf_t bp = NULL;
3194 int queue = BQ_EMPTY;
3195
3196 do {
3197 lck_mtx_lock_spin(buf_mtxp);
3198
3199 bp = getnewbuf(0, 0, &queue);
3200 } while (bp == NULL);
3201
3202 SET(bp->b_flags, (B_META|B_INVAL));
3203
3204#if DIAGNOSTIC
3205 assert(queue == BQ_EMPTY);
3206#endif /* DIAGNOSTIC */
3207 /* XXX need to implement logic to deal with other queues */
3208
3209 binshash(bp, &invalhash);
3210 bufstats.bufs_eblk++;
3211
3212 lck_mtx_unlock(buf_mtxp);
3213
3214 allocbuf(bp, size);
3215
3216 return (bp);
3217}
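
/*
 * Illustrative sketch (the 8192-byte size is an assumption): buf_geteblk(),
 * above, never fails; it hands back a private scratch buffer that is not
 * associated with any vnode and is released with buf_brelse().
 */
#if 0	/* example only */
static void
example_scratch_buffer(void)
{
	buf_t bp = buf_geteblk(8192);

	/* use buf_dataptr(bp) as temporary kernel storage ... */

	buf_brelse(bp);		/* B_INVAL is set, so it returns to the empty pool */
}
#endif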
3218
3219uint32_t
3220buf_redundancy_flags(buf_t bp)
3221{
3222 return bp->b_redundancy_flags;
3223}
3224
3225void
3226buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3227{
3228 SET(bp->b_redundancy_flags, flags);
3229}
3230
3231void
3232buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3233{
3234 CLR(bp->b_redundancy_flags, flags);
3235}
3236
3237/*
3238 * With UBC, there is no need to expand / shrink the file data
3239 * buffer. The VM uses the same pages, hence no waste.
3240 * All the file data buffers can have one size.
3241 * In fact expand / shrink would be an expensive operation.
3242 *
3243 * Only exception to this is meta-data buffers. Most of the
3244 * meta data operations are smaller than PAGE_SIZE. Having the
3245 * meta-data buffers grow and shrink as needed, optimizes use
3246 * of the kernel wired memory.
3247 */
3248
3249int
3250allocbuf(buf_t bp, int size)
3251{
3252 vm_size_t desired_size;
3253
3254 desired_size = roundup(size, CLBYTES);
3255
3256 if (desired_size < PAGE_SIZE)
3257 desired_size = PAGE_SIZE;
3258 if (desired_size > MAXBSIZE)
3259 panic("allocbuf: buffer larger than MAXBSIZE requested");
3260
3261 if (ISSET(bp->b_flags, B_META)) {
3262 zone_t zprev, z;
3263 int nsize = roundup(size, MINMETA);
3264
3265 if (bp->b_datap) {
3266 vm_offset_t elem = (vm_offset_t)bp->b_datap;
3267
3268 if (ISSET(bp->b_flags, B_ZALLOC)) {
3269 if (bp->b_bufsize < nsize) {
3270 /* reallocate to a bigger size */
3271
3272 zprev = getbufzone(bp->b_bufsize);
3273 if (nsize <= MAXMETA) {
3274 desired_size = nsize;
3275 z = getbufzone(nsize);
3276 /* b_datap not really a ptr */
3277 *(void **)(&bp->b_datap) = zalloc(z);
3278 } else {
3279 bp->b_datap = (uintptr_t)NULL;
3280 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3281 CLR(bp->b_flags, B_ZALLOC);
3282 }
3283 bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3284 zfree(zprev, (void *)elem);
3285 } else {
3286 desired_size = bp->b_bufsize;
3287 }
3288
3289 } else {
3290 if ((vm_size_t)bp->b_bufsize < desired_size) {
3291 /* reallocate to a bigger size */
3292 bp->b_datap = (uintptr_t)NULL;
3293 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3294 bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3295 kmem_free(kernel_map, elem, bp->b_bufsize);
3296 } else {
3297 desired_size = bp->b_bufsize;
3298 }
3299 }
3300 } else {
3301 /* new allocation */
3302 if (nsize <= MAXMETA) {
3303 desired_size = nsize;
3304 z = getbufzone(nsize);
3305 /* b_datap not really a ptr */
3306 *(void **)(&bp->b_datap) = zalloc(z);
3307 SET(bp->b_flags, B_ZALLOC);
3308 } else
3309 kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3310 }
3311
3312 if (bp->b_datap == 0)
3313 panic("allocbuf: NULL b_datap");
3314 }
3315 bp->b_bufsize = desired_size;
3316 bp->b_bcount = size;
3317
3318 return (0);
3319}
3320
3321/*
3322 * Get a new buffer from one of the free lists.
3323 *
3324 * A request for a queue is passed in. The queue from which the buffer
3325 * was taken is returned. Out-of-range queue requests get BQ_EMPTY. A request
3326 * for BQUEUE means no preference; use heuristics in that case.
3327 * The heuristics are as follows:
3328 * Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
3329 * If none are available, block until one is made available.
3330 * If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
3331 * Pick the most stale buffer.
3332 * If the found buffer was marked for delayed write, start the async write
3333 * and restart the search.
3334 * Initialize the fields and disassociate the buffer from the vnode.
3335 * Remove the buffer from the hash. Return the buffer and the queue
3336 * on which it was found.
3337 *
3338 * buf_mtxp is held upon entry
3339 * returns with buf_mtxp locked if new buf available
3340 * returns with buf_mtxp UNlocked if new buf NOT available
3341 */
3342
3343static buf_t
3344getnewbuf(int slpflag, int slptimeo, int * queue)
3345{
3346 buf_t bp;
3347 buf_t lru_bp;
3348 buf_t age_bp;
3349 buf_t meta_bp;
3350 int age_time, lru_time, bp_time, meta_time;
3351 int req = *queue; /* save it for restarts */
3352 struct timespec ts;
3353
3354start:
3355 /*
3356 * invalid request gets empty queue
3357 */
3358 if ((*queue >= BQUEUES) || (*queue < 0)
3359 || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
3360 *queue = BQ_EMPTY;
3361
3362
3363 if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3364 goto found;
3365
3366 /*
3367 * need to grow the number of bufs; add another one rather than recycling
3368 */
3369 if (nbuf_headers < max_nbuf_headers) {
3370 /*
3371 * Increment count now as lock
3372 * is dropped for allocation.
3373 * That avoids over commits
3374 */
3375 nbuf_headers++;
3376 goto add_newbufs;
3377 }
3378 /* Try for the requested queue first */
3379 bp = bufqueues[*queue].tqh_first;
3380 if (bp)
3381 goto found;
3382
3383 /* Unable to use requested queue */
3384 age_bp = bufqueues[BQ_AGE].tqh_first;
3385 lru_bp = bufqueues[BQ_LRU].tqh_first;
3386 meta_bp = bufqueues[BQ_META].tqh_first;
3387
3388 if (!age_bp && !lru_bp && !meta_bp) {
3389 /*
3390 * Unavailable on AGE or LRU or META queues
3391 * Try the empty list first
3392 */
3393 bp = bufqueues[BQ_EMPTY].tqh_first;
3394 if (bp) {
3395 *queue = BQ_EMPTY;
3396 goto found;
3397 }
3398 /*
3399 * We have seen that this is hard to trigger.
3400 * This is an overcommit of nbufs but needed
3401 * in some scenarios with disk images.
3402 */
3403
3404add_newbufs:
3405 lck_mtx_unlock(buf_mtxp);
3406
3407 /* Create a new temporary buffer header */
3408 bp = (struct buf *)zalloc(buf_hdr_zone);
3409
3410 if (bp) {
3411 bufhdrinit(bp);
3412 bp->b_whichq = BQ_EMPTY;
3413 bp->b_timestamp = buf_timestamp();
3414 BLISTNONE(bp);
3415 SET(bp->b_flags, B_HDRALLOC);
3416 *queue = BQ_EMPTY;
3417 }
3418 lck_mtx_lock_spin(buf_mtxp);
3419
3420 if (bp) {
3421 binshash(bp, &invalhash);
3422 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3423 buf_hdr_count++;
3424 goto found;
3425 }
3426 /* subtract already accounted bufcount */
3427 nbuf_headers--;
3428
3429 bufstats.bufs_sleeps++;
3430
3431 /* wait for a free buffer of any kind */
3432 needbuffer = 1;
3433 /* hz value is 100 */
3434 ts.tv_sec = (slptimeo/1000);
3435 /* the hz value is 100; which leads to 10ms */
3436 ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3437
3438 msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
3439 return (NULL);
3440 }
3441
3442 /* Buffer available either on AGE or LRU or META */
3443 bp = NULL;
3444 *queue = -1;
3445
3446 /* Buffer available either on AGE or LRU */
3447 if (!age_bp) {
3448 bp = lru_bp;
3449 *queue = BQ_LRU;
3450 } else if (!lru_bp) {
3451 bp = age_bp;
3452 *queue = BQ_AGE;
3453 } else { /* buffer available on both AGE and LRU */
3454 int t = buf_timestamp();
3455
3456 age_time = t - age_bp->b_timestamp;
3457 lru_time = t - lru_bp->b_timestamp;
3458 if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3459 bp = age_bp;
3460 *queue = BQ_AGE;
3461 /*
3462 * we should probably re-timestamp everything in the
3463 * queues at this point with the current time
3464 */
3465 } else {
3466 if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3467 bp = lru_bp;
3468 *queue = BQ_LRU;
3469 } else {
3470 bp = age_bp;
3471 *queue = BQ_AGE;
3472 }
3473 }
3474 }
3475
3476 if (!bp) { /* Neither on AGE nor on LRU */
3477 bp = meta_bp;
3478 *queue = BQ_META;
3479 } else if (meta_bp) {
3480 int t = buf_timestamp();
3481
3482 bp_time = t - bp->b_timestamp;
3483 meta_time = t - meta_bp->b_timestamp;
3484
3485 if (!(bp_time < 0) && !(meta_time < 0)) {
3486 /* time not set backwards */
3487 int bp_is_stale;
3488 bp_is_stale = (*queue == BQ_LRU) ?
3489 lru_is_stale : age_is_stale;
3490
3491 if ((meta_time >= meta_is_stale) &&
3492 (bp_time < bp_is_stale)) {
3493 bp = meta_bp;
3494 *queue = BQ_META;
3495 }
3496 }
3497 }
3498found:
3499 if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
3500 panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3501
3502 /* Clean it */
3503 if (bcleanbuf(bp, FALSE)) {
3504 /*
3505 * moved to the laundry thread, buffer not ready
3506 */
3507 *queue = req;
3508 goto start;
3509 }
3510 return (bp);
3511}
3512
3513
3514/*
3515 * Clean a buffer.
3516 * Returns 0 if buffer is ready to use,
3517 * Returns 1 if it issued a buf_bawrite() to indicate
3518 * that the buffer is not ready.
3519 *
3520 * buf_mtxp is held upon entry
3521 * returns with buf_mtxp locked
3522 */
3523int
3524bcleanbuf(buf_t bp, boolean_t discard)
3525{
3526 /* Remove from the queue */
3527 bremfree_locked(bp);
3528
3529#ifdef JOE_DEBUG
3530 bp->b_owner = current_thread();
3531 bp->b_tag = 2;
3532#endif
3533 /*
3534 * If buffer was a delayed write, start the IO by queuing
3535 * it on the LAUNDRY queue, and return 1
3536 */
3537 if (ISSET(bp->b_flags, B_DELWRI)) {
3538 if (discard) {
3539 SET(bp->b_lflags, BL_WANTDEALLOC);
3540 }
3541
3542 bmovelaundry(bp);
3543
3544 lck_mtx_unlock(buf_mtxp);
3545
3546 wakeup(&bufqueues[BQ_LAUNDRY]);
3547 /*
3548 * and give it a chance to run
3549 */
3550 (void)thread_block(THREAD_CONTINUE_NULL);
3551
3552 lck_mtx_lock_spin(buf_mtxp);
3553
3554 return (1);
3555 }
3556#ifdef JOE_DEBUG
3557 bp->b_owner = current_thread();
3558 bp->b_tag = 8;
3559#endif
3560 /*
3561 * Buffer is no longer on any free list... we own it
3562 */
3563 SET(bp->b_lflags, BL_BUSY);
3564 buf_busycount++;
3565
3566 bremhash(bp);
3567
3568 /*
3569 * disassociate us from our vnode, if we had one...
3570 */
3571 if (bp->b_vp)
3572 brelvp_locked(bp);
3573
3574 lck_mtx_unlock(buf_mtxp);
3575
3576 BLISTNONE(bp);
3577
3578 if (ISSET(bp->b_flags, B_META))
3579 buf_free_meta_store(bp);
3580
3581 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3582
3583 buf_release_credentials(bp);
3584
3585 /* If discarding, just move to the empty queue */
3586 if (discard) {
3587 lck_mtx_lock_spin(buf_mtxp);
3588 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3589 bp->b_whichq = BQ_EMPTY;
3590 binshash(bp, &invalhash);
3591 binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3592 CLR(bp->b_lflags, BL_BUSY);
3593 buf_busycount--;
3594 } else {
3595 /* Not discarding: clean up and prepare for reuse */
3596 bp->b_bufsize = 0;
3597 bp->b_datap = (uintptr_t)NULL;
3598 bp->b_upl = (void *)NULL;
3599 /*
3600 * preserve the state of whether this buffer
3601 * was allocated on the fly or not...
3602 * the only other flag that should be set at
3603 * this point is BL_BUSY...
3604 */
3605#ifdef JOE_DEBUG
3606 bp->b_owner = current_thread();
3607 bp->b_tag = 3;
3608#endif
3609 bp->b_lflags = BL_BUSY;
3610 bp->b_flags = (bp->b_flags & B_HDRALLOC);
3611 bp->b_dev = NODEV;
3612 bp->b_blkno = bp->b_lblkno = 0;
3613 bp->b_iodone = NULL;
3614 bp->b_error = 0;
3615 bp->b_resid = 0;
3616 bp->b_bcount = 0;
3617 bp->b_dirtyoff = bp->b_dirtyend = 0;
3618 bp->b_validoff = bp->b_validend = 0;
3619 bzero(&bp->b_attr, sizeof(struct bufattr));
3620
3621 lck_mtx_lock_spin(buf_mtxp);
3622 }
3623 return (0);
3624}
3625
3626
3627
3628errno_t
3629buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3630{
3631 buf_t bp;
3632 errno_t error;
3633 struct bufhashhdr *dp;
3634
3635 dp = BUFHASH(vp, lblkno);
3636
3637relook:
3638 lck_mtx_lock_spin(buf_mtxp);
3639
3640 if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3641 lck_mtx_unlock(buf_mtxp);
3642 return (0);
3643 }
3644 if (ISSET(bp->b_lflags, BL_BUSY)) {
3645 if ( !ISSET(flags, BUF_WAIT)) {
3646 lck_mtx_unlock(buf_mtxp);
3647 return (EBUSY);
3648 }
3649 SET(bp->b_lflags, BL_WANTED);
3650
3651 error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3652
3653 if (error) {
3654 return (error);
3655 }
3656 goto relook;
3657 }
3658 bremfree_locked(bp);
3659 SET(bp->b_lflags, BL_BUSY);
3660 SET(bp->b_flags, B_INVAL);
3661 buf_busycount++;
3662#ifdef JOE_DEBUG
3663 bp->b_owner = current_thread();
3664 bp->b_tag = 4;
3665#endif
3666 lck_mtx_unlock(buf_mtxp);
3667 buf_brelse(bp);
3668
3669 return (0);
3670}
3671
3672
3673void
3674buf_drop(buf_t bp)
3675{
3676 int need_wakeup = 0;
3677
3678 lck_mtx_lock_spin(buf_mtxp);
3679
3680 if (ISSET(bp->b_lflags, BL_WANTED)) {
3681 /*
3682 * delay the actual wakeup until after we
3683 * clear BL_BUSY and we've dropped buf_mtxp
3684 */
3685 need_wakeup = 1;
3686 }
3687#ifdef JOE_DEBUG
3688 bp->b_owner = current_thread();
3689 bp->b_tag = 9;
3690#endif
3691 /*
3692 * Unlock the buffer.
3693 */
3694 CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3695 buf_busycount--;
3696
3697 lck_mtx_unlock(buf_mtxp);
3698
3699 if (need_wakeup) {
3700 /*
3701 * Wake up any processes waiting for _this_ buffer to become free.
3702 */
3703 wakeup(bp);
3704 }
3705}
3706
3707
3708errno_t
3709buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3710 errno_t error;
3711
3712 lck_mtx_lock_spin(buf_mtxp);
3713
3714 error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3715
3716 lck_mtx_unlock(buf_mtxp);
3717
3718 return (error);
3719}
3720
3721
3722static errno_t
3723buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3724{
3725 errno_t error;
3726 struct timespec ts;
3727
3728 if (ISSET(bp->b_flags, B_LOCKED)) {
3729 if ((flags & BAC_SKIP_LOCKED))
3730 return (EDEADLK);
3731 } else {
3732 if ((flags & BAC_SKIP_NONLOCKED))
3733 return (EDEADLK);
3734 }
3735 if (ISSET(bp->b_lflags, BL_BUSY)) {
3736 /*
3737 * since the lck_mtx_lock may block, the buffer
3738 * may become BUSY, so we need to
3739 * recheck for a NOWAIT request
3740 */
3741 if (flags & BAC_NOWAIT)
3742 return (EBUSY);
3743 SET(bp->b_lflags, BL_WANTED);
3744
3745 /* the hz value is 100; which leads to 10ms */
3746 ts.tv_sec = (slptimeo/100);
3747 ts.tv_nsec = (slptimeo % 100) * 10 * NSEC_PER_USEC * 1000;
3748 error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3749
3750 if (error)
3751 return (error);
3752 return (EAGAIN);
3753 }
3754 if (flags & BAC_REMOVE)
3755 bremfree_locked(bp);
3756 SET(bp->b_lflags, BL_BUSY);
3757 buf_busycount++;
3758
3759#ifdef JOE_DEBUG
3760 bp->b_owner = current_thread();
3761 bp->b_tag = 5;
3762#endif
3763 return (0);
3764}
3765
3766
3767/*
3768 * Wait for operations on the buffer to complete.
3769 * When they do, extract and return the I/O's error value.
3770 */
3771errno_t
3772buf_biowait(buf_t bp)
3773{
3774 while (!ISSET(bp->b_flags, B_DONE)) {
3775
3776 lck_mtx_lock_spin(buf_mtxp);
3777
3778 if (!ISSET(bp->b_flags, B_DONE)) {
3779 DTRACE_IO1(wait__start, buf_t, bp);
3780 (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3781 DTRACE_IO1(wait__done, buf_t, bp);
3782 } else
3783 lck_mtx_unlock(buf_mtxp);
3784 }
3785 /* check for interruption of I/O (e.g. via NFS), then errors. */
3786 if (ISSET(bp->b_flags, B_EINTR)) {
3787 CLR(bp->b_flags, B_EINTR);
3788 return (EINTR);
3789 } else if (ISSET(bp->b_flags, B_ERROR))
3790 return (bp->b_error ? bp->b_error : EIO);
3791 else
3792 return (0);
3793}
3794
3795
3796/*
3797 * Mark I/O complete on a buffer.
3798 *
3799 * If a callback has been requested, e.g. the pageout
3800 * daemon, do so. Otherwise, awaken waiting processes.
3801 *
3802 * [ Leffler, et al., says on p.247:
3803 * "This routine wakes up the blocked process, frees the buffer
3804 * for an asynchronous write, or, for a request by the pagedaemon
3805 * process, invokes a procedure specified in the buffer structure" ]
3806 *
3807 * In real life, the pagedaemon (or other system processes) wants
3808 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
3809 * (for swap pager, that puts swap buffers on the free lists (!!!),
3810 * for the vn device, that puts malloc'd buffers on the free lists!)
3811 */
3812
3813void
3814buf_biodone(buf_t bp)
3815{
3816 mount_t mp;
3817 struct bufattr *bap;
3818
3819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3820 bp, bp->b_datap, bp->b_flags, 0, 0);
3821
3822 if (ISSET(bp->b_flags, B_DONE))
3823 panic("biodone already");
3824
3825 if (ISSET(bp->b_flags, B_ERROR)) {
3826 fslog_io_error(bp);
3827 }
3828
3829 bap = &bp->b_attr;
3830
3831 if (bp->b_vp && bp->b_vp->v_mount) {
3832 mp = bp->b_vp->v_mount;
3833 } else {
3834 mp = NULL;
3835 }
3836
3837 if (mp && (bp->b_flags & B_READ) == 0) {
3838 update_last_io_time(mp);
3839 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3840 } else if (mp) {
3841 INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
3842 }
3843
3844 if (kdebug_enable) {
3845 int code = DKIO_DONE;
3846 int io_tier = GET_BUFATTR_IO_TIER(bap);
3847
3848 if (bp->b_flags & B_READ)
3849 code |= DKIO_READ;
3850 if (bp->b_flags & B_ASYNC)
3851 code |= DKIO_ASYNC;
3852
3853 if (bp->b_flags & B_META)
3854 code |= DKIO_META;
3855 else if (bp->b_flags & B_PAGEIO)
3856 code |= DKIO_PAGING;
3857
3858 if (io_tier != 0)
3859 code |= DKIO_THROTTLE;
3860
3861 code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
3862
3863 if (bp->b_flags & B_PASSIVE)
3864 code |= DKIO_PASSIVE;
3865
3866 if (bap->ba_flags & BA_NOCACHE)
3867 code |= DKIO_NOCACHE;
3868
3869 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3870 buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
3871 }
3872
3873 /*
3874 * I/O was done, so don't believe
3875 * the DIRTY state from VM anymore...
3876 * and we need to reset the THROTTLED/PASSIVE
3877 * indicators
3878 */
3879 CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
3880 CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
3881
3882 SET_BUFATTR_IO_TIER(bap, 0);
3883
3884 DTRACE_IO1(done, buf_t, bp);
3885
3886 if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3887 /*
3888 * wake up any writers blocked
3889 * on throttle or waiting for I/O
3890 * to drain
3891 */
3892 vnode_writedone(bp->b_vp);
3893
3894 if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */
3895 void (*iodone_func)(struct buf *, void *) = bp->b_iodone;
3896 void *arg = bp->b_transaction;
3897 int callout = ISSET(bp->b_flags, B_CALL);
3898
3899 if (iodone_func == NULL)
3900 panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
3901
3902 CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */
3903 bp->b_iodone = NULL;
3904 bp->b_transaction = NULL;
3905
3906 if (callout)
3907 SET(bp->b_flags, B_DONE); /* note that it's done */
3908
3909 (*iodone_func)(bp, arg);
3910
3911 if (callout) {
3912 /*
3913 * assumes that the callback function takes
3914 * ownership of the bp and deals with releasing it if necessary
3915 */
3916 goto biodone_done;
3917 }
3918 /*
3919 * in this case the callback function is acting
3920 * strictly as a filter... it does not take
3921 * ownership of the bp and is expecting us
3922 * to finish cleaning up... this is currently used
3923 * by the HFS journaling code
3924 */
3925 }
3926 if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
3927 SET(bp->b_flags, B_DONE); /* note that it's done */
3928
3929 buf_brelse(bp);
3930 } else { /* or just wakeup the buffer */
3931 /*
3932 * by taking the mutex, we serialize
3933 * the buf owner calling buf_biowait so that we'll
3934 * only see him in one of 2 states...
3935 * state 1: B_DONE wasn't set and he's
3936 * blocked in msleep
3937 * state 2: he's blocked trying to take the
3938 * mutex before looking at B_DONE
3939 * BL_WANTED is cleared in case anyone else
3940 * is blocked waiting for the buffer... note
3941 * that we haven't cleared B_BUSY yet, so if
3942 * they do get to run, they're going to re-set
3943 * BL_WANTED and go back to sleep
3944 */
3945 lck_mtx_lock_spin(buf_mtxp);
3946
3947 CLR(bp->b_lflags, BL_WANTED);
3948 SET(bp->b_flags, B_DONE); /* note that it's done */
3949
3950 lck_mtx_unlock(buf_mtxp);
3951
3952 wakeup(bp);
3953 }
3954biodone_done:
3955 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3956 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
3957}
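
/*
 * Illustrative sketch (the function names are assumptions, and the issue
 * path is simplified): instead of blocking in buf_biowait(), an async
 * caller can attach a completion callback with buf_setcallback();
 * buf_biodone(), above, then invokes it with B_CALL set, and the callback
 * takes ownership of the buffer.
 */
#if 0	/* example only */
static void
example_io_done(buf_t bp, void *arg)
{
	/* the callout owns bp: check for errors, then release it */
	if (buf_error(bp))
		printf("example_io_done: I/O failed (%d)\n", buf_error(bp));
	buf_brelse(bp);
}

static void
example_issue_async(buf_t bp)
{
	buf_setcallback(bp, example_io_done, NULL);

	/*
	 * issue the I/O (simplified; a real caller sets up the buffer fully
	 * before going through the strategy path)... completion arrives via
	 * buf_biodone()
	 */
	VNOP_STRATEGY(bp);
}
#endif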
3958
3959/*
3960 * Obfuscate buf pointers.
3961 */
3962vm_offset_t
3963buf_kernel_addrperm_addr(void * addr)
3964{
3965 if ((vm_offset_t)addr == 0)
3966 return 0;
3967 else
3968 return ((vm_offset_t)addr + buf_kernel_addrperm);
3969}
3970
3971/*
3972 * Return a count of buffers on the "locked" queue.
3973 */
3974int
3975count_lock_queue(void)
3976{
3977 buf_t bp;
3978 int n = 0;
3979
3980 lck_mtx_lock_spin(buf_mtxp);
3981
3982 for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3983 bp = bp->b_freelist.tqe_next)
3984 n++;
3985 lck_mtx_unlock(buf_mtxp);
3986
3987 return (n);
3988}
3989
3990/*
3991 * Return a count of 'busy' buffers. Used at the time of shutdown.
3992 * note: This is also called from the mach side in debug context in kdp.c
3993 */
3994int
3995count_busy_buffers(void)
3996{
3997 return buf_busycount + bufstats.bufs_iobufinuse;
3998}
3999
4000#if DIAGNOSTIC
4001/*
4002 * Print out statistics on the current allocation of the buffer pool.
4003 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4004 * in vfs_syscalls.c using sysctl.
4005 */
4006void
4007vfs_bufstats()
4008{
4009 int i, j, count;
4010 struct buf *bp;
4011 struct bqueues *dp;
4012 int counts[MAXBSIZE/CLBYTES+1];
4013 static char *bname[BQUEUES] =
4014 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4015
4016 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4017 count = 0;
4018 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4019 counts[j] = 0;
4020
4021 lck_mtx_lock(buf_mtxp);
4022
4023 for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4024 counts[bp->b_bufsize/CLBYTES]++;
4025 count++;
4026 }
4027 lck_mtx_unlock(buf_mtxp);
4028
4029 printf("%s: total-%d", bname[i], count);
4030 for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4031 if (counts[j] != 0)
4032 printf(", %d-%d", j * CLBYTES, counts[j]);
4033 printf("\n");
4034 }
4035}
4036#endif /* DIAGNOSTIC */
4037
4038#define NRESERVEDIOBUFS 128
4039
4040
4041buf_t
4042alloc_io_buf(vnode_t vp, int priv)
4043{
4044 buf_t bp;
4045
4046 lck_mtx_lock_spin(iobuffer_mtxp);
4047
4048 while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4049 (bp = iobufqueue.tqh_first) == NULL) {
4050 bufstats.bufs_iobufsleeps++;
4051
4052 need_iobuffer = 1;
4053 (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
4054 }
4055 TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4056
4057 bufstats.bufs_iobufinuse++;
4058 if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4059 bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4060
4061 lck_mtx_unlock(iobuffer_mtxp);
4062
4063 /*
4064 * initialize various fields
4065 * we don't need to hold the mutex since the buffer
4066 * is now private... the vp should have a reference
4067 * on it and is not protected by this mutex in any event
4068 */
4069 bp->b_timestamp = 0;
4070 bp->b_proc = NULL;
4071
4072 bp->b_datap = 0;
4073 bp->b_flags = 0;
4074 bp->b_lflags = BL_BUSY | BL_IOBUF;
4075 bp->b_redundancy_flags = 0;
4076 bp->b_blkno = bp->b_lblkno = 0;
4077#ifdef JOE_DEBUG
4078 bp->b_owner = current_thread();
4079 bp->b_tag = 6;
4080#endif
4081 bp->b_iodone = NULL;
4082 bp->b_error = 0;
4083 bp->b_resid = 0;
4084 bp->b_bcount = 0;
4085 bp->b_bufsize = 0;
4086 bp->b_upl = NULL;
4087 bp->b_vp = vp;
4088 bzero(&bp->b_attr, sizeof(struct bufattr));
4089
4090 if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4091 bp->b_dev = vp->v_rdev;
4092 else
4093 bp->b_dev = NODEV;
4094
4095 return (bp);
4096}
4097
4098
4099void
4100free_io_buf(buf_t bp)
4101{
4102 int need_wakeup = 0;
4103
4104 /*
4105 * put buffer back on the head of the iobufqueue
4106 */
4107 bp->b_vp = NULL;
4108 bp->b_flags = B_INVAL;
4109
4110 lck_mtx_lock_spin(iobuffer_mtxp);
4111
4112 binsheadfree(bp, &iobufqueue, -1);
4113
4114 if (need_iobuffer) {
4115 /*
4116 * Wake up any processes waiting because they need an io buffer
4117 *
4118 * do the wakeup after we drop the mutex... it's possible that the
4119 * wakeup will be superfluous if need_iobuffer gets set again and
4120 * another thread runs this path, but it's highly unlikely, doesn't
4121 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4122 * trying to grab a task related lock...
4123 */
4124 need_iobuffer = 0;
4125 need_wakeup = 1;
4126 }
4127 if (bufstats.bufs_iobufinuse <= 0)
4128 panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse < 0", bp);
4129
4130 bufstats.bufs_iobufinuse--;
4131
4132 lck_mtx_unlock(iobuffer_mtxp);
4133
4134 if (need_wakeup)
4135 wakeup(&need_iobuffer);
4136}
4137
4138
4139void
4140buf_list_lock(void)
4141{
4142 lck_mtx_lock_spin(buf_mtxp);
4143}
4144
4145void
4146buf_list_unlock(void)
4147{
4148 lck_mtx_unlock(buf_mtxp);
4149}
4150
4151/*
4152 * If getnewbuf() calls bcleanbuf() on the same thread
4153 * there is a potential for stack overrun and deadlocks.
4154 * So we always hand off the work to a worker thread for completion
4155 */
4156
4157
4158static void
4159bcleanbuf_thread_init(void)
4160{
4161 thread_t thread = THREAD_NULL;
4162
4163 /* create worker thread */
4164 kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4165 thread_deallocate(thread);
4166}
4167
4168typedef int (*bcleanbufcontinuation)(int);
4169
4170static void
4171bcleanbuf_thread(void)
4172{
4173 struct buf *bp;
4174 int error = 0;
4175 int loopcnt = 0;
4176
4177 for (;;) {
4178 lck_mtx_lock_spin(buf_mtxp);
4179
4180 while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4181 (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4182 }
4183
4184 /*
4185 * Remove from the queue
4186 */
4187 bremfree_locked(bp);
4188
4189 /*
4190 * Buffer is no longer on any free list
4191 */
4192 SET(bp->b_lflags, BL_BUSY);
4193 buf_busycount++;
4194
4195#ifdef JOE_DEBUG
4196 bp->b_owner = current_thread();
4197 bp->b_tag = 10;
4198#endif
4199
4200 lck_mtx_unlock(buf_mtxp);
4201 /*
4202 * do the IO
4203 */
4204 error = bawrite_internal(bp, 0);
4205
4206 if (error) {
4207 bp->b_whichq = BQ_LAUNDRY;
4208 bp->b_timestamp = buf_timestamp();
4209
4210 lck_mtx_lock_spin(buf_mtxp);
4211
4212 binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4213 blaundrycnt++;
4214
4215 /* we never leave a busy page on the laundry queue */
4216 CLR(bp->b_lflags, BL_BUSY);
4217 buf_busycount--;
4218#ifdef JOE_DEBUG
4219 bp->b_owner = current_thread();
4220 bp->b_tag = 11;
4221#endif
4222
4223 lck_mtx_unlock(buf_mtxp);
4224
4225 if (loopcnt > MAXLAUNDRY) {
4226 /*
4227 * bawrite_internal() can return errors if we're throttled. If we've
4228 * done several I/Os and failed, give the system some time to unthrottle
4229 * the vnode
4230 */
4231 (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4232 loopcnt = 0;
4233 } else {
4234 /* give other threads a chance to run */
4235 (void)thread_block(THREAD_CONTINUE_NULL);
4236 loopcnt++;
4237 }
4238 }
4239 }
4240}
4241
4242
4243static int
4244brecover_data(buf_t bp)
4245{
4246 int upl_offset;
4247 upl_t upl;
4248 upl_page_info_t *pl;
4249 kern_return_t kret;
4250 vnode_t vp = bp->b_vp;
4251 int upl_flags;
4252
4253
4254 if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4255 goto dump_buffer;
4256
4257 upl_flags = UPL_PRECIOUS;
4258 if (! (buf_flags(bp) & B_READ)) {
4259 /*
4260 * "write" operation: let the UPL subsystem know
4261 * that we intend to modify the buffer cache pages we're
4262 * gathering.
4263 */
4264 upl_flags |= UPL_WILL_MODIFY;
4265 }
4266
4267 kret = ubc_create_upl(vp,
4268 ubc_blktooff(vp, bp->b_lblkno),
4269 bp->b_bufsize,
4270 &upl,
4271 &pl,
4272 upl_flags);
4273 if (kret != KERN_SUCCESS)
4274 panic("Failed to create UPL");
4275
4276 for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4277
4278 if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4279 ubc_upl_abort(upl, 0);
4280 goto dump_buffer;
4281 }
4282 }
4283 bp->b_upl = upl;
4284
4285 kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4286
4287 if (kret != KERN_SUCCESS)
4288 panic("getblk: ubc_upl_map() failed with (%d)", kret);
4289 return (1);
4290
4291dump_buffer:
4292 bp->b_bufsize = 0;
4293 SET(bp->b_flags, B_INVAL);
4294 buf_brelse(bp);
4295
4296 return(0);
4297}
4298
4299boolean_t
4300buffer_cache_gc(int all)
4301{
4302 buf_t bp;
4303 boolean_t did_large_zfree = FALSE;
4304 boolean_t need_wakeup = FALSE;
4305 int now = buf_timestamp();
4306 uint32_t found = 0;
4307 struct bqueues privq;
4308 int thresh_hold = BUF_STALE_THRESHHOLD;
4309
4310 if (all)
4311 thresh_hold = 0;
4312 /*
4313 * We only care about metadata (incore storage comes from zalloc()).
4314 * Unless "all" is set (used to evict meta data buffers in preparation
4315 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
4316 * that have not been accessed in the last 30s. This limit controls both
4317 * the hold time of the global lock "buf_mtxp" and the length of time
4318 * we spend compute bound in the GC thread which calls this function
4319 */
4320 lck_mtx_lock(buf_mtxp);
4321
4322 do {
4323 found = 0;
4324 TAILQ_INIT(&privq);
4325 need_wakeup = FALSE;
4326
4327 while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4328 (now > bp->b_timestamp) &&
4329 (now - bp->b_timestamp > thresh_hold) &&
4330 (found < BUF_MAX_GC_BATCH_SIZE)) {
4331
4332 /* Remove from free list */
4333 bremfree_locked(bp);
4334 found++;
4335
4336#ifdef JOE_DEBUG
4337 bp->b_owner = current_thread();
4338 bp->b_tag = 12;
4339#endif
4340
4341 /* If dirty, move to laundry queue and remember to do wakeup */
4342 if (ISSET(bp->b_flags, B_DELWRI)) {
4343 SET(bp->b_lflags, BL_WANTDEALLOC);
4344
4345 bmovelaundry(bp);
4346 need_wakeup = TRUE;
4347
4348 continue;
4349 }
4350
4351 /*
4352 * Mark busy and put on private list. We could technically get
4353 * away without setting BL_BUSY here.
4354 */
4355 SET(bp->b_lflags, BL_BUSY);
4356 buf_busycount++;
4357
4358 /*
4359 * Remove from hash and dissociate from vp.
4360 */
4361 bremhash(bp);
4362 if (bp->b_vp) {
4363 brelvp_locked(bp);
4364 }
4365
4366 TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4367 }
4368
4369 if (found == 0) {
4370 break;
4371 }
4372
4373 /* Drop lock for batch processing */
4374 lck_mtx_unlock(buf_mtxp);
4375
4376 /* Wakeup and yield for laundry if need be */
4377 if (need_wakeup) {
4378 wakeup(&bufqueues[BQ_LAUNDRY]);
4379 (void)thread_block(THREAD_CONTINUE_NULL);
4380 }
4381
4382 /* Clean up every buffer on private list */
4383 TAILQ_FOREACH(bp, &privq, b_freelist) {
4384 /* Take note if we've definitely freed at least a page to a zone */
4385 if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4386 did_large_zfree = TRUE;
4387 }
4388
4389 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4390
4391 /* Free Storage */
4392 buf_free_meta_store(bp);
4393
4394 /* Release credentials */
4395 buf_release_credentials(bp);
4396
4397 /* Prepare for moving to empty queue */
4398 CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4399 | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4400 bp->b_whichq = BQ_EMPTY;
4401 BLISTNONE(bp);
4402 }
4403 lck_mtx_lock(buf_mtxp);
4404
4405 /* Back under lock, move them all to invalid hash and clear busy */
4406 TAILQ_FOREACH(bp, &privq, b_freelist) {
4407 binshash(bp, &invalhash);
4408 CLR(bp->b_lflags, BL_BUSY);
4409 buf_busycount--;
4410
4411#ifdef JOE_DEBUG
4412 if (bp->b_owner != current_thread()) {
4413 panic("Buffer stolen from buffer_cache_gc()");
4414 }
4415 bp->b_owner = current_thread();
4416 bp->b_tag = 13;
4417#endif
4418 }
4419
4420 /* And do a big bulk move to the empty queue */
4421 TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4422
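	/*
	 * When "all" is set, keep looping as long as each pass drains a
	 * full batch; otherwise a single batch per call is enough.
	 */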
4423 } while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4424
4425 lck_mtx_unlock(buf_mtxp);
4426
4427 return did_large_zfree;
4428}
4429
4430
4431/*
4432 * disabled for now
4433 */
4434
4435#if FLUSH_QUEUES
4436
4437#define NFLUSH 32
4438
4439static int
4440bp_cmp(void *a, void *b)
4441{
4442 buf_t *bp_a = *(buf_t **)a,
4443 *bp_b = *(buf_t **)b;
4444 daddr64_t res;
4445
4446 // block numbers are never negative, but the difference of two
4447 // daddr64_t values can overflow an int, so compute the result
4448 // with explicit comparisons instead of a truncating cast
4449 res = (bp_a->b_blkno - bp_b->b_blkno);
4450
4451 return (res < 0) ? -1 : ((res > 0) ? 1 : 0);
4452}
4453
4454
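/*
 * Flush, in batches of at most NFLUSH, the delayed-write buffers on
 * 'whichq' that belong to mount 'mp'.  Each batch is sorted by block
 * number (bp_cmp) before the buffers are written asynchronously with
 * buf_bawrite().  Returns the total number of writes issued.
 */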
4455int
4456bflushq(int whichq, mount_t mp)
4457{
4458 buf_t bp, next;
4459 int i, buf_count;
4460 int total_writes = 0;
4461 static buf_t flush_table[NFLUSH];
4462
4463 if (whichq < 0 || whichq >= BQUEUES) {
4464 return (0);
4465 }
4466
4467 restart:
4468 lck_mtx_lock(buf_mtxp);
4469
4470 bp = TAILQ_FIRST(&bufqueues[whichq]);
4471
4472 for (buf_count = 0; bp; bp = next) {
4473 next = bp->b_freelist.tqe_next;
4474
4475 if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4476 continue;
4477 }
4478
4479 if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4480
4481 bremfree_locked(bp);
4482#ifdef JOE_DEBUG
4483 bp->b_owner = current_thread();
4484 bp->b_tag = 7;
4485#endif
4486 SET(bp->b_lflags, BL_BUSY);
4487 buf_busycount++;
4488
4489 flush_table[buf_count] = bp;
4490 buf_count++;
4491 total_writes++;
4492
4493 if (buf_count >= NFLUSH) {
4494 lck_mtx_unlock(buf_mtxp);
4495
4496 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4497
4498 for (i = 0; i < buf_count; i++) {
4499 buf_bawrite(flush_table[i]);
4500 }
4501 goto restart;
4502 }
4503 }
4504 }
4505 lck_mtx_unlock(buf_mtxp);
4506
4507 if (buf_count > 0) {
4508 qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4509
4510 for (i = 0; i < buf_count; i++) {
4511 buf_bawrite(flush_table[i]);
4512 }
4513 }
4514
4515 return (total_writes);
4516}
4517#endif
4518
4519
4520#if BALANCE_QUEUES
4521
4522/* XXX move this to a separate file */
4523
4524/*
4525 * NOTE: THIS CODE HAS NOT BEEN UPDATED
4526 * WITH RESPECT TO THE NEW LOCKING MODEL
4527 */
4528
4529
4530/*
4531 * Dynamic Scaling of the Buffer Queues
4532 */
4533
4534typedef long long blsize_t;
4535
4536blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
4537/* Global tunable limits */
4538blsize_t nbufh; /* number of buffer headers */
4539blsize_t nbuflow; /* minimum number of buffer headers required */
4540blsize_t nbufhigh; /* maximum number of buffer headers allowed */
4541blsize_t nbuftarget; /* preferred number of buffer headers */
4542
4543/*
4544 * assertions:
4545 *
4546 * 1. 0 < nbuflow <= nbufh <= nbufhigh
4547 * 2. nbufhigh <= MAXNBUF
4548 * 3. 0 < nbuflow <= nbuftarget <= nbufhigh
4549 * 4. nbufh cannot be set by sysctl().
4550 */
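/*
 * A minimal, illustrative sketch (not part of the original code) of how
 * the global-limit assertions above could be checked on DIAGNOSTIC
 * builds.  The helper name "bufq_check_global_limits" is hypothetical.
 */
#if DIAGNOSTIC
static __inline__ void
bufq_check_global_limits(void)
{
	/* 1. 0 < nbuflow <= nbufh <= nbufhigh */
	if (!(0 < nbuflow && nbuflow <= nbufh && nbufh <= nbufhigh))
		panic("bufq limits: nbuflow/nbufh/nbufhigh out of order");
	/* 2. nbufhigh <= MAXNBUF */
	if (nbufhigh > MAXNBUF)
		panic("bufq limits: nbufhigh exceeds MAXNBUF");
	/* 3. 0 < nbuflow <= nbuftarget <= nbufhigh */
	if (!(0 < nbuflow && nbuflow <= nbuftarget && nbuftarget <= nbufhigh))
		panic("bufq limits: nbuftarget out of range");
}
#endif /* DIAGNOSTIC */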
4551
4552/* Per queue tunable limits */
4553
4554struct bufqlim {
4555 blsize_t bl_nlow; /* minimum number of buffer headers required */
4556 blsize_t bl_num; /* number of buffer headers on the queue */
4557 blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */
4558 blsize_t bl_target; /* preferred number of buffer headers */
4559 long bl_stale; /* Seconds after which a buffer is considered stale */
4560} bufqlim[BQUEUES];
4561
4562/*
4563 * assertions:
4564 *
4565 * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh
4566 * 2. bl_nlhigh <= MAXNBUF
4567 * 3. bufqlim[BQ_META].bl_nlow != 0
4568 * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent
4569 * file system IO operations)
4570 * 5. bl_num cannot be set by sysctl().
4571 * 6. bl_nlhigh <= nbufhigh
4572 */
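/*
 * Likewise, a hypothetical (not original) DIAGNOSTIC-only walk over
 * bufqlim[] checking the per-queue assertions that can be expressed
 * directly in code.
 */
#if DIAGNOSTIC
static __inline__ void
bufq_check_queue_limits(void)
{
	int i;

	for (i = 0; i < BQUEUES; i++) {
		/* 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh */
		if (bufqlim[i].bl_nlow < 0 ||
		    bufqlim[i].bl_nlow > bufqlim[i].bl_num ||
		    bufqlim[i].bl_num > bufqlim[i].bl_nlhigh)
			panic("bufqlim[%d]: nlow/num/nlhigh out of order", i);
		/* 2. bl_nlhigh <= MAXNBUF */
		if (bufqlim[i].bl_nlhigh > MAXNBUF)
			panic("bufqlim[%d]: nlhigh exceeds MAXNBUF", i);
	}
}
#endif /* DIAGNOSTIC */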
4573
4574/*
4575 * Rationale:
4576 * ----------
4577 * Defining blsize_t as a long would permit 2^31 buffer headers per queue,
4578 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
4579 *
4580 * These limits are exported by means of sysctl().
4581 * It was decided to define blsize_t as a 64-bit quantity instead,
4582 * which ensures that it will not need to change
4583 * as long as the kernel address space does not exceed 64 bits.
4584 *
4585 * The low and high limits are initialized at compile time, and boot
4586 * arguments can be used to override them. sysctl()
4587 * does not change those values. sysctl() can read all of the values
4588 * but can set only the target. num is the current level.
4589 *
4590 * Advantages of having a "bufqscan" thread do the balancing:
4591 * Keep enough bufs on BQ_EMPTY.
4592 * getnewbuf() by default will always select a buffer from BQ_EMPTY.
4593 * getnewbuf() performs best if a buffer is found there.
4594 * This also minimizes the possibility of starting I/O
4595 * from getnewbuf(). That's a performance win, too.
4596 *
4597 * Localize complex logic [balancing as well as time aging]
4598 * to balancebufq().
4599 *
4600 * Simplify getnewbuf() logic by eliminating the time-aging code.
4601 */
4602
4603/*
4604 * Algorithm:
4605 * -----------
4606 * The goal of the dynamic scaling of the buffer queues is to keep
4607 * the size of the LRU close to bl_target. Buffers on a queue would
4608 * be time aged.
4609 *
4610 * There would be a thread responsible for "balancing"
4611 * the buffer cache queues.
4612 *
4613 * The scan order would be: AGE, LRU, META, EMPTY.
4614 */
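/*
 * That scan order is encoded in the order[] table consumed by nextbufq()
 * below; BQ_LOCKED and BQ_LAUNDRY are never balanced (balancebufq()
 * rejects them).
 */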
4615
4616long bufqscanwait = 0;
4617
4618static void bufqscan_thread();
4619static int balancebufq(int q);
4620static int btrimempty(int n);
4621static __inline__ int initbufqscan(void);
4622static __inline__ int nextbufq(int q);
4623static void buqlimprt(int all);
4624
4625
4626static __inline__ void
4627bufqinc(int q)
4628{
4629 if ((q < 0) || (q >= BQUEUES))
4630 return;
4631
4632 bufqlim[q].bl_num++;
4633 return;
4634}
4635
4636static __inline__ void
4637bufqdec(int q)
4638{
4639 if ((q < 0) || (q >= BQUEUES))
4640 return;
4641
4642 bufqlim[q].bl_num--;
4643 return;
4644}
4645
4646static void
4647bufq_balance_thread_init(void)
4648{
4649 thread_t thread = THREAD_NULL;
4650
4651 if (bufqscanwait++ == 0) {
4652
4653 /* Initialize globals */
4654 MAXNBUF = (sane_size / PAGE_SIZE);
4655 nbufh = nbuf_headers;
4656 nbuflow = min(nbufh, 100);
4657 nbufhigh = min(MAXNBUF, max(nbufh, 2048));
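		/*
		 * Target roughly 1/32 of physical memory worth of buffer
		 * headers, clamped to the [nbuflow, nbufhigh] range.
		 */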
4658 nbuftarget = (sane_size >> 5) / PAGE_SIZE;
4659 nbuftarget = max(nbuflow, nbuftarget);
4660 nbuftarget = min(nbufhigh, nbuftarget);
4661
4662 /*
4663 * Initialize the bufqlim
4664 */
4665
4666 /* LOCKED queue */
4667 bufqlim[BQ_LOCKED].bl_nlow = 0;
4668 bufqlim[BQ_LOCKED].bl_nlhigh = 32;
4669 bufqlim[BQ_LOCKED].bl_target = 0;
4670 bufqlim[BQ_LOCKED].bl_stale = 30;
4671
4672 /* LRU queue */
4673 bufqlim[BQ_LRU].bl_nlow = 0;
4674 bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
4675 bufqlim[BQ_LRU].bl_target = nbuftarget/4;
4676 bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
4677
4678 /* AGE queue */
4679 bufqlim[BQ_AGE].bl_nlow = 0;
4680 bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
4681 bufqlim[BQ_AGE].bl_target = nbuftarget/4;
4682 bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
4683
4684 /* EMPTY queue */
4685 bufqlim[BQ_EMPTY].bl_nlow = 0;
4686 bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
4687 bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
4688 bufqlim[BQ_EMPTY].bl_stale = 600000;
4689
4690 /* META queue */
4691 bufqlim[BQ_META].bl_nlow = 0;
4692 bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
4693 bufqlim[BQ_META].bl_target = nbuftarget/4;
4694 bufqlim[BQ_META].bl_stale = META_IS_STALE;
4695
4696 /* LAUNDRY queue */
4697 bufqlim[BQ_LAUNDRY].bl_nlow = 0;
4698 bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
4699 bufqlim[BQ_LAUNDRY].bl_target = 0;
4700 bufqlim[BQ_LAUNDRY].bl_stale = 30;
4701
4702 buqlimprt(1);
4703 }
4704
4705 /* create worker thread */
4706 kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
4707 thread_deallocate(thread);
4708}
4709
4710/* The workloop for the buffer balancing thread */
4711static void
4712bufqscan_thread()
4713{
4714 int moretodo = 0;
4715
4716 for(;;) {
4717 do {
4718 int q; /* buffer queue to process */
4719
4720 q = initbufqscan();
4721 for (; q; ) {
4722 moretodo |= balancebufq(q);
4723 q = nextbufq(q);
4724 }
4725 } while (moretodo);
4726
4727#if DIAGNOSTIC
4728 vfs_bufstats();
4729 buqlimprt(0);
4730#endif
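		/* nothing left to rebalance; sleep for up to 60 seconds before the next scan */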
4731 (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
4732 moretodo = 0;
4733 }
4734}
4735
4736/* Seed for the buffer queue balancing */
4737static __inline__ int
4738initbufqscan()
4739{
4740 /* Start with AGE queue */
4741 return (BQ_AGE);
4742}
4743
4744/* Pick next buffer queue to balance */
4745static __inline__ int
4746nextbufq(int q)
4747{
4748 int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
4749
4750 q++;
4751 q %= (int)(sizeof(order) / sizeof(order[0]));	/* element count, not byte count */
4752 return (order[q]);
4753}
4754
4755/* function to balance the buffer queues */
4756static int
4757balancebufq(int q)
4758{
4759 int moretodo = 0;
4760 int n, t;
4761
4762 /* reject invalid q */
4763 if ((q < 0) || (q >= BQUEUES))
4764 goto out;
4765
4766 /* LOCKED or LAUNDRY queue MUST not be balanced */
4767 if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
4768 goto out;
4769
4770 n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
4771
4772 /* If queue has less than target nothing more to do */
4773 if (n < 0)
4774 goto out;
4775
4776 if ( n > 8 ) {
4777 /* Balance only a small amount (12.5%) at a time */
4778 n >>= 3;
4779 }
4780
4781 /* EMPTY queue needs special handling */
4782 if (q == BQ_EMPTY) {
4783 moretodo |= btrimempty(n);
4784 goto out;
4785 }
4786
4787 t = buf_timestamp();
4788
4789 for (; n > 0; n--) {
4790 struct buf *bp = bufqueues[q].tqh_first;
4791 if (!bp)
4792 break;
4793
4794 /* check if it's stale */
4795 if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
4796 if (bcleanbuf(bp, FALSE)) {
4797 /* buf_bawrite() issued, bp not ready */
4798 moretodo = 1;
4799 } else {
4800 /* release the cleaned buffer to BQ_EMPTY */
4801 SET(bp->b_flags, B_INVAL);
4802 buf_brelse(bp);
4803 }
4804 } else
4805 break;
4806 }
4807
4808out:
4809 return (moretodo);
4810}
4811
4812static int
4813btrimempty(int n)
4814{
4815 /*
4816 * When struct buf are allocated dynamically, this would
4817 * reclaim up to 'n' struct buf from the empty queue.
4818 */
4819
4820 return (0);
4821}
4822
4823static void
4824buqlimprt(int all)
4825{
4826 int i;
4827 static const char *bname[BQUEUES] =
4828 { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4829
4830 if (all)
4831 for (i = 0; i < BQUEUES; i++) {
4832 printf("%s : ", bname[i]);
4833 printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
4834 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4835 printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
4836 printf("target = %ld, ", (long)bufqlim[i].bl_target);
4837 printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
4838 }
4839 else
4840 for (i = 0; i < BQUEUES; i++) {
4841 printf("%s : ", bname[i]);
4842 printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4843 }
4844}
4845
4846#endif
4847
4848