/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The NEXTSTEP Software License Agreement specifies the terms
 * and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/ubc.h>
#include <vm/vm_pageout.h>
#if DIAGNOSTIC
#include <kern/assert.h>
#endif /* DIAGNOSTIC */
#include <kern/task.h>
#include <kern/zalloc.h>

#include <sys/kdebug.h>
#include <machine/spl.h>

static __inline__ void bufqinc(int q);
static __inline__ void bufqdec(int q);

static int do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
		int *rasizes, int nrablks, struct ucred *cred, struct buf **bpp, int queuetype);
static struct buf *getnewbuf(int slpflag, int slptimeo, int *queue);
static int bcleanbuf(struct buf *bp);
static int brecover_data(struct buf *bp);
extern void vwakeup();

extern int niobuf;	/* The number of IO buffer headers for cluster IO */
int blaundrycnt;

/* zone allocated buffer headers */
static zone_t buf_hdr_zone;
static int buf_hdr_count;

#if TRACE
struct proc *traceproc;
int tracewhich, tracebuf[TRCSIZ];
u_int tracex;
char traceflags[TR_NFLAGS];
#endif /* TRACE */

/*
 * Definitions for the buffer hash lists.
 */
#define BUFHASH(dvp, lbn) \
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;
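
/*
 * Example (informational, added for clarity): a cache lookup hashes the
 * vnode pointer and logical block number together and then walks the
 * resulting chain; see incore(), which follows
 * BUFHASH(vp, blkno)->lh_first along the b_hash links looking for a
 * matching (b_vp, b_lblkno) pair.
 */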

/* Definitions for the buffer stats. */
struct bufstats bufstats;

/* Number of delayed write buffers */
int nbdwrite = 0;

/*
 * Insq/Remq for the buffer hash lists.
 */
#if 0
#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)
#endif /* 0 */


TAILQ_HEAD(ioqueue, buf) iobufqueue;
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
static int needbuffer;
static int need_iobuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp, whichq)	do { \
		TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
		bufqinc((whichq)); \
		(bp)->b_whichq = whichq; \
		(bp)->b_timestamp = time.tv_sec; \
	} while (0)

#define	binstailfree(bp, dp, whichq)	do { \
		TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
		bufqinc((whichq)); \
		(bp)->b_whichq = whichq; \
		(bp)->b_timestamp = time.tv_sec; \
	} while (0)

#define BHASHENTCHECK(bp) \
	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef) \
		panic("%x: b_hash.le_prev is not deadbeef", (bp));

#define BLISTNONE(bp) \
	(bp)->b_hash.le_next = (struct buf *)0; \
	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) { \
	LIST_REMOVE(bp, b_vnbufs); \
	(bp)->b_vnbufs.le_next = NOLIST; \
}

simple_lock_data_t bufhashlist_slock;	/* lock on buffer hash list */

/* maximum number of "in flight" buffer writes allowed per vnode */
#define BUFWRITE_THROTTLE	9


/*
 * Time in seconds before a buffer on a list is
 * considered stale.
 */
#define LRU_IS_STALE	120	/* default value for the LRU */
#define AGE_IS_STALE	60	/* default value for the AGE */
#define META_IS_STALE	180	/* default value for the BQ_META */

int lru_is_stale = LRU_IS_STALE;
int age_is_stale = AGE_IS_STALE;
int meta_is_stale = META_IS_STALE;

/* LIST_INSERT_HEAD() with assertions */
static __inline__ void
blistenterhead(struct bufhashhdr * head, struct buf * bp)
{
	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
	(head)->lh_first = bp;
	bp->b_hash.le_prev = &(head)->lh_first;
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("blistenterhead: le_prev is deadbeef");
}

static __inline__ void
binshash(struct buf *bp, struct bufhashhdr *dp)
{
	struct buf *nbp;

	simple_lock(&bufhashlist_slock);

#if 0
	if ((bad = incore(bp->b_vp, bp->b_lblkno)))
		panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
#endif /* 0 */

	BHASHENTCHECK(bp);

	nbp = dp->lh_first;
	for (; nbp != NULL; nbp = nbp->b_hash.le_next) {
		if (nbp == bp)
			panic("buf already in hashlist");
	}

	blistenterhead(dp, bp);
	simple_unlock(&bufhashlist_slock);
}

static __inline__ void
bremhash(struct buf *bp)
{
	simple_lock(&bufhashlist_slock);
	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("bremhash le_prev is deadbeef");
	if (bp->b_hash.le_next == bp)
		panic("bremhash: next points to self");

	if (bp->b_hash.le_next != NULL)
		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
	simple_unlock(&bufhashlist_slock);
}

/*
 * Remove a buffer from the free list it's on
 */
void
bremfree(bp)
	struct buf *bp;
{
	struct bqueues *dp = NULL;
	int whichq = -1;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
	whichq = bp->b_whichq;
	bufqdec(whichq);
	bp->b_whichq = -1;
	bp->b_timestamp = 0;
}

/*
 * Associate a buffer with a vnode.
 */
static void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp != vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 */
static void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL vp");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	register struct buflists *listheadp;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (ISSET(bp->b_flags, B_DELWRI))
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	bufinsvn(bp, listheadp);
}

static __inline__ void
bufhdrinit(struct buf *bp)
{
	bzero((char *)bp, sizeof *bp);
	bp->b_dev = NODEV;
	bp->b_rcred = NOCRED;
	bp->b_wcred = NOCRED;
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_flags = B_INVAL;

	return;
}

/*
 * Initialize buffers and hash links for buffers.
 */
__private_extern__ void
bufinit()
{
	register struct buf *bp;
	register struct bqueues *dp;
	register int i;
	int metabuf;
	long whichq;
	static void bufzoneinit();
	static void bcleanbuf_thread_init();

	/* Initialize the buffer queues ('freelists') and the hash table */
	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);

	simple_lock_init(&bufhashlist_slock);

	metabuf = nbuf/8;	/* reserved for meta buf */

	/* Initialize the buffer headers */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bufhdrinit(bp);

		/*
		 * metabuf buffer headers on the meta-data list and
		 * rest of the buffer headers on the empty list
		 */
		if (--metabuf)
			whichq = BQ_META;
		else
			whichq = BQ_EMPTY;

		BLISTNONE(bp);
		dp = &bufqueues[whichq];
		binsheadfree(bp, dp, whichq);
		binshash(bp, &invalhash);
	}

	for (; i < nbuf + niobuf; i++) {
		bp = &buf[i];
		bufhdrinit(bp);
		binsheadfree(bp, &iobufqueue, -1);
	}

	printf("using %d buffer headers and %d cluster IO buffer headers\n",
		nbuf, niobuf);

	/* Set up zones used by the buffer cache */
	bufzoneinit();

	/* start the bcleanbuf() thread */
	bcleanbuf_thread_init();

#if 0	/* notyet */
	{
	static void bufq_balance_thread_init();
	/* create a thread to do dynamic buffer queue balancing */
	bufq_balance_thread_init();
	}
#endif /* notyet */
}

static struct buf *
bio_doread(vp, blkno, size, cred, async, queuetype)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	int async;
	int queuetype;
{
	register struct buf *bp;
	struct proc *p = current_proc();

	bp = getblk(vp, blkno, size, 0, 0, queuetype);

	/*
	 * If the buffer does not have valid data, start a read.
	 * Note that if a buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		/* Start I/O for the buffer (keeping credentials). */
		SET(bp->b_flags, B_READ | async);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			/*
			 * NFS has embedded ucred.
			 * Cannot crhold() here as that causes zone corruption.
			 */
			bp->b_rcred = crdup(cred);
		}

		VOP_STRATEGY(bp);

		trace(TR_BREADMISS, pack(vp, size), blkno);

		/* Pay for the read. */
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_inblock++;	/* XXX */
	} else if (async) {
		brelse(bp);
	}

	trace(TR_BREADHIT, pack(vp, size), blkno);

	return (bp);
}
/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
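
/*
 * Usage sketch (added for illustration; the caller and its names below are
 * hypothetical and not part of the original file): a file system read path
 * pairs bread() with brelse(), checking the status returned by biowait()
 * before touching b_data. Guarded out in the style of the other inactive
 * blocks in this file.
 */
#if 0 /* usage sketch */
static int
example_read_block(struct vnode *vp, daddr_t lbn, int bsize, struct ucred *cred)
{
	struct buf *bp;
	int error;

	if ((error = bread(vp, lbn, bsize, cred, &bp))) {
		brelse(bp);		/* release even on error */
		return (error);
	}
	/* ... consume bsize bytes at bp->b_data here ... */
	brelse(bp);			/* put it back on the free lists */
	return (0);
}
#endif /* usage sketch */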

/*
 * Read a disk block. [bread() for meta-data]
 * This algorithm is described in Bach (p.54).
 */
int
meta_bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}

/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
}

/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * [breadn() for meta-data]
 */
int
meta_breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
}

/*
 * Perform the reads for breadn() and meta_breadn().
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
static int
do_breadn_for_type(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, int *rasizes,
		   int nrablks, struct ucred *cred, struct buf **bpp, int queuetype)
{
	register struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
	}

	/* The first read was started synchronously; wait until it's valid. */
	return (biowait(bp));
}

/*
 * Read with single-block read-ahead. Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{

	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}

/*
 * Block write. Described in Bach (p.56).
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, sync, wasdelayed;
	struct proc *p = current_proc();
	struct vnode *vp = bp->b_vp;

	if (bp->b_data == 0) {
		if (brecover_data(bp) == 0)
			return (0);
	}
	/* Remember buffer type, to switch on it later. */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
	if (wasdelayed) {
		nbdwrite--;
		wakeup((caddr_t)&nbdwrite);
	}

	if (!sync) {
		/*
		 * If not synchronous, pay for the I/O operation and make
		 * sure the buf is on the correct vnode queue. We have
		 * to do this now, because if we don't, the vnode may not
		 * be properly notified that its I/O has completed.
		 */
		if (wasdelayed)
			reassignbuf(bp, vp);
		else if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;	/* XXX */
	}

	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);

	/* Initiate disk write. Make sure the appropriate party is charged. */
	SET(bp->b_flags, B_WRITEINPROG);
	vp->v_numoutput++;

	VOP_STRATEGY(bp);

	if (sync) {
		/*
		 * If I/O was synchronous, wait for it to complete.
		 */
		rv = biowait(bp);

		/*
		 * Pay for the I/O operation, if it hasn't been paid for,
		 * and make sure it's on the correct vnode queue.
		 * (Asynchronous operations were paid for above.)
		 */
		if (wasdelayed)
			reassignbuf(bp, vp);
		else if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;	/* XXX */

		/* Release the buffer. */
		// XXXdbg - only if the unused bit is set
		if (!ISSET(bp->b_flags, B_NORELSE)) {
			brelse(bp);
		} else {
			CLR(bp->b_flags, B_NORELSE);
		}

		return (rv);
	} else {
		return (0);
	}
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 *
 * Note: With the ability to allocate additional buffer
 * headers, "too many" bdwrite()s can get us into a situation
 * where the kernel creates buffers faster than the disks can
 * service them. Doing a bawrite() in cases where we have
 * "too many" outstanding bdwrite()s avoids that.
 */
__private_extern__ int
bdwrite_internal(bp, return_error)
	struct buf *bp;
	int return_error;
{
	struct proc *p = current_proc();
	struct vnode *vp = bp->b_vp;

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write, and
	 *	(3) Make sure it's on its vnode's correct block list.
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		if (p && p->p_stats)
			p->p_stats->p_ru.ru_oublock++;	/* XXX */
		nbdwrite++;
		reassignbuf(bp, vp);
	}

	/* If this is a tape block, write the block now. */
	if (ISSET(bp->b_flags, B_TAPE)) {
		/* bwrite(bp); */
		VOP_BWRITE(bp);
		return (0);
	}

	/*
	 * If the vnode has "too many" write operations in progress,
	 * wait for them to finish their I/O.
	 */
	while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
		vp->v_flag |= VTHROTTLED;
		(void)tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "bdwrite", 0);
	}

	/*
	 * If we have too many delayed write buffers,
	 * more than we can "safely" handle, just fall back to
	 * doing the async write.
	 */
	if (nbdwrite < 0)
		panic("bdwrite: Negative nbdwrite");

	// can't do a bawrite() if the LOCKED bit is set because the
	// buffer is part of a transaction and can't go to disk until
	// the LOCKED bit is cleared.
	if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
		if (return_error)
			return (EAGAIN);
		else
			bawrite(bp);
		return (0);
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	SET(bp->b_flags, B_DONE);
	brelse(bp);
	return (0);
}

void
bdwrite(bp)
	struct buf *bp;
{
	(void) bdwrite_internal(bp, 0);
}
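
/*
 * Usage sketch (added for illustration; the caller and its names below are
 * hypothetical and not part of the original file): a small write that
 * partially fills a block reads the block, modifies part of it, then uses
 * bdwrite() so a subsequent nearby write can coalesce with it before the
 * disk write actually happens.
 */
#if 0 /* usage sketch */
static int
example_partial_write(struct vnode *vp, daddr_t lbn, int bsize,
		      caddr_t src, int off, int len, struct ucred *cred)
{
	struct buf *bp;
	int error;

	if ((error = bread(vp, lbn, bsize, cred, &bp))) {
		brelse(bp);
		return (error);
	}
	bcopy(src, bp->b_data + off, len);	/* modify part of the block */
	bdwrite(bp);				/* mark dirty; defer the write */
	return (0);
}
#endif /* usage sketch */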

/*
 * Asynchronous block write; just an asynchronous bwrite().
 *
 * Note: With the ability to allocate additional buffer
 * headers, "too many" bawrite()s can get us into a situation
 * where the kernel creates buffers faster than the disks can
 * service them.
 * We limit the number of "in flight" writes a vnode can have to
 * avoid this.
 */
static int
bawrite_internal(bp, throttle)
	struct buf *bp;
	int throttle;
{
	struct vnode *vp = bp->b_vp;

	if (vp) {
		/*
		 * If the vnode has "too many" write operations in progress,
		 * wait for them to finish their I/O.
		 */
		while (vp->v_numoutput >= BUFWRITE_THROTTLE) {
			if (throttle) {
				vp->v_flag |= VTHROTTLED;
				(void)tsleep((caddr_t)&vp->v_numoutput,
						PRIBIO + 1, "bawrite", 0);
			} else
				return (EWOULDBLOCK);
		}
	}

	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
	return (0);
}

void
bawrite(bp)
	struct buf *bp;
{
	(void) bawrite_internal(bp, 1);
}

/*
 * bwillwrite:
 *
 * Called prior to the locking of any vnodes when we are expecting to
 * write. We do not want to starve the buffer cache with too many
 * dirty buffers so we block here. By blocking prior to the locking
 * of any vnodes we attempt to avoid the situation where a locked vnode
 * prevents the various system daemons from flushing related buffers.
 */

void
bwillwrite(void)
{
	/* XXX To be implemented later */
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;
	long whichq;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
		     bp->b_lblkno * PAGE_SIZE, (int)bp, (int)bp->b_data,
		     bp->b_flags, 0);

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);

	// if we're invalidating a buffer that has the B_CALL bit
	// set then call the b_iodone function so it gets cleaned
	// up properly.
	//
	if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
		if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
			panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
		}
		if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
			void (*iodone_func)(struct buf *) = bp->b_iodone;

			CLR(bp->b_flags, B_CALL);	/* but note callout done */
			bp->b_iodone = NULL;

			if (iodone_func == NULL) {
				panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
			}
			(*iodone_func)(bp);
		}
	}

	/* I/O is done. Clean up the UPL state. */
	if (!ISSET(bp->b_flags, B_META)
	    && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
		kern_return_t kret;
		upl_t upl;
		int upl_flags;

		if (!ISSET(bp->b_flags, B_PAGELIST)) {
			if (!ISSET(bp->b_flags, B_INVAL)) {
				kret = ubc_create_upl(bp->b_vp,
					ubc_blktooff(bp->b_vp, bp->b_lblkno),
					bp->b_bufsize,
					&upl,
					NULL,
					UPL_PRECIOUS);
				if (kret != KERN_SUCCESS)
					panic("brelse: Failed to get pagelists");
#ifdef UBC_DEBUG
				upl_ubc_alias_set(upl, bp, 5);
#endif /* UBC_DEBUG */
			} else
				upl = (upl_t) 0;
		} else {
			upl = bp->b_pagelist;

			if (bp->b_data) {
				kret = ubc_upl_unmap(upl);

				if (kret != KERN_SUCCESS)
					panic("kernel_upl_unmap failed");
				bp->b_data = 0;
			}
		}
		if (upl) {
			if (bp->b_flags & (B_ERROR | B_INVAL)) {
				if (bp->b_flags & (B_READ | B_INVAL))
					upl_flags = UPL_ABORT_DUMP_PAGES;
				else
					upl_flags = 0;
				ubc_upl_abort(upl, upl_flags);
			} else {
				if (ISSET(bp->b_flags, B_NEEDCOMMIT))
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				else if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
					upl_flags = UPL_COMMIT_SET_DIRTY;
				else
					upl_flags = UPL_COMMIT_CLEAR_DIRTY;
				ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
					UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
			}
			s = splbio();
			CLR(bp->b_flags, B_PAGELIST);
			bp->b_pagelist = 0;
			splx(s);
		}
	} else {
		if (ISSET(bp->b_flags, B_PAGELIST))
			panic("brelse: pagelist set for non VREG; vp=%x", bp->b_vp);
	}

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	/* Block disk interrupts. */
	s = splbio();

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put on the head of the appropriate queue.
		 */
		if (bp->b_vp)
			brelvp(bp);
		if (ISSET(bp->b_flags, B_DELWRI)) {
			CLR(bp->b_flags, B_DELWRI);
			nbdwrite--;
			wakeup((caddr_t)&nbdwrite);
		}
		if (bp->b_bufsize <= 0)
			whichq = BQ_EMPTY;	/* no data */
		else if (ISSET(bp->b_flags, B_META))
			whichq = BQ_META;	/* meta-data */
		else
			whichq = BQ_AGE;	/* invalid data */

		bufq = &bufqueues[whichq];
		binsheadfree(bp, bufq, whichq);
	} else {
		/*
		 * It has valid data. Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			whichq = BQ_LOCKED;	/* locked in core */
		else if (ISSET(bp->b_flags, B_META))
			whichq = BQ_META;	/* meta-data */
		else if (ISSET(bp->b_flags, B_AGE))
			whichq = BQ_AGE;	/* stale but valid data */
		else
			whichq = BQ_LRU;	/* valid data */

		bufq = &bufqueues[whichq];
		binstailfree(bp, bufq, whichq);
	}

	/* Unlock the buffer. */
	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));

	/* Allow disk interrupts. */
	splx(s);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
		     (int)bp, (int)bp->b_data, bp->b_flags, 0, 0);
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain. If it's there, return
 * a pointer to it, unless it's marked invalid. If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;

	bp = BUFHASH(vp, blkno)->lh_first;

	/* Search hash chain */
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	}

	return (0);
}

/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
/*
 * Get a block of the requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo, operation)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo, operation;
{
	struct buf *bp;
	int s, err;
	upl_t upl;
	upl_page_info_t *pl;
	kern_return_t kret;
	int error = 0;
	int pagedirty = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
		     blkno * PAGE_SIZE, size, operation, 0, 0);
start:

	s = splbio();
	if ((bp = incore(vp, blkno))) {
		/* Found in the Buffer Cache */
		if (ISSET(bp->b_flags, B_BUSY)) {
			/* but is busy */
			switch (operation) {
			case BLK_READ:
			case BLK_WRITE:
			case BLK_META:
				SET(bp->b_flags, B_WANTED);
				bufstats.bufs_busyincore++;
				err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
					     slptimeo);
				splx(s);
				/*
				 * Callers who call with PCATCH or timeout are
				 * willing to deal with the NULL pointer
				 */
				if (err && ((slpflag & PCATCH) ||
					    ((err == EWOULDBLOCK) && slptimeo)))
					return (NULL);
				goto start;
				/*NOTREACHED*/
				break;

			case BLK_PAGEIN:
				/* pagein operation must not use getblk */
				panic("getblk: pagein for incore busy buffer");
				splx(s);
				/*NOTREACHED*/
				break;

			case BLK_PAGEOUT:
				/* pageout operation must not use getblk */
				panic("getblk: pageout for incore busy buffer");
				splx(s);
				/*NOTREACHED*/
				break;

			default:
				panic("getblk: %d unknown operation 1", operation);
				/*NOTREACHED*/
				break;
			}
		} else {
			/* not busy */
			SET(bp->b_flags, (B_BUSY | B_CACHE));
			bremfree(bp);
			bufstats.bufs_incore++;
			splx(s);

			allocbuf(bp, size);
			if (ISSET(bp->b_flags, B_PAGELIST))
				panic("pagelist buffer is not busy");

			switch (operation) {
			case BLK_READ:
			case BLK_WRITE:
				if (UBCISVALID(bp->b_vp) && bp->b_bufsize) {
					kret = ubc_create_upl(vp,
						ubc_blktooff(vp, bp->b_lblkno),
						bp->b_bufsize,
						&upl,
						&pl,
						UPL_PRECIOUS);
					if (kret != KERN_SUCCESS)
						panic("Failed to get pagelists");

					SET(bp->b_flags, B_PAGELIST);
					bp->b_pagelist = upl;

					if (!upl_valid_page(pl, 0)) {
						if (vp->v_tag != VT_NFS)
							panic("getblk: incore buffer without valid page");
						CLR(bp->b_flags, B_CACHE);
					}

					if (upl_dirty_page(pl, 0))
						SET(bp->b_flags, B_WASDIRTY);
					else
						CLR(bp->b_flags, B_WASDIRTY);

					kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
					if (kret != KERN_SUCCESS)
						panic("getblk: ubc_upl_map() failed with (%d)",
						      kret);
					if (bp->b_data == 0)
						panic("ubc_upl_map mapped 0");
				}
				break;

			case BLK_META:
				/*
				 * VM is not involved in I/O for meta-data;
				 * the buffer already has valid data.
				 */
				if (bp->b_data == 0)
					panic("bp->b_data null incore buf=%x", bp);
				break;

			case BLK_PAGEIN:
			case BLK_PAGEOUT:
				panic("getblk: paging operation 1");
				break;

			default:
				panic("getblk: %d unknown operation 2", operation);
				/*NOTREACHED*/
				break;
			}
		}
	} else {	/* not incore() */
		int queue = BQ_EMPTY;	/* Start with no preference */
		splx(s);

		if ((operation == BLK_META) || (UBCINVALID(vp)) ||
		    !(UBCINFOEXISTS(vp))) {
			operation = BLK_META;
		}
		if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
			goto start;
		if (incore(vp, blkno)) {
			SET(bp->b_flags, B_INVAL);
			binshash(bp, &invalhash);
			brelse(bp);
			goto start;
		}
		/*
		 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
		 * CALLED! BE CAREFUL.
		 */

		/*
		 * If it is meta, the queue may have been set to another
		 * type, so reset it and mark the buffer B_META so that
		 * it goes to the META queue when released.
		 * Also, if the vnode is not VREG, then it is META.
		 */
		if (operation == BLK_META) {
			SET(bp->b_flags, B_META);
			queue = BQ_META;
		}

		bp->b_blkno = bp->b_lblkno = blkno;
		bp->b_vp = vp;

		/*
		 * Insert in the hash so that incore() can find it
		 */
		binshash(bp, BUFHASH(vp, blkno));

		s = splbio();
		bgetvp(vp, bp);
		splx(s);

		allocbuf(bp, size);

		switch (operation) {
		case BLK_META:
			/* buffer data is invalid */

			if (bp->b_data == 0)
				panic("bp->b_data is null %x", bp);

			bufstats.bufs_miss++;

			/* wakeup the buffer */
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
			break;

		case BLK_READ:
		case BLK_WRITE:

			if (ISSET(bp->b_flags, B_PAGELIST))
				panic("B_PAGELIST in bp=%x", bp);

			kret = ubc_create_upl(vp,
				ubc_blktooff(vp, blkno),
				bp->b_bufsize,
				&upl,
				&pl,
				UPL_PRECIOUS);
			if (kret != KERN_SUCCESS)
				panic("Failed to get pagelists");

#ifdef UBC_DEBUG
			upl_ubc_alias_set(upl, bp, 4);
#endif /* UBC_DEBUG */
			bp->b_pagelist = upl;

			SET(bp->b_flags, B_PAGELIST);

			if (upl_valid_page(pl, 0)) {
				SET(bp->b_flags, B_CACHE | B_DONE);
				bufstats.bufs_vmhits++;

				pagedirty = upl_dirty_page(pl, 0);

				if (pagedirty)
					SET(bp->b_flags, B_WASDIRTY);

				if (vp->v_tag == VT_NFS) {
					off_t f_offset;
					int valid_size;

					bp->b_validoff = 0;
					bp->b_dirtyoff = 0;

					f_offset = ubc_blktooff(vp, blkno);

					if (f_offset > vp->v_ubcinfo->ui_size) {
						CLR(bp->b_flags, (B_CACHE|B_DONE|B_WASDIRTY));
						bp->b_validend = 0;
						bp->b_dirtyend = 0;
					} else {
						valid_size = min(((unsigned int)(vp->v_ubcinfo->ui_size - f_offset)), PAGE_SIZE);
						bp->b_validend = valid_size;

						if (pagedirty)
							bp->b_dirtyend = valid_size;
						else
							bp->b_dirtyend = 0;

						KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_NONE,
							bp->b_validend, bp->b_dirtyend,
							(int)vp->v_ubcinfo->ui_size, 0, 0);
					}
				} else {
					bp->b_validoff = 0;
					bp->b_dirtyoff = 0;

					if (pagedirty) {
						/* page is dirty */
						bp->b_validend = bp->b_bcount;
						bp->b_dirtyend = bp->b_bcount;
					} else {
						/* page is clean */
						bp->b_validend = bp->b_bcount;
						bp->b_dirtyend = 0;
					}
				}
				error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL);
				if (error) {
					panic("getblk: VOP_BMAP failed");
					/*NOTREACHED*/
					/*
					 * XXX: We probably should invalidate the VM Page
					 */
					bp->b_error = error;
					SET(bp->b_flags, (B_ERROR | B_INVAL));
					/* undo B_DONE that was set before upl_commit() */
					CLR(bp->b_flags, B_DONE);
					brelse(bp);
					return (0);
				}
			} else {
				bufstats.bufs_miss++;
			}
			kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data));
			if (kret != KERN_SUCCESS) {
				panic("getblk: ubc_upl_map() "
				      "failed with (%d)", kret);
			}
			if (bp->b_data == 0)
				panic("kernel_upl_map mapped 0");

			break;

		case BLK_PAGEIN:
		case BLK_PAGEOUT:
			panic("getblk: paging operation 2");
			break;
		default:
			panic("getblk: %d unknown operation 3", operation);
			/*NOTREACHED*/
			break;
		}
	}

	if (bp->b_data == NULL)
		panic("getblk: bp->b_addr is null");

	if (bp->b_bufsize & 0xfff) {
		if (ISSET(bp->b_flags, B_META) && (bp->b_bufsize & 0x1ff))
			panic("getblk: bp->b_bufsize = %d", bp->b_bufsize);
	}

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
		     (int)bp, (int)bp->b_data, bp->b_flags, 3, 0);

	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;
	int queue = BQ_EMPTY;

	while ((bp = getnewbuf(0, 0, &queue)) == 0)
		;
	SET(bp->b_flags, (B_META|B_INVAL));

#if DIAGNOSTIC
	assert(queue == BQ_EMPTY);
#endif /* DIAGNOSTIC */
	/* XXX need to implement logic to deal with other queues */

	binshash(bp, &invalhash);
	allocbuf(bp, size);
	bufstats.bufs_eblk++;

	return (bp);
}

/*
 * Zones for the meta data buffers
 */

#define MINMETA	512
#define MAXMETA	4096

struct meta_zone_entry {
	zone_t mz_zone;
	vm_size_t mz_size;
	vm_size_t mz_max;
	char *mz_name;
};

struct meta_zone_entry meta_zones[] = {
	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
	{NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
	{NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
	{NULL, 0, 0, "" } /* End */
};

/*
 * Initialize the meta data zones
 */
static void
bufzoneinit(void)
{
	int i;

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		meta_zones[i].mz_zone =
			zinit(meta_zones[i].mz_size,
			      meta_zones[i].mz_max,
			      PAGE_SIZE,
			      meta_zones[i].mz_name);
	}
	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
}

static __inline__ zone_t
getbufzone(size_t size)
{
	int i;

	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
		panic("getbufzone: incorrect size = %d", size);

	for (i = 0; meta_zones[i].mz_size != 0; i++) {
		if (meta_zones[i].mz_size >= size)
			break;
	}

	return (meta_zones[i].mz_zone);
}
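
/*
 * Example (informational, added for clarity): getbufzone() picks the first
 * zone whose element size can hold the request, so a 1536-byte meta-data
 * buffer is carved from the "buf.2048" zone, while a 512-byte one comes
 * from "buf.512". Requests must be multiples of 512 in [MINMETA, MAXMETA];
 * anything else panics above.
 */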

/*
 * With UBC, there is no need to expand / shrink the file data
 * buffer. The VM uses the same pages, hence no waste.
 * All the file data buffers can have one size.
 * In fact, expand / shrink would be an expensive operation.
 *
 * The only exception to this is meta-data buffers. Most meta-data
 * operations are smaller than PAGE_SIZE. Having the meta-data
 * buffers grow and shrink as needed optimizes the use of kernel
 * wired memory.
 */

int
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	vm_size_t desired_size;

	desired_size = roundup(size, CLBYTES);

	if (desired_size < PAGE_SIZE)
		desired_size = PAGE_SIZE;
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

	if (ISSET(bp->b_flags, B_META)) {
		kern_return_t kret;
		zone_t zprev, z;
		size_t nsize = roundup(size, MINMETA);

		if (bp->b_data) {
			vm_offset_t elem = (vm_offset_t)bp->b_data;

			if (ISSET(bp->b_flags, B_ZALLOC))
				if (bp->b_bufsize <= MAXMETA) {
					if (bp->b_bufsize < nsize) {
						/* reallocate to a bigger size */

						zprev = getbufzone(bp->b_bufsize);
						if (nsize <= MAXMETA) {
							desired_size = nsize;
							z = getbufzone(nsize);
							bp->b_data = (caddr_t)zalloc(z);
							if (bp->b_data == 0)
								panic("allocbuf: zalloc() returned NULL");
						} else {
							kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
							if (kret != KERN_SUCCESS)
								panic("allocbuf: kmem_alloc() 0 returned %d", kret);
							if (bp->b_data == 0)
								panic("allocbuf: null b_data 0");
							CLR(bp->b_flags, B_ZALLOC);
						}
						bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
						zfree(zprev, elem);
					} else {
						desired_size = bp->b_bufsize;
					}
				} else
					panic("allocbuf: B_ZALLOC set incorrectly");
			else
				if (bp->b_bufsize < desired_size) {
					/* reallocate to a bigger size */
					kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
					if (kret != KERN_SUCCESS)
						panic("allocbuf: kmem_alloc() returned %d", kret);
					if (bp->b_data == 0)
						panic("allocbuf: null b_data");
					bcopy((const void *)elem, bp->b_data, bp->b_bufsize);
					kmem_free(kernel_map, elem, bp->b_bufsize);
				} else {
					desired_size = bp->b_bufsize;
				}
		} else {
			/* new allocation */
			if (nsize <= MAXMETA) {
				desired_size = nsize;
				z = getbufzone(nsize);
				bp->b_data = (caddr_t)zalloc(z);
				if (bp->b_data == 0)
					panic("allocbuf: zalloc() returned NULL 2");
				SET(bp->b_flags, B_ZALLOC);
			} else {
				kret = kmem_alloc(kernel_map, &bp->b_data, desired_size);
				if (kret != KERN_SUCCESS)
					panic("allocbuf: kmem_alloc() 2 returned %d", kret);
				if (bp->b_data == 0)
					panic("allocbuf: null b_data 2");
			}
		}
	}

	if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
		panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);

	bp->b_bufsize = desired_size;
	bp->b_bcount = size;
	return (0);
}

/*
 * Get a new buffer from one of the free lists.
 *
 * The request for a queue is passed in. The queue from which the buffer
 * was taken is returned. Out-of-range queue requests get BQ_EMPTY.
 * A request for BQUEUES means no preference; use heuristics in that case.
 * The heuristic is as follows:
 *	Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
 *	If none is available, block until one is made available.
 *	If buffers are available on both BQ_AGE and BQ_LRU, check the
 *	timestamps and pick the most stale buffer.
 *	If the found buffer was marked delayed write, start the asynchronous
 *	write and restart the search.
 *	Initialize the fields and disassociate the buffer from the vnode.
 *	Remove the buffer from the hash. Return the buffer and the queue
 *	on which it was found.
 */
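
/*
 * Worked example (informational, added for clarity): with the defaults
 * lru_is_stale = 120 and age_is_stale = 60, an LRU buffer idle for 130
 * seconds is preferred over an AGE buffer idle for only 30 seconds,
 * since the LRU head is already stale while the AGE head is not; in
 * every other case the AGE head is taken.
 */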

static struct buf *
getnewbuf(slpflag, slptimeo, queue)
	int slpflag, slptimeo;
	int *queue;
{
	register struct buf *bp;
	register struct buf *lru_bp;
	register struct buf *age_bp;
	register struct buf *meta_bp;
	register int age_time, lru_time, bp_time, meta_time;
	int s;
	int req = *queue;	/* save it for restarts */

start:
	s = splbio();

	/* invalid request gets empty queue */
	if ((*queue > BQUEUES) || (*queue < 0)
	    || (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
		*queue = BQ_EMPTY;

	/* (*queue == BQUEUES) means no preference */
	if (*queue != BQUEUES) {
		/* Try for the requested queue first */
		bp = bufqueues[*queue].tqh_first;
		if (bp)
			goto found;
	}

	/* Unable to use requested queue */
	age_bp = bufqueues[BQ_AGE].tqh_first;
	lru_bp = bufqueues[BQ_LRU].tqh_first;
	meta_bp = bufqueues[BQ_META].tqh_first;

	if (!age_bp && !lru_bp && !meta_bp) {
		/*
		 * Unavailable on AGE or LRU or META queues.
		 * Try the empty list first.
		 */
		bp = bufqueues[BQ_EMPTY].tqh_first;
		if (bp) {
			*queue = BQ_EMPTY;
			goto found;
		}

		/* Create a new temporary buffer header */
		bp = (struct buf *)zalloc(buf_hdr_zone);

		if (bp) {
			bufhdrinit(bp);
			BLISTNONE(bp);
			binshash(bp, &invalhash);
			SET(bp->b_flags, B_HDRALLOC);
			*queue = BQ_EMPTY;
			binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
			buf_hdr_count++;
			goto found;
		}

		/* Log this error condition */
		printf("getnewbuf: No useful buffers");

		/* wait for a free buffer of any kind */
		needbuffer = 1;
		bufstats.bufs_sleeps++;
		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* Buffer available either on AGE or LRU or META */
	bp = NULL;
	*queue = -1;

	/* Buffer available either on AGE or LRU */
	if (!age_bp) {
		bp = lru_bp;
		*queue = BQ_LRU;
	} else if (!lru_bp) {
		bp = age_bp;
		*queue = BQ_AGE;
	} else {	/* buffer available on both AGE and LRU */
		age_time = time.tv_sec - age_bp->b_timestamp;
		lru_time = time.tv_sec - lru_bp->b_timestamp;
		if ((age_time < 0) || (lru_time < 0)) {	/* time set backwards */
			bp = age_bp;
			*queue = BQ_AGE;
			/*
			 * we should probably re-timestamp everything in the
			 * queues at this point with the current time
			 */
		} else {
			if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
				bp = lru_bp;
				*queue = BQ_LRU;
			} else {
				bp = age_bp;
				*queue = BQ_AGE;
			}
		}
	}

	if (!bp) {	/* Neither on AGE nor on LRU */
		bp = meta_bp;
		*queue = BQ_META;
	} else if (meta_bp) {
		bp_time = time.tv_sec - bp->b_timestamp;
		meta_time = time.tv_sec - meta_bp->b_timestamp;

		if (!(bp_time < 0) && !(meta_time < 0)) {
			/* time not set backwards */
			int bp_is_stale;
			bp_is_stale = (*queue == BQ_LRU) ?
					lru_is_stale : age_is_stale;

			if ((meta_time >= meta_is_stale) &&
			    (bp_time < bp_is_stale)) {
				bp = meta_bp;
				*queue = BQ_META;
			}
		}
	}

	if (bp == NULL)
		panic("getnewbuf: null bp");

found:
	if (ISSET(bp->b_flags, B_LOCKED)) {
		panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
	}

	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
		panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);

	if (ISSET(bp->b_flags, B_BUSY))
		panic("getnewbuf reusing BUSY buf @ 0x%x", bp);

	/* Clean it */
	if (bcleanbuf(bp)) {
		/* bawrite() issued, buffer not ready */
		splx(s);
		*queue = req;
		goto start;
	}
	splx(s);
	return (bp);
}

1710 | #include <mach/mach_types.h> | |
1711 | #include <mach/memory_object_types.h> | |
1712 | #include <kern/sched_prim.h> | |
1713 | ||
1714 | /* | |
1715 | * Clean a buffer. | |
1716 | * Returns 0 if the buffer is ready to use; | |
1717 | * returns 1 if it issued a bawrite() to indicate | |
1718 | * that the buffer is not ready. | |
1719 | */ | |
1720 | static int | |
1721 | bcleanbuf(struct buf *bp) | |
1722 | { | |
1723 | int s; | |
1724 | struct ucred *cred; | |
1725 | int hdralloc = 0; | |
1726 | ||
1727 | s = splbio(); | |
1728 | ||
1729 | /* Remove from the queue */ | |
1730 | bremfree(bp); | |
1731 | ||
1732 | /* Buffer is no longer on free lists. */ | |
1733 | SET(bp->b_flags, B_BUSY); | |
1734 | ||
1735 | /* Check whether the buffer header was "allocated" */ | |
1736 | if (ISSET(bp->b_flags, B_HDRALLOC)) | |
1737 | hdralloc = 1; | |
1738 | ||
1739 | if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef) | |
1740 | panic("bcleanbuf: le_prev is deadbeef"); | |
1741 | ||
1742 | /* | |
1743 | * If the buffer was a delayed write, start the I/O by queueing | |
1744 | * it on the LAUNDRY queue (serviced by bcleanbuf_thread()) and return 1. | |
1745 | */ | |
1746 | if (ISSET(bp->b_flags, B_DELWRI)) { | |
1747 | splx(s); | |
1748 | binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); | |
1749 | blaundrycnt++; | |
1750 | wakeup(&blaundrycnt); | |
1751 | /* and give it a chance to run */ | |
1752 | (void)thread_block(THREAD_CONTINUE_NULL); | |
1753 | return (1); | |
1754 | } | |
1755 | ||
1756 | if (bp->b_vp) | |
1757 | brelvp(bp); | |
1758 | bremhash(bp); | |
1759 | BLISTNONE(bp); | |
1760 | ||
1761 | splx(s); | |
1762 | ||
1763 | if (ISSET(bp->b_flags, B_META)) { | |
1764 | vm_offset_t elem = (vm_offset_t)bp->b_data; | |
1765 | if (elem == 0) | |
1766 | panic("bcleanbuf: NULL bp->b_data B_META buffer"); | |
1767 | ||
1768 | if (ISSET(bp->b_flags, B_ZALLOC)) { | |
1769 | if (bp->b_bufsize <= MAXMETA) { | |
1770 | zone_t z; | |
1771 | ||
1772 | z = getbufzone(bp->b_bufsize); | |
1773 | bp->b_data = (caddr_t)0xdeadbeef; | |
1774 | zfree(z, elem); | |
1775 | CLR(bp->b_flags, B_ZALLOC); | |
1776 | } else | |
1777 | panic("bcleanbuf: B_ZALLOC set incorrectly"); | |
1778 | } else { | |
1779 | bp->b_data = (caddr_t)0xdeadbeef; | |
1780 | kmem_free(kernel_map, elem, bp->b_bufsize); | |
1781 | } | |
1782 | } | |
1783 | ||
1784 | trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); | |
1785 | ||
1786 | /* we were disassociated from our vnode above, if we had one... */ | |
1787 | s = splbio(); | |
1788 | ||
1789 | /* clear out various other fields */ | |
1790 | bp->b_bufsize = 0; | |
1791 | bp->b_data = 0; | |
1792 | bp->b_flags = B_BUSY; | |
1793 | if (hdralloc) | |
1794 | SET(bp->b_flags, B_HDRALLOC); | |
1795 | bp->b_dev = NODEV; | |
1796 | bp->b_blkno = bp->b_lblkno = 0; | |
1797 | bp->b_iodone = 0; | |
1798 | bp->b_error = 0; | |
1799 | bp->b_resid = 0; | |
1800 | bp->b_bcount = 0; | |
1801 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
1802 | bp->b_validoff = bp->b_validend = 0; | |
1803 | ||
1804 | /* nuke any credentials we were holding */ | |
1805 | cred = bp->b_rcred; | |
1806 | if (cred != NOCRED) { | |
1807 | bp->b_rcred = NOCRED; | |
1808 | crfree(cred); | |
1809 | } | |
1810 | cred = bp->b_wcred; | |
1811 | if (cred != NOCRED) { | |
1812 | bp->b_wcred = NOCRED; | |
1813 | crfree(cred); | |
1814 | } | |
1815 | splx(s); | |
1816 | return (0); | |
1817 | } | |
1818 | ||
1819 | ||
1820 | /* | |
1821 | * Wait for operations on the buffer to complete. | |
1822 | * When they do, extract and return the I/O's error value. | |
1823 | */ | |
1824 | int | |
1825 | biowait(bp) | |
1826 | struct buf *bp; | |
1827 | { | |
1828 | int s; | |
1829 | ||
1830 | s = splbio(); | |
1831 | while (!ISSET(bp->b_flags, B_DONE)) | |
1832 | tsleep(bp, PRIBIO + 1, "biowait", 0); | |
1833 | splx(s); | |
1834 | ||
1835 | /* check for interruption of I/O (e.g. via NFS), then errors. */ | |
1836 | if (ISSET(bp->b_flags, B_EINTR)) { | |
1837 | CLR(bp->b_flags, B_EINTR); | |
1838 | return (EINTR); | |
1839 | } else if (ISSET(bp->b_flags, B_ERROR)) | |
1840 | return (bp->b_error ? bp->b_error : EIO); | |
1841 | else | |
1842 | return (0); | |
1843 | } | |
1844 | ||
1845 | /* | |
1846 | * Mark I/O complete on a buffer. | |
1847 | * | |
1848 | * If a callback has been requested, e.g. the pageout | |
1849 | * daemon, do so. Otherwise, awaken waiting processes. | |
1850 | * | |
1851 | * [ Leffler, et al., says on p.247: | |
1852 | * "This routine wakes up the blocked process, frees the buffer | |
1853 | * for an asynchronous write, or, for a request by the pagedaemon | |
1854 | * process, invokes a procedure specified in the buffer structure" ] | |
1855 | * | |
1856 | * In real life, the pagedaemon (or other system processes) wants | |
1857 | * to do async I/O too, and doesn't want the buffer brelse()'d. | |
1858 | * (For the swap pager, that would put swap buffers on the free lists (!!!); | |
1859 | * for the vn device, that would put malloc'd buffers on the free lists!) | |
1860 | */ | |
1861 | void | |
1862 | biodone(bp) | |
1863 | struct buf *bp; | |
1864 | { | |
1865 | boolean_t funnel_state; | |
1866 | struct vnode *vp; | |
1867 | extern struct timeval priority_IO_timestamp_for_root; | |
1868 | extern int hard_throttle_on_root; | |
1869 | ||
1870 | funnel_state = thread_funnel_set(kernel_flock, TRUE); | |
1871 | ||
1872 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START, | |
1873 | (int)bp, (int)bp->b_data, bp->b_flags, 0, 0); | |
1874 | ||
1875 | if (ISSET(bp->b_flags, B_DONE)) | |
1876 | panic("biodone already"); | |
1877 | SET(bp->b_flags, B_DONE); /* note that it's done */ | |
1878 | /* | |
1879 | * I/O was done, so don't believe | |
1880 | * the DIRTY state from VM anymore | |
1881 | */ | |
1882 | CLR(bp->b_flags, B_WASDIRTY); | |
1883 | ||
1884 | if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) | |
1885 | vwakeup(bp); /* wake anyone waiting for the vnode's writes to drain */ | |
1886 | ||
1887 | if (kdebug_enable) { | |
1888 | int code = DKIO_DONE; | |
1889 | ||
1890 | if (bp->b_flags & B_READ) | |
1891 | code |= DKIO_READ; | |
1892 | if (bp->b_flags & B_ASYNC) | |
1893 | code |= DKIO_ASYNC; | |
1894 | ||
1895 | if (bp->b_flags & B_META) | |
1896 | code |= DKIO_META; | |
1897 | else if (bp->b_flags & (B_PGIN | B_PAGEOUT)) | |
1898 | code |= DKIO_PAGING; | |
1899 | ||
1900 | KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, | |
1901 | (unsigned int)bp, (unsigned int)bp->b_vp, | |
1902 | bp->b_resid, bp->b_error, 0); | |
1903 | } | |
1904 | ||
1905 | /* Wakeup the throttled write operations as needed */ | |
1906 | vp = bp->b_vp; | |
1907 | if (vp | |
1908 | && (vp->v_flag & VTHROTTLED) | |
1909 | && (vp->v_numoutput <= (BUFWRITE_THROTTLE / 3))) { | |
1910 | vp->v_flag &= ~VTHROTTLED; | |
1911 | wakeup((caddr_t)&vp->v_numoutput); | |
1912 | } | |
1913 | if (vp && vp->v_mount && (bp->b_flags & B_PGIN) && (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV)) { | |
1914 | priority_IO_timestamp_for_root = time; | |
1915 | hard_throttle_on_root = 0; | |
1916 | } | |
1917 | if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ | |
1918 | void (*iodone_func)(struct buf *) = bp->b_iodone; | |
1919 | ||
1920 | CLR(bp->b_flags, B_CALL); /* but note callout done */ | |
1921 | bp->b_iodone = NULL; | |
1922 | ||
1923 | if (iodone_func == NULL) { | |
1924 | panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp); | |
1925 | } else { | |
1926 | (*iodone_func)(bp); | |
1927 | } | |
1928 | } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */ | |
1929 | brelse(bp); | |
1930 | else { /* or just wakeup the buffer */ | |
1931 | CLR(bp->b_flags, B_WANTED); | |
1932 | wakeup(bp); | |
1933 | } | |
1934 | ||
1935 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END, | |
1936 | (int)bp, (int)bp->b_data, bp->b_flags, 0, 0); | |
1937 | ||
1938 | thread_funnel_set(kernel_flock, funnel_state); | |
1939 | } | |
1940 | ||
1941 | /* | |
1942 | * Return a count of buffers on the "locked" queue. | |
1943 | */ | |
1944 | int | |
1945 | count_lock_queue() | |
1946 | { | |
1947 | register struct buf *bp; | |
1948 | register int n = 0; | |
1949 | ||
1950 | for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; | |
1951 | bp = bp->b_freelist.tqe_next) | |
1952 | n++; | |
1953 | return (n); | |
1954 | } | |
1955 | ||
1956 | /* | |
1957 | * Return a count of 'busy' buffers. Used at the time of shutdown. | |
1958 | */ | |
1959 | int | |
1960 | count_busy_buffers() | |
1961 | { | |
1962 | register struct buf *bp; | |
1963 | register int nbusy = 0; | |
1964 | ||
1965 | for (bp = &buf[nbuf]; --bp >= buf; ) | |
1966 | if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY) | |
1967 | nbusy++; | |
1968 | return (nbusy); | |
1969 | } | |
1970 | ||
1971 | #if DIAGNOSTIC | |
1972 | /* | |
1973 | * Print out statistics on the current allocation of the buffer pool. | |
1974 | * Can be enabled to print out on every ``sync'' by setting "syncprt" | |
1975 | * in vfs_syscalls.c using sysctl. | |
1976 | */ | |
1977 | void | |
1978 | vfs_bufstats() | |
1979 | { | |
1980 | int s, i, j, count; | |
1981 | register struct buf *bp; | |
1982 | register struct bqueues *dp; | |
1983 | int counts[MAXBSIZE/CLBYTES+1]; | |
1984 | static char *bname[BQUEUES] = | |
1985 | { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; | |
1986 | ||
1987 | for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { | |
1988 | count = 0; | |
1989 | for (j = 0; j <= MAXBSIZE/CLBYTES; j++) | |
1990 | counts[j] = 0; | |
1991 | s = splbio(); | |
1992 | for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { | |
1993 | counts[bp->b_bufsize/CLBYTES]++; | |
1994 | count++; | |
1995 | } | |
1996 | splx(s); | |
1997 | printf("%s: total-%d", bname[i], count); | |
1998 | for (j = 0; j <= MAXBSIZE/CLBYTES; j++) | |
1999 | if (counts[j] != 0) | |
2000 | printf(", %d-%d", j * CLBYTES, counts[j]); | |
2001 | printf("\n"); | |
2002 | } | |
2003 | } | |
2004 | #endif /* DIAGNOSTIC */ | |
2005 | ||
2006 | #define NRESERVEDIOBUFS 64 | |
2007 | ||
2008 | __private_extern__ struct buf * | |
2009 | alloc_io_buf(vp, priv) | |
2010 | struct vnode *vp; | |
2011 | int priv; | |
2012 | { | |
2013 | register struct buf *bp; | |
2014 | int s; | |
2015 | ||
2016 | s = splbio(); | |
2017 | ||
2018 | while (niobuf - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse && !priv) { | |
2019 | need_iobuffer = 1; | |
2020 | bufstats.bufs_iobufsleeps++; | |
2021 | (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf", 0); | |
2022 | } | |
2023 | ||
2024 | while ((bp = iobufqueue.tqh_first) == NULL) { | |
2025 | need_iobuffer = 1; | |
2026 | bufstats.bufs_iobufsleeps++; | |
2027 | (void) tsleep(&need_iobuffer, (PRIBIO+1), "alloc_io_buf1", 0); | |
2028 | } | |
2029 | ||
2030 | TAILQ_REMOVE(&iobufqueue, bp, b_freelist); | |
2031 | bp->b_timestamp = 0; | |
2032 | ||
2033 | /* clear out various fields */ | |
2034 | bp->b_flags = B_BUSY; | |
2035 | bp->b_blkno = bp->b_lblkno = 0; | |
2036 | ||
2037 | bp->b_iodone = 0; | |
2038 | bp->b_error = 0; | |
2039 | bp->b_resid = 0; | |
2040 | bp->b_bcount = 0; | |
2041 | bp->b_bufsize = 0; | |
2042 | bp->b_vp = vp; | |
2043 | ||
2044 | if (vp->v_type == VBLK || vp->v_type == VCHR) | |
2045 | bp->b_dev = vp->v_rdev; | |
2046 | else | |
2047 | bp->b_dev = NODEV; | |
2048 | bufstats.bufs_iobufinuse++; | |
2049 | if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax) | |
2050 | bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse; | |
2051 | splx(s); | |
2052 | ||
2053 | return (bp); | |
2054 | } | |
2055 | ||
2056 | __private_extern__ void | |
2057 | free_io_buf(bp) | |
2058 | struct buf *bp; | |
2059 | { | |
2060 | int s; | |
2061 | ||
2062 | s = splbio(); | |
2063 | /* put buffer back on the head of the iobufqueue */ | |
2064 | bp->b_vp = NULL; | |
2065 | bp->b_flags = B_INVAL; | |
2066 | ||
2067 | binsheadfree(bp, &iobufqueue, -1); | |
2068 | ||
2069 | /* Wake up any processes waiting for any buffer to become free. */ | |
2070 | if (need_iobuffer) { | |
2071 | need_iobuffer = 0; | |
2072 | wakeup(&need_iobuffer); | |
2073 | } | |
2074 | bufstats.bufs_iobufinuse--; | |
2075 | splx(s); | |
2076 | } | |
2077 | ||
2078 | /* disabled for now */ | |
2079 | ||
2080 | /* XXX move this to a separate file */ | |
2081 | /* | |
2082 | * Dynamic Scaling of the Buffer Queues | |
2083 | */ | |
2084 | ||
2085 | typedef long long blsize_t; | |
2086 | ||
2087 | blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */ | |
2088 | /* Global tunable limits */ | |
2089 | blsize_t nbufh; /* number of buffer headers */ | |
2090 | blsize_t nbuflow; /* minimum number of buffer headers required */ | |
2091 | blsize_t nbufhigh; /* maximum number of buffer headers allowed */ | |
2092 | blsize_t nbuftarget; /* preferred number of buffer headers */ | |
2093 | ||
2094 | /* | |
2095 | * assertions: | |
2096 | * | |
2097 | * 1. 0 < nbuflow <= nbufh <= nbufhigh | |
2098 | * 2. nbufhigh <= MAXNBUF | |
2099 | * 3. 0 < nbuflow <= nbuftarget <= nbufhigh | |
2100 | * 4. nbufh cannot be set by sysctl(). | |
2101 | */ | |
2102 | ||
2103 | /* Per queue tunable limits */ | |
2104 | ||
2105 | struct bufqlim { | |
2106 | blsize_t bl_nlow; /* minimum number of buffer headers required */ | |
2107 | blsize_t bl_num; /* number of buffer headers on the queue */ | |
2108 | blsize_t bl_nlhigh; /* maximum number of buffer headers allowed */ | |
2109 | blsize_t bl_target; /* preferred number of buffer headers */ | |
2110 | long bl_stale; /* Seconds after which a buffer is considered stale */ | |
2111 | } bufqlim[BQUEUES]; | |
2112 | ||
2113 | /* | |
2114 | * assertions: | |
2115 | * | |
2116 | * 1. 0 <= bl_nlow <= bl_num <= bl_nlhigh | |
2117 | * 2. bl_nlhigh <= MAXNBUF | |
2118 | * 3. bufqlim[BQ_META].bl_nlow != 0 | |
2119 | * 4. bufqlim[BQ_META].bl_nlow > (number of possible concurrent | |
2120 | * file system IO operations) | |
2121 | * 5. bl_num cannot be set by sysctl(). | |
2122 | * 6. bl_nlhigh <= nbufhigh | |
2123 | */ | |
2124 | ||
2125 | /* | |
2126 | * Rationale: | |
2127 | * ---------- | |
2128 | * Defining blsize_t as a long would permit 2^31 buffer headers per | |
2129 | * queue, which can describe (2^31 * PAGE_SIZE) bytes of memory per queue. | |
2130 | * | |
2131 | * These limits are exported by means of sysctl(). It was decided to | |
2132 | * define blsize_t as a 64 bit quantity instead, so that it will not | |
2133 | * need to change as long as the kernel address space does not | |
2134 | * exceed 64 bits. | |
2135 | * | |
2136 | * The low and high limit parameters are initialized at compile time, | |
2137 | * and boot arguments can be used to override them. sysctl() can | |
2138 | * get all of the values but can set only the target; num reflects | |
2139 | * the current level and cannot be set. | |
2140 | * | |
2141 | * Advantages of having a "bufqscan" thread doing the balancing are: | |
2142 | * it keeps enough bufs on BQ_EMPTY (getnewbuf() by default will | |
2143 | * always select a buffer from BQ_EMPTY and performs best when it | |
2144 | * finds one there), and it minimizes the possibility of starting | |
2145 | * IO from getnewbuf(). That's a performance win, too. | |
2147 | * | |
2148 | * Localize complex logic [balancing as well as time aging] | |
2149 | * to balancebufq(). | |
2150 | * | |
2151 | * Simplify getnewbuf() logic by elimination of time aging code. | |
2152 | */ | |
2153 | ||
2154 | /* | |
2155 | * Algorithm: | |
2156 | * ----------- | |
2157 | * The goal of the dynamic scaling of the buffer queues is to keep | |
2158 | * the size of each queue close to its bl_target. Buffers on a queue | |
2159 | * would be time aged. | |
2160 | * | |
2161 | * There would be a thread which will be responsible for "balancing" | |
2162 | * the buffer cache queues. | |
2163 | * | |
2164 | * The scan order would be: AGE, LRU, META, EMPTY. | |
2165 | */ | |
2166 | ||
2167 | long bufqscanwait = 0; | |
2168 | ||
2169 | static void bufqscan_thread(); | |
2170 | static int balancebufq(int q); | |
2171 | static int btrimempty(int n); | |
2172 | static __inline__ int initbufqscan(void); | |
2173 | static __inline__ int nextbufq(int q); | |
2174 | static void buqlimprt(int all); | |
2175 | ||
2176 | static void | |
2177 | bufq_balance_thread_init() | |
2178 | { | |
2179 | ||
2180 | if (bufqscanwait++ == 0) { | |
2181 | ||
2182 | /* Initialize globals */ | |
2183 | MAXNBUF = (sane_size / PAGE_SIZE); | |
2184 | nbufh = nbuf; | |
2185 | nbuflow = min(nbufh, 100); | |
2186 | nbufhigh = min(MAXNBUF, max(nbufh, 2048)); | |
2187 | nbuftarget = (sane_size >> 5) / PAGE_SIZE; | |
2188 | nbuftarget = max(nbuflow, nbuftarget); | |
2189 | nbuftarget = min(nbufhigh, nbuftarget); | |
2190 | ||
2191 | /* | |
2192 | * Initialize the bufqlim | |
2193 | */ | |
2194 | ||
2195 | /* LOCKED queue */ | |
2196 | bufqlim[BQ_LOCKED].bl_nlow = 0; | |
2197 | bufqlim[BQ_LOCKED].bl_nlhigh = 32; | |
2198 | bufqlim[BQ_LOCKED].bl_target = 0; | |
2199 | bufqlim[BQ_LOCKED].bl_stale = 30; | |
2200 | ||
2201 | /* LRU queue */ | |
2202 | bufqlim[BQ_LRU].bl_nlow = 0; | |
2203 | bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4; | |
2204 | bufqlim[BQ_LRU].bl_target = nbuftarget/4; | |
2205 | bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE; | |
2206 | ||
2207 | /* AGE queue */ | |
2208 | bufqlim[BQ_AGE].bl_nlow = 0; | |
2209 | bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4; | |
2210 | bufqlim[BQ_AGE].bl_target = nbuftarget/4; | |
2211 | bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE; | |
2212 | ||
2213 | /* EMPTY queue */ | |
2214 | bufqlim[BQ_EMPTY].bl_nlow = 0; | |
2215 | bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4; | |
2216 | bufqlim[BQ_EMPTY].bl_target = nbuftarget/4; | |
2217 | bufqlim[BQ_EMPTY].bl_stale = 600000; | |
2218 | ||
2219 | /* META queue */ | |
2220 | bufqlim[BQ_META].bl_nlow = 0; | |
2221 | bufqlim[BQ_META].bl_nlhigh = nbufhigh/4; | |
2222 | bufqlim[BQ_META].bl_target = nbuftarget/4; | |
2223 | bufqlim[BQ_META].bl_stale = META_IS_STALE; | |
2224 | ||
2225 | /* LAUNDRY queue */ | |
2226 | bufqlim[BQ_LAUNDRY].bl_nlow = 0; | |
2227 | bufqlim[BQ_LAUNDRY].bl_nlhigh = 32; | |
2228 | bufqlim[BQ_LAUNDRY].bl_target = 0; | |
2229 | bufqlim[BQ_LAUNDRY].bl_stale = 30; | |
2230 | ||
2231 | buqlimprt(1); | |
2232 | } | |
2233 | ||
2234 | /* create worker thread */ | |
2235 | kernel_thread(kernel_task, bufqscan_thread); | |
2236 | } | |
2237 | ||
2238 | /* The workloop for the buffer balancing thread */ | |
2239 | static void | |
2240 | bufqscan_thread() | |
2241 | { | |
2242 | boolean_t funnel_state; | |
2243 | int moretodo = 0; | |
2244 | ||
2245 | funnel_state = thread_funnel_set(kernel_flock, TRUE); | |
2246 | ||
2247 | for(;;) { | |
2248 | do { | |
2249 | int q; /* buffer queue to process */ | |
2250 | ||
2251 | q = initbufqscan(); | |
2252 | for (; q; ) { | |
2253 | moretodo |= balancebufq(q); | |
2254 | q = nextbufq(q); | |
2255 | } | |
2256 | } while (moretodo); | |
2257 | ||
2258 | #if DIAGNOSTIC | |
2259 | vfs_bufstats(); | |
2260 | buqlimprt(0); | |
2261 | #endif | |
2262 | (void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz); | |
2263 | moretodo = 0; | |
2264 | } | |
2265 | ||
2266 | (void) thread_funnel_set(kernel_flock, FALSE); | |
2267 | } | |
2268 | ||
2269 | /* Seed for the buffer queue balancing */ | |
2270 | static __inline__ int | |
2271 | initbufqscan() | |
2272 | { | |
2273 | /* Start with AGE queue */ | |
2274 | return (BQ_AGE); | |
2275 | } | |
2276 | ||
2277 | /* Pick next buffer queue to balance */ | |
2278 | static __inline__ int | |
2279 | nextbufq(int q) | |
2280 | { | |
2281 | /* follow the documented scan order AGE -> LRU -> META -> EMPTY; | |
2282 | * a return of 0 (BQ_LOCKED, which is never balanced) ends the scan */ | |
2283 | switch (q) { | |
| case BQ_AGE: return (BQ_LRU); | |
| case BQ_LRU: return (BQ_META); | |
| case BQ_META: return (BQ_EMPTY); | |
| default: return (0); | |
| } | |
2286 | } | |
2287 | ||
2288 | /* function to balance the buffer queues */ | |
2289 | static int | |
2290 | balancebufq(int q) | |
2291 | { | |
2292 | int moretodo = 0; | |
2293 | int s = splbio(); | |
2294 | int n; | |
2295 | ||
2296 | /* reject invalid q */ | |
2297 | if ((q < 0) || (q >= BQUEUES)) | |
2298 | goto out; | |
2299 | ||
2300 | /* The LOCKED and LAUNDRY queues must not be balanced */ | |
2301 | if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY)) | |
2302 | goto out; | |
2303 | ||
2304 | n = (bufqlim[q].bl_num - bufqlim[q].bl_target); | |
2305 | ||
2306 | /* If the queue is below its target there is nothing more to do */ | |
2307 | if (n < 0) | |
2308 | goto out; | |
2309 | ||
2310 | if ( n > 8 ) { | |
2311 | /* Balance only a small amount (12.5%) at a time */ | |
2312 | n >>= 3; | |
2313 | } | |
2314 | ||
2315 | /* EMPTY queue needs special handling */ | |
2316 | if (q == BQ_EMPTY) { | |
2317 | moretodo |= btrimempty(n); | |
2318 | goto out; | |
2319 | } | |
2320 | ||
2321 | for (; n > 0; n--) { | |
2322 | struct buf *bp = bufqueues[q].tqh_first; | |
2323 | if (!bp) | |
2324 | break; | |
2325 | ||
2326 | /* check if it's stale */ | |
2327 | if ((time.tv_sec - bp->b_timestamp) > bufqlim[q].bl_stale) { | |
2328 | if (bcleanbuf(bp)) { | |
2329 | /* bawrite() issued, bp not ready */ | |
2330 | moretodo = 1; | |
2331 | } else { | |
2332 | /* release the cleaned buffer to BQ_EMPTY */ | |
2333 | SET(bp->b_flags, B_INVAL); | |
2334 | brelse(bp); | |
2335 | } | |
2336 | } else | |
2337 | break; | |
2338 | } | |
2339 | ||
2340 | out: | |
2341 | splx(s); | |
2342 | return (moretodo); | |
2343 | } | |
2344 | ||
2345 | static int | |
2346 | btrimempty(int n) | |
2347 | { | |
2348 | /* | |
2349 | * When struct bufs are allocated dynamically, this would | |
2350 | * reclaim up to 'n' struct bufs from the empty queue. | |
2351 | */ | |
2352 | ||
2353 | return (0); | |
2354 | } | |
2355 | ||
2356 | static __inline__ void | |
2357 | bufqinc(int q) | |
2358 | { | |
2359 | if ((q < 0) || (q >= BQUEUES)) | |
2360 | return; | |
2361 | ||
2362 | bufqlim[q].bl_num++; | |
2363 | return; | |
2364 | } | |
2365 | ||
2366 | static __inline__ void | |
2367 | bufqdec(int q) | |
2368 | { | |
2369 | if ((q < 0) || (q >= BQUEUES)) | |
2370 | return; | |
2371 | ||
2372 | bufqlim[q].bl_num--; | |
2373 | return; | |
2374 | } | |
2375 | ||
2376 | static void | |
2377 | buqlimprt(int all) | |
2378 | { | |
2379 | int i; | |
2380 | static char *bname[BQUEUES] = | |
2381 | { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; | |
2382 | ||
2383 | if (all) | |
2384 | for (i = 0; i < BQUEUES; i++) { | |
2385 | printf("%s : ", bname[i]); | |
2386 | printf("min = %ld, ", (long)bufqlim[i].bl_nlow); | |
2387 | printf("cur = %ld, ", (long)bufqlim[i].bl_num); | |
2388 | printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh); | |
2389 | printf("target = %ld, ", (long)bufqlim[i].bl_target); | |
2390 | printf("stale after %ld seconds\n", bufqlim[i].bl_stale); | |
2391 | } | |
2392 | else { | |
2393 | for (i = 0; i < BQUEUES; i++) { | |
2394 | printf("%s : ", bname[i]); | |
2395 | printf("cur = %ld, ", (long)bufqlim[i].bl_num); | |
2396 | } | |
| printf("\n"); /* terminate the one-line summary */ | |
| } | |
2397 | } | |
2398 | ||
2399 | /* | |
2400 | * If getnewbuf() calls bcleanbuf() on the same thread, | |
2401 | * there is a potential for stack overrun and deadlocks, | |
2402 | * so we always hand off the work to a worker thread for completion. | |
2403 | */ | |
2404 | ||
| static void bcleanbuf_thread(); /* forward declaration */ | |
| ||
2405 | static void | |
2406 | bcleanbuf_thread_init() | |
2407 | { | |
2409 | ||
2410 | /* create worker thread */ | |
2411 | kernel_thread(kernel_task, bcleanbuf_thread); | |
2412 | } | |
2413 | ||
2414 | static void | |
2415 | bcleanbuf_thread() | |
2416 | { | |
2417 | boolean_t funnel_state; | |
2418 | struct buf *bp; | |
2419 | int error = 0; | |
2420 | int loopcnt = 0; | |
2421 | ||
2422 | funnel_state = thread_funnel_set(kernel_flock, TRUE); | |
2423 | ||
2424 | doit: | |
2425 | while (blaundrycnt == 0) | |
2426 | (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 60 * hz); | |
2427 | bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY]); | |
2428 | /* Remove from the queue */ | |
2429 | bremfree(bp); | |
2430 | blaundrycnt--; | |
2431 | ||
2432 | /* do the IO */ | |
2433 | error = bawrite_internal(bp, 0); | |
2434 | if (error) { | |
2435 | binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); | |
2436 | blaundrycnt++; | |
2437 | if (loopcnt > 10) { | |
2438 | (void)tsleep((void *)&blaundrycnt, PRIBIO, "blaundry", 1); | |
2439 | loopcnt = 0; | |
2440 | } else { | |
2441 | (void)thread_block(THREAD_CONTINUE_NULL); | |
2442 | loopcnt++; | |
2443 | } | |
2444 | } | |
2445 | /* start again */ | |
2446 | goto doit; | |
2447 | ||
2448 | (void) thread_funnel_set(kernel_flock, funnel_state); | |
2449 | } | |
2450 | ||
2451 | ||
2452 | static int | |
2453 | brecover_data(struct buf *bp) | |
2454 | { | |
2455 | upl_t upl; | |
2456 | upl_page_info_t *pl; | |
2457 | int upl_offset; | |
2458 | kern_return_t kret; | |
2459 | struct vnode *vp = bp->b_vp; | |
2460 | ||
2461 | if (vp->v_tag == VT_NFS) | |
2462 | /* | |
2463 | * NFS currently deals with this case | |
2464 | * in a slightly different manner... | |
2465 | * continue to let it do so | |
2466 | */ | |
2467 | return(1); | |
2468 | ||
2469 | if (!UBCISVALID(vp) || bp->b_bufsize == 0) | |
2470 | goto dump_buffer; | |
2471 | ||
2472 | kret = ubc_create_upl(vp, | |
2473 | ubc_blktooff(vp, bp->b_lblkno), | |
2474 | bp->b_bufsize, | |
2475 | &upl, | |
2476 | &pl, | |
2477 | UPL_PRECIOUS); | |
2478 | if (kret != KERN_SUCCESS) | |
2479 | panic("Failed to get pagelists"); | |
2480 | ||
2481 | for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) { | |
2482 | ||
2483 | if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) { | |
2484 | ubc_upl_abort(upl, 0); | |
2485 | goto dump_buffer; | |
2486 | } | |
2487 | } | |
2488 | SET(bp->b_flags, B_PAGELIST); | |
2489 | bp->b_pagelist = upl; | |
2490 | ||
2491 | kret = ubc_upl_map(upl, (vm_address_t *)&(bp->b_data)); | |
2492 | if (kret != KERN_SUCCESS) | |
2493 | panic("getblk: ubc_upl_map() failed with (%d)", kret); | |
2494 | if (bp->b_data == 0) | |
2495 | panic("ubc_upl_map mapped 0"); | |
2496 | ||
2497 | return (1); | |
2498 | ||
2499 | dump_buffer: | |
2500 | bp->b_bufsize = 0; | |
2501 | SET(bp->b_flags, B_INVAL); | |
2502 | brelse(bp); | |
2503 | ||
2504 | return(0); | |
2505 | } | |
2506 | ||
2507 | ||
2508 | static int | |
2509 | bp_cmp(void *a, void *b) | |
2510 | { | |
2511 | struct buf *bp_a = *(struct buf **)a, | |
2512 | *bp_b = *(struct buf **)b; | |
2513 | daddr_t res; | |
2514 | ||
2515 | // don't have to worry about negative block | |
2516 | // numbers so this is ok to do. | |
2517 | // | |
2518 | res = (bp_a->b_blkno - bp_b->b_blkno); | |
2519 | ||
2520 | return (int)res; | |
2521 | } | |
2522 | ||
2523 | #define NFLUSH 32 | |
2524 | ||
2525 | int | |
2526 | bflushq(int whichq, struct mount *mp) | |
2527 | { | |
2528 | struct buf *bp, *next; | |
2529 | int i, buf_count, s; | |
2530 | int total_writes = 0; | |
2531 | static struct buf *flush_table[NFLUSH]; | |
2532 | ||
2533 | if (whichq < 0 || whichq >= BQUEUES) { | |
2534 | return (0); | |
2535 | } | |
2536 | ||
2537 | ||
2538 | restart: | |
2539 | bp = TAILQ_FIRST(&bufqueues[whichq]); | |
2540 | for(buf_count=0; bp; bp=next) { | |
2541 | next = bp->b_freelist.tqe_next; | |
2542 | ||
2543 | if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) { | |
2544 | continue; | |
2545 | } | |
2546 | ||
2547 | if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) { | |
2548 | if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) { | |
2549 | panic("bflushq: bp @ 0x%x is locked!\n", bp); | |
2550 | } | |
2551 | ||
2552 | bremfree(bp); | |
2553 | bp->b_flags |= B_BUSY; | |
2554 | flush_table[buf_count] = bp; | |
2555 | buf_count++; | |
2556 | total_writes++; | |
2557 | ||
2558 | if (buf_count >= NFLUSH) { | |
2559 | qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); | |
2560 | ||
2561 | for(i=0; i < buf_count; i++) { | |
2562 | bawrite(flush_table[i]); | |
2563 | } | |
2564 | ||
2565 | goto restart; | |
2566 | } | |
2567 | } | |
2568 | } | |
2569 | ||
2570 | if (buf_count > 0) { | |
2571 | qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp); | |
2572 | for(i=0; i < buf_count; i++) { | |
2573 | bawrite(flush_table[i]); | |
2574 | } | |
2575 | } | |
2576 | ||
2577 | return (total_writes); | |
2578 | } |