git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	23	/*
	24	* Copyright (c) 1993
	25	* The Regents of the University of California. All rights reserved.
	26	*
	27	* Redistribution and use in source and binary forms, with or without
	28	* modification, are permitted provided that the following conditions
	29	* are met:
	30	* 1. Redistributions of source code must retain the above copyright
	31	* notice, this list of conditions and the following disclaimer.
	32	* 2. Redistributions in binary form must reproduce the above copyright
	33	* notice, this list of conditions and the following disclaimer in the
	34	* documentation and/or other materials provided with the distribution.
	35	* 3. All advertising materials mentioning features or use of this software
	36	* must display the following acknowledgement:
	37	* This product includes software developed by the University of
	38	* California, Berkeley and its contributors.
	39	* 4. Neither the name of the University nor the names of its contributors
	40	* may be used to endorse or promote products derived from this software
	41	* without specific prior written permission.
	42	*
	43	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	44	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	45	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	46	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	47	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	48	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	49	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	50	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	51	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	52	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	53	* SUCH DAMAGE.
	54	*
	55	* @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
	56	*/
	57
	58	#include <sys/param.h>
	59	#include <sys/proc_internal.h>
	60	#include <sys/buf_internal.h>
	61	#include <sys/mount_internal.h>
	62	#include <sys/vnode_internal.h>
	63	#include <sys/trace.h>
	64	#include <sys/malloc.h>
	65	#include <sys/time.h>
	66	#include <sys/kernel.h>
	67	#include <sys/resourcevar.h>
	68	#include <sys/uio_internal.h>
	69	#include <libkern/libkern.h>
	70	#include <machine/machine_routines.h>
	71
	72	#include <sys/ubc_internal.h>
	73
	74	#include <mach/mach_types.h>
	75	#include <mach/memory_object_types.h>
	76	#include <mach/vm_map.h>
	77	#include <mach/upl.h>
	78
	79	#include <vm/vm_kern.h>
	80	#include <vm/vm_map.h>
	81	#include <vm/vm_pageout.h>
	82
	83	#include <sys/kdebug.h>
	84
	85
	86	#define CL_READ 0x01
	87	#define CL_ASYNC 0x02
	88	#define CL_COMMIT 0x04
	89	#define CL_PAGEOUT 0x10
	90	#define CL_AGE 0x20
	91	#define CL_DUMP 0x40
	92	#define CL_NOZERO 0x80
	93	#define CL_PAGEIN 0x100
	94	#define CL_DEV_MEMORY 0x200
	95	#define CL_PRESERVE 0x400
	96	#define CL_THROTTLE 0x800
	97	#define CL_KEEPCACHED 0x1000
	98
	99
	100	struct clios {
	101	u_int io_completed; /* amount of io that has currently completed */
	102	u_int io_issued; /* amount of io that was successfully issued */
	103	int io_error; /* error code of first error encountered */
	104	int io_wanted; /* someone is sleeping waiting for a change in state */
	105	};
	106
	107	static lck_grp_t *cl_mtx_grp;
	108	static lck_attr_t *cl_mtx_attr;
	109	static lck_grp_attr_t *cl_mtx_grp_attr;
	110	static lck_mtx_t *cl_mtxp;
	111
	112
	113	static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	114	int flags, buf_t real_bp, struct clios *iostate);
	115	static int cluster_iodone(buf_t bp, void *dummy);
	116	static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
	117	static int cluster_hard_throttle_on(vnode_t vp);
	118
	119	static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
	120	static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
	121	off_t headOff, off_t tailOff, int flags);
	122	static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
	123	static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
	124	static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
	125	static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
	126	static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
	127
	128	static void cluster_rd_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead ra);
	129
	130	static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
	131	static void cluster_push_EOF(vnode_t vp, off_t EOF);
	132
	133	static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
	134
	135	static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
	136	static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
	137	static void sparse_cluster_add(struct cl_writebehind , vnode_t vp, struct cl_extent , off_t EOF);
	138
	139	static kern_return_t vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, int setcountp);
	140	static kern_return_t vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp);
	141	static kern_return_t vfs_drt_control(void **cmapp, int op_type);
	142
	143	int is_file_clean(vnode_t, off_t);
	144
	145	/*
	146	* throttle the number of async writes that
	147	* can be outstanding on a single vnode
	148	* before we issue a synchronous write
	149	*/
	150	#define HARD_THROTTLE_MAXCNT 0
	151	#define HARD_THROTTLE_MAXSIZE (64 * 1024)
	152
	153	int hard_throttle_on_root = 0;
	154	struct timeval priority_IO_timestamp_for_root;
	155
	156
	157	void
	158	cluster_init(void) {
	159	/*
	160	* allocate lock group attribute and group
	161	*/
	162	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	163	//lck_grp_attr_setstat(cl_mtx_grp_attr);
	164	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
	165
	166	/*
	167	* allocate the lock attribute
	168	*/
	169	cl_mtx_attr = lck_attr_alloc_init();
	170	//lck_attr_setdebug(clf_mtx_attr);
	171
	172	/*
	173	* allocate and initialize mutex's used to protect updates and waits
	174	* on the cluster_io context
	175	*/
	176	cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
	177
	178	if (cl_mtxp == NULL)
	179	panic("cluster_init: failed to allocate cl_mtxp");
	180	}
	181
	182
	183
	184	#define CLW_ALLOCATE 0x01
	185	#define CLW_RETURNLOCKED 0x02
	186	/*
	187	* if the read ahead context doesn't yet exist,
	188	* allocate and initialize it...
	189	* the vnode lock serializes multiple callers
	190	* during the actual assignment... first one
	191	* to grab the lock wins... the other callers
	192	* will release the now unnecessary storage
	193	*
	194	* once the context is present, try to grab (but don't block on)
	195	* the lock associated with it... if someone
	196	* else currently owns it, than the read
	197	* will run without read-ahead. this allows
	198	* multiple readers to run in parallel and
	199	* since there's only 1 read ahead context,
	200	* there's no real loss in only allowing 1
	201	* reader to have read-ahead enabled.
	202	*/
	203	static struct cl_readahead *
	204	cluster_get_rap(vnode_t vp)
	205	{
	206	struct ubc_info *ubc;
	207	struct cl_readahead *rap;
	208
	209	ubc = vp->v_ubcinfo;
	210
	211	if ((rap = ubc->cl_rahead) == NULL) {
	212	MALLOC_ZONE(rap, struct cl_readahead , sizeof rap, M_CLRDAHEAD, M_WAITOK);
	213
	214	bzero(rap, sizeof *rap);
	215	rap->cl_lastr = -1;
	216	lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
	217
	218	vnode_lock(vp);
	219
	220	if (ubc->cl_rahead == NULL)
	221	ubc->cl_rahead = rap;
	222	else {
	223	lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
	224	FREE_ZONE((void )rap, sizeof rap, M_CLRDAHEAD);
	225	rap = ubc->cl_rahead;
	226	}
	227	vnode_unlock(vp);
	228	}
	229	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
	230	return(rap);
	231
	232	return ((struct cl_readahead *)NULL);
	233	}
	234
	235
	236	/*
	237	* if the write behind context doesn't yet exist,
	238	* and CLW_ALLOCATE is specified, allocate and initialize it...
	239	* the vnode lock serializes multiple callers
	240	* during the actual assignment... first one
	241	* to grab the lock wins... the other callers
	242	* will release the now unnecessary storage
	243	*
	244	* if CLW_RETURNLOCKED is set, grab (blocking if necessary)
	245	* the lock associated with the write behind context before
	246	* returning
	247	*/
	248
	249	static struct cl_writebehind *
	250	cluster_get_wbp(vnode_t vp, int flags)
	251	{
	252	struct ubc_info *ubc;
	253	struct cl_writebehind *wbp;
	254
	255	ubc = vp->v_ubcinfo;
	256
	257	if ((wbp = ubc->cl_wbehind) == NULL) {
	258
	259	if ( !(flags & CLW_ALLOCATE))
	260	return ((struct cl_writebehind *)NULL);
	261
	262	MALLOC_ZONE(wbp, struct cl_writebehind , sizeof wbp, M_CLWRBEHIND, M_WAITOK);
	263
	264	bzero(wbp, sizeof *wbp);
	265	lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
	266
	267	vnode_lock(vp);
	268
	269	if (ubc->cl_wbehind == NULL)
	270	ubc->cl_wbehind = wbp;
	271	else {
	272	lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
	273	FREE_ZONE((void )wbp, sizeof wbp, M_CLWRBEHIND);
	274	wbp = ubc->cl_wbehind;
	275	}
	276	vnode_unlock(vp);
	277	}
	278	if (flags & CLW_RETURNLOCKED)
	279	lck_mtx_lock(&wbp->cl_lockw);
	280
	281	return (wbp);
	282	}
	283
	284
	285	static int
	286	cluster_hard_throttle_on(vnode_t vp)
	287	{
	288	static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
	289
	290	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
	291	struct timeval elapsed;
	292
	293	if (hard_throttle_on_root)
	294	return(1);
	295
	296	microuptime(&elapsed);
	297	timevalsub(&elapsed, &priority_IO_timestamp_for_root);
	298
	299	if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
	300	return(1);
	301	}
	302	return(0);
	303	}
	304
	305
	306	static int
	307	cluster_iodone(buf_t bp, __unused void *dummy)
	308	{
	309	int b_flags;
	310	int error;
	311	int total_size;
	312	int total_resid;
	313	int upl_offset;
	314	int zero_offset;
	315	upl_t upl;
	316	buf_t cbp;
	317	buf_t cbp_head;
	318	buf_t cbp_next;
	319	buf_t real_bp;
	320	struct clios *iostate;
	321	int commit_size;
	322	int pg_offset;
	323
	324	cbp_head = (buf_t)(bp->b_trans_head);
	325
	326	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_START,
	327	(int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	328
	329	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	330	/*
	331	* all I/O requests that are part of this transaction
	332	* have to complete before we can process it
	333	*/
	334	if ( !(cbp->b_flags & B_DONE)) {
	335
	336	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	337	(int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
	338
	339	return 0;
	340	}
	341	}
	342	error = 0;
	343	total_size = 0;
	344	total_resid = 0;
	345
	346	cbp = cbp_head;
	347	upl_offset = cbp->b_uploffset;
	348	upl = cbp->b_upl;
	349	b_flags = cbp->b_flags;
	350	real_bp = cbp->b_real_bp;
	351	zero_offset= cbp->b_validend;
	352	iostate = (struct clios *)cbp->b_iostate;
	353
	354	if (real_bp)
	355	real_bp->b_dev = cbp->b_dev;
	356
	357	while (cbp) {
	358	if ((cbp->b_flags & B_ERROR) && error == 0)
	359	error = cbp->b_error;
	360
	361	total_resid += cbp->b_resid;
	362	total_size += cbp->b_bcount;
	363
	364	cbp_next = cbp->b_trans_next;
	365
	366	free_io_buf(cbp);
	367
	368	cbp = cbp_next;
	369	}
	370	if (zero_offset)
	371	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	372
	373	if (iostate) {
	374	int need_wakeup = 0;
	375
	376	/*
	377	* someone has issued multiple I/Os asynchrounsly
	378	* and is waiting for them to complete (streaming)
	379	*/
	380	lck_mtx_lock(cl_mtxp);
	381
	382	if (error && iostate->io_error == 0)
	383	iostate->io_error = error;
	384
	385	iostate->io_completed += total_size;
	386
	387	if (iostate->io_wanted) {
	388	/*
	389	* someone is waiting for the state of
	390	* this io stream to change
	391	*/
	392	iostate->io_wanted = 0;
	393	need_wakeup = 1;
	394	}
	395	lck_mtx_unlock(cl_mtxp);
	396
	397	if (need_wakeup)
	398	wakeup((caddr_t)&iostate->io_wanted);
	399	}
	400	if ((b_flags & B_NEED_IODONE) && real_bp) {
	401	if (error) {
	402	real_bp->b_flags \|= B_ERROR;
	403	real_bp->b_error = error;
	404	}
	405	real_bp->b_resid = total_resid;
	406
	407	buf_biodone(real_bp);
	408	}
	409	if (error == 0 && total_resid)
	410	error = EIO;
	411
	412	if (b_flags & B_COMMIT_UPL) {
	413	pg_offset = upl_offset & PAGE_MASK;
	414	commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	415
	416	if (error \|\| (b_flags & B_NOCACHE)) {
	417	int upl_abort_code;
	418	int page_in = 0;
	419	int page_out = 0;
	420
	421	if (b_flags & B_PAGEIO) {
	422	if (b_flags & B_READ)
	423	page_in = 1;
	424	else
	425	page_out = 1;
	426	}
	427	if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */
	428	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	429	else if (page_out && (error != ENXIO)) /* transient error */
	430	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	431	else if (page_in)
	432	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR;
	433	else
	434	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	435
	436	ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
	437	upl_abort_code);
	438
	439	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	440	(int)upl, upl_offset - pg_offset, commit_size,
	441	0x80000000\|upl_abort_code, 0);
	442
	443	} else {
	444	int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
	445
	446	if ((b_flags & B_PHYS) && (b_flags & B_READ))
	447	upl_commit_flags \|= UPL_COMMIT_SET_DIRTY;
	448
	449	if (b_flags & B_AGE)
	450	upl_commit_flags \|= UPL_COMMIT_INACTIVATE;
	451
	452	ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
	453	upl_commit_flags);
	454
	455	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	456	(int)upl, upl_offset - pg_offset, commit_size,
	457	upl_commit_flags, 0);
	458	}
	459	} else {
	460	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	461	(int)upl, upl_offset, 0, error, 0);
	462	}
	463
	464	return (error);
	465	}
	466
	467
	468	void
	469	cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
	470	{
	471	upl_page_info_t *pl;
	472
	473	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_START,
	474	upl_offset, size, (int)bp, 0, 0);
	475
	476	if (bp == NULL \|\| bp->b_datap == 0) {
	477
	478	pl = ubc_upl_pageinfo(upl);
	479
	480	while (size) {
	481	int page_offset;
	482	int page_index;
	483	addr64_t zero_addr;
	484	int zero_cnt;
	485
	486	page_index = upl_offset / PAGE_SIZE;
	487	page_offset = upl_offset & PAGE_MASK;
	488
	489	zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
	490	zero_cnt = min(PAGE_SIZE - page_offset, size);
	491
	492	bzero_phys(zero_addr, zero_cnt);
	493
	494	size -= zero_cnt;
	495	upl_offset += zero_cnt;
	496	}
	497	} else
	498	bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
	499
	500	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_END,
	501	upl_offset, size, 0, 0, 0);
	502	}
	503
	504
	505	static int
	506	cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	507	int flags, buf_t real_bp, struct clios *iostate)
	508	{
	509	buf_t cbp;
	510	u_int size;
	511	u_int io_size;
	512	int io_flags;
	513	int bmap_flags;
	514	int error = 0;
	515	int retval = 0;
	516	buf_t cbp_head = NULL;
	517	buf_t cbp_tail = NULL;
	518	int trans_count = 0;
	519	u_int pg_count;
	520	int pg_offset;
	521	u_int max_iosize;
	522	u_int max_vectors;
	523	int priv;
	524	int zero_offset = 0;
	525	int async_throttle = 0;
	526	mount_t mp;
	527
	528	mp = vp->v_mount;
	529
	530	if (mp->mnt_devblocksize > 1) {
	531	/*
	532	* round the requested size up so that this I/O ends on a
	533	* page boundary in case this is a 'write'... if the filesystem
	534	* has blocks allocated to back the page beyond the EOF, we want to
	535	* make sure to write out the zero's that are sitting beyond the EOF
	536	* so that in case the filesystem doesn't explicitly zero this area
	537	* if a hole is created via a lseek/write beyond the current EOF,
	538	* it will return zeros when it's read back from the disk. If the
	539	* physical allocation doesn't extend for the whole page, we'll
	540	* only write/read from the disk up to the end of this allocation
	541	* via the extent info returned from the VNOP_BLOCKMAP call.
	542	*/
	543	pg_offset = upl_offset & PAGE_MASK;
	544
	545	size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	546	} else {
	547	/*
	548	* anyone advertising a blocksize of 1 byte probably
	549	* can't deal with us rounding up the request size
	550	* AFP is one such filesystem/device
	551	*/
	552	size = non_rounded_size;
	553	}
	554	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_START,
	555	(int)f_offset, size, upl_offset, flags, 0);
	556
	557	if (flags & CL_READ) {
	558	io_flags = (B_READ);
	559	bmap_flags = VNODE_READ;
	560
	561	max_iosize = mp->mnt_maxreadcnt;
	562	max_vectors = mp->mnt_segreadcnt;
	563	} else {
	564	io_flags = 0;
	565	bmap_flags = VNODE_WRITE;
	566
	567	max_iosize = mp->mnt_maxwritecnt;
	568	max_vectors = mp->mnt_segwritecnt;
	569	}
	570	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
	571
	572	/*
	573	* make sure the maximum iosize is a
	574	* multiple of the page size
	575	*/
	576	max_iosize &= ~PAGE_MASK;
	577
	578	if (flags & CL_THROTTLE) {
	579	if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
	580	if (max_iosize > HARD_THROTTLE_MAXSIZE)
	581	max_iosize = HARD_THROTTLE_MAXSIZE;
	582	async_throttle = HARD_THROTTLE_MAXCNT;
	583	} else
	584	async_throttle = VNODE_ASYNC_THROTTLE;
	585	}
	586	if (flags & CL_AGE)
	587	io_flags \|= B_AGE;
	588	if (flags & CL_DUMP)
	589	io_flags \|= B_NOCACHE;
	590	if (flags & (CL_PAGEIN \| CL_PAGEOUT))
	591	io_flags \|= B_PAGEIO;
	592	if (flags & CL_COMMIT)
	593	io_flags \|= B_COMMIT_UPL;
	594	if (flags & CL_PRESERVE)
	595	io_flags \|= B_PHYS;
	596	if (flags & CL_KEEPCACHED)
	597	io_flags \|= B_CACHE;
	598
	599	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
	600	/*
	601	* then we are going to end up
	602	* with a page that we can't complete (the file size wasn't a multiple
	603	* of PAGE_SIZE and we're trying to read to the end of the file
	604	* so we'll go ahead and zero out the portion of the page we can't
	605	* read in from the file
	606	*/
	607	zero_offset = upl_offset + non_rounded_size;
	608	}
	609	while (size) {
	610	int pg_resid;
	611	daddr64_t blkno;
	612	daddr64_t lblkno;
	613
	614	if (size > max_iosize)
	615	io_size = max_iosize;
	616	else
	617	io_size = size;
	618
	619	if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
	620	break;
	621	}
	622	if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
	623	real_bp->b_blkno = blkno;
	624
	625	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) \| DBG_FUNC_NONE,
	626	(int)f_offset, (int)blkno, io_size, zero_offset, 0);
	627
	628	if (io_size == 0) {
	629	/*
	630	* vnop_blockmap didn't return an error... however, it did
	631	* return an extent size of 0 which means we can't
	632	* make forward progress on this I/O... a hole in the
	633	* file would be returned as a blkno of -1 with a non-zero io_size
	634	* a real extent is returned with a blkno != -1 and a non-zero io_size
	635	*/
	636	error = EINVAL;
	637	break;
	638	}
	639	if ( !(flags & CL_READ) && blkno == -1) {
	640	off_t e_offset;
	641
	642	/*
	643	* we're writing into a 'hole'
	644	*/
	645	if (flags & CL_PAGEOUT) {
	646	/*
	647	* if we got here via cluster_pageout
	648	* then just error the request and return
	649	* the 'hole' should already have been covered
	650	*/
	651	error = EINVAL;
	652	break;
	653	}
	654	if ( !(flags & CL_COMMIT)) {
	655	/*
	656	* currently writes always request the commit to happen
	657	* as part of the io completion... however, if the CL_COMMIT
	658	* flag isn't specified, than we can't issue the abort_range
	659	* since the call site is going to abort or commit the same upl..
	660	* in this case we can only return an error
	661	*/
	662	error = EINVAL;
	663	break;
	664	}
	665	/*
	666	* we can get here if the cluster code happens to
	667	* pick up a page that was dirtied via mmap vs
	668	* a 'write' and the page targets a 'hole'...
	669	* i.e. the writes to the cluster were sparse
	670	* and the file was being written for the first time
	671	*
	672	* we can also get here if the filesystem supports
	673	* 'holes' that are less than PAGE_SIZE.... because
	674	* we can't know if the range in the page that covers
	675	* the 'hole' has been dirtied via an mmap or not,
	676	* we have to assume the worst and try to push the
	677	* entire page to storage.
	678	*
	679	* Try paging out the page individually before
	680	* giving up entirely and dumping it (the pageout
	681	* path will insure that the zero extent accounting
	682	* has been taken care of before we get back into cluster_io)
	683	*/
	684	ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	685
	686	e_offset = round_page_64(f_offset + 1);
	687
	688	if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
	689	error = EINVAL;
	690	break;
	691	}
	692	io_size = e_offset - f_offset;
	693
	694	f_offset += io_size;
	695	upl_offset += io_size;
	696
	697	if (size >= io_size)
	698	size -= io_size;
	699	else
	700	size = 0;
	701	/*
	702	* keep track of how much of the original request
	703	* that we've actually completed... non_rounded_size
	704	* may go negative due to us rounding the request
	705	* to a page size multiple (i.e. size > non_rounded_size)
	706	*/
	707	non_rounded_size -= io_size;
	708
	709	if (non_rounded_size <= 0) {
	710	/*
	711	* we've transferred all of the data in the original
	712	* request, but we were unable to complete the tail
	713	* of the last page because the file didn't have
	714	* an allocation to back that portion... this is ok.
	715	*/
	716	size = 0;
	717	}
	718	continue;
	719	}
	720	lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
	721	/*
	722	* we have now figured out how much I/O we can do - this is in 'io_size'
	723	* pg_offset is the starting point in the first page for the I/O
	724	* pg_count is the number of full and partial pages that 'io_size' encompasses
	725	*/
	726	pg_offset = upl_offset & PAGE_MASK;
	727
	728	if (flags & CL_DEV_MEMORY) {
	729	/*
	730	* currently, can't deal with reading 'holes' in file
	731	*/
	732	if (blkno == -1) {
	733	error = EINVAL;
	734	break;
	735	}
	736	/*
	737	* treat physical requests as one 'giant' page
	738	*/
	739	pg_count = 1;
	740	} else
	741	pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
	742
	743	if ((flags & CL_READ) && blkno == -1) {
	744	int bytes_to_zero;
	745
	746	/*
	747	* if we're reading and blkno == -1, then we've got a
	748	* 'hole' in the file that we need to deal with by zeroing
	749	* out the affected area in the upl
	750	*/
	751	if (zero_offset && io_size == size) {
	752	/*
	753	* if this upl contains the EOF and it is not a multiple of PAGE_SIZE
	754	* than 'zero_offset' will be non-zero
	755	* if the 'hole' returned by vnop_blockmap extends all the way to the eof
	756	* (indicated by the io_size finishing off the I/O request for this UPL)
	757	* than we're not going to issue an I/O for the
	758	* last page in this upl... we need to zero both the hole and the tail
	759	* of the page beyond the EOF, since the delayed zero-fill won't kick in
	760	*/
	761	bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
	762
	763	zero_offset = 0;
	764	} else
	765	bytes_to_zero = io_size;
	766
	767	cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
	768
	769	if (cbp_head)
	770	/*
	771	* if there is a current I/O chain pending
	772	* then the first page of the group we just zero'd
	773	* will be handled by the I/O completion if the zero
	774	* fill started in the middle of the page
	775	*/
	776	pg_count = (io_size - pg_offset) / PAGE_SIZE;
	777	else {
	778	/*
	779	* no pending I/O to pick up that first page
	780	* so, we have to make sure it gets committed
	781	* here.
	782	* set the pg_offset to 0 so that the upl_commit_range
	783	* starts with this page
	784	*/
	785	pg_count = (io_size + pg_offset) / PAGE_SIZE;
	786	pg_offset = 0;
	787	}
	788	if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
	789	/*
	790	* if we're done with the request for this UPL
	791	* then we have to make sure to commit the last page
	792	* even if we only partially zero-filled it
	793	*/
	794	pg_count++;
	795
	796	if (pg_count) {
	797	if (pg_offset)
	798	pg_resid = PAGE_SIZE - pg_offset;
	799	else
	800	pg_resid = 0;
	801
	802	if (flags & CL_COMMIT)
	803	ubc_upl_commit_range(upl,
	804	(upl_offset + pg_resid) & ~PAGE_MASK,
	805	pg_count * PAGE_SIZE,
	806	UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY);
	807	}
	808	upl_offset += io_size;
	809	f_offset += io_size;
	810	size -= io_size;
	811	/*
	812	* keep track of how much of the original request
	813	* that we've actually completed... non_rounded_size
	814	* may go negative due to us rounding the request
	815	* to a page size multiple (i.e. size > non_rounded_size)
	816	*/
	817	non_rounded_size -= io_size;
	818
	819	if (non_rounded_size <= 0) {
	820	/*
	821	* we've transferred all of the data in the original
	822	* request, but we were unable to complete the tail
	823	* of the last page because the file didn't have
	824	* an allocation to back that portion... this is ok.
	825	*/
	826	size = 0;
	827	}
	828	if (cbp_head && pg_count)
	829	goto start_io;
	830	continue;
	831
	832	}
	833	if (pg_count > max_vectors) {
	834	if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
	835	io_size = PAGE_SIZE - pg_offset;
	836	pg_count = 1;
	837	} else {
	838	io_size -= (pg_count - max_vectors) * PAGE_SIZE;
	839	pg_count = max_vectors;
	840	}
	841	}
	842
	843	if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
	844	/*
	845	* if we're not targeting a virtual device i.e. a disk image
	846	* it's safe to dip into the reserve pool since real devices
	847	* can complete this I/O request without requiring additional
	848	* bufs from the alloc_io_buf pool
	849	*/
	850	priv = 1;
	851	else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
	852	/*
	853	* Throttle the speculative IO
	854	*/
	855	priv = 0;
	856	else
	857	priv = 1;
	858
	859	cbp = alloc_io_buf(vp, priv);
	860
	861	if (flags & CL_PAGEOUT) {
	862	u_int i;
	863
	864	for (i = 0; i < pg_count; i++) {
	865	if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
	866	panic("BUSY bp found in cluster_io");
	867	}
	868	}
	869	if (flags & CL_ASYNC) {
	870	if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
	871	panic("buf_setcallback failed\n");
	872	}
	873	cbp->b_flags \|= io_flags;
	874
	875	cbp->b_lblkno = lblkno;
	876	cbp->b_blkno = blkno;
	877	cbp->b_bcount = io_size;
	878
	879	if (buf_setupl(cbp, upl, upl_offset))
	880	panic("buf_setupl failed\n");
	881
	882	cbp->b_trans_next = (buf_t)NULL;
	883
	884	if ((cbp->b_iostate = (void *)iostate))
	885	/*
	886	* caller wants to track the state of this
	887	* io... bump the amount issued against this stream
	888	*/
	889	iostate->io_issued += io_size;
	890
	891	if (flags & CL_READ) {
	892	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) \| DBG_FUNC_NONE,
	893	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	894	}
	895	else {
	896	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) \| DBG_FUNC_NONE,
	897	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	898	}
	899
	900	if (cbp_head) {
	901	cbp_tail->b_trans_next = cbp;
	902	cbp_tail = cbp;
	903	} else {
	904	cbp_head = cbp;
	905	cbp_tail = cbp;
	906	}
	907	(buf_t)(cbp->b_trans_head) = cbp_head;
	908	trans_count++;
	909
	910	upl_offset += io_size;
	911	f_offset += io_size;
	912	size -= io_size;
	913	/*
	914	* keep track of how much of the original request
	915	* that we've actually completed... non_rounded_size
	916	* may go negative due to us rounding the request
	917	* to a page size multiple (i.e. size > non_rounded_size)
	918	*/
	919	non_rounded_size -= io_size;
	920
	921	if (non_rounded_size <= 0) {
	922	/*
	923	* we've transferred all of the data in the original
	924	* request, but we were unable to complete the tail
	925	* of the last page because the file didn't have
	926	* an allocation to back that portion... this is ok.
	927	*/
	928	size = 0;
	929	}
	930	if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) \|\| trans_count > 8)) \|\| size == 0) {
	931	/*
	932	* if we have no more I/O to issue or
	933	* the current I/O we've prepared fully
	934	* completes the last page in this request
	935	* and it's either an ASYNC request or
	936	* we've already accumulated more than 8 I/O's into
	937	* this transaction and it's not an I/O directed to
	938	* special DEVICE memory
	939	* then go ahead and issue the I/O
	940	*/
	941	start_io:
	942	if (real_bp) {
	943	cbp_head->b_flags \|= B_NEED_IODONE;
	944	cbp_head->b_real_bp = real_bp;
	945	} else
	946	cbp_head->b_real_bp = (buf_t)NULL;
	947
	948	if (size == 0) {
	949	/*
	950	* we're about to issue the last I/O for this upl
	951	* if this was a read to the eof and the eof doesn't
	952	* finish on a page boundary, than we need to zero-fill
	953	* the rest of the page....
	954	*/
	955	cbp_head->b_validend = zero_offset;
	956	} else
	957	cbp_head->b_validend = 0;
	958
	959	if (flags & CL_THROTTLE)
	960	(void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
	961
	962	for (cbp = cbp_head; cbp;) {
	963	buf_t cbp_next;
	964
	965	if ( !(io_flags & B_READ))
	966	vnode_startwrite(vp);
	967
	968	cbp_next = cbp->b_trans_next;
	969
	970	(void) VNOP_STRATEGY(cbp);
	971	cbp = cbp_next;
	972	}
	973	if ( !(flags & CL_ASYNC)) {
	974	int dummy;
	975
	976	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
	977	buf_biowait(cbp);
	978
	979	if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
	980	if (((flags & (CL_PAGEOUT \| CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
	981	error = 0; /* drop the error */
	982	else {
	983	if (retval == 0)
	984	retval = error;
	985	error = 0;
	986	}
	987	}
	988	}
	989	cbp_head = (buf_t)NULL;
	990	cbp_tail = (buf_t)NULL;
	991
	992	trans_count = 0;
	993	}
	994	}
	995	if (error) {
	996	int abort_size;
	997
	998	io_size = 0;
	999
	1000	for (cbp = cbp_head; cbp;) {
	1001	buf_t cbp_next;
	1002
	1003	upl_offset -= cbp->b_bcount;
	1004	size += cbp->b_bcount;
	1005	io_size += cbp->b_bcount;
	1006
	1007	cbp_next = cbp->b_trans_next;
	1008	free_io_buf(cbp);
	1009	cbp = cbp_next;
	1010	}
	1011	if (iostate) {
	1012	int need_wakeup = 0;
	1013
	1014	/*
	1015	* update the error condition for this stream
	1016	* since we never really issued the io
	1017	* just go ahead and adjust it back
	1018	*/
	1019	lck_mtx_lock(cl_mtxp);
	1020
	1021	if (iostate->io_error == 0)
	1022	iostate->io_error = error;
	1023	iostate->io_issued -= io_size;
	1024
	1025	if (iostate->io_wanted) {
	1026	/*
	1027	* someone is waiting for the state of
	1028	* this io stream to change
	1029	*/
	1030	iostate->io_wanted = 0;
	1031	need_wakeup = 0;
	1032	}
	1033	lck_mtx_unlock(cl_mtxp);
	1034
	1035	if (need_wakeup)
	1036	wakeup((caddr_t)&iostate->io_wanted);
	1037	}
	1038	pg_offset = upl_offset & PAGE_MASK;
	1039	abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1040
	1041	if (flags & CL_COMMIT) {
	1042	int upl_abort_code;
	1043
	1044	if (flags & CL_PRESERVE) {
	1045	ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
	1046	UPL_COMMIT_FREE_ON_EMPTY);
	1047	} else {
	1048	if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
	1049	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	1050	else if (flags & CL_PAGEIN)
	1051	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR;
	1052	else
	1053	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	1054
	1055	ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
	1056	upl_abort_code);
	1057	}
	1058	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) \| DBG_FUNC_NONE,
	1059	(int)upl, upl_offset - pg_offset, abort_size, error, 0);
	1060	}
	1061	if (real_bp) {
	1062	real_bp->b_flags \|= B_ERROR;
	1063	real_bp->b_error = error;
	1064
	1065	buf_biodone(real_bp);
	1066	}
	1067	if (retval == 0)
	1068	retval = error;
	1069	}
	1070	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_END,
	1071	(int)f_offset, size, upl_offset, retval, 0);
	1072
	1073	return (retval);
	1074	}
	1075
	1076
	1077	static int
	1078	cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
	1079	{
	1080	int pages_in_prefetch;
	1081
	1082	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_START,
	1083	(int)f_offset, size, (int)filesize, 0, 0);
	1084
	1085	if (f_offset >= filesize) {
	1086	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	1087	(int)f_offset, 0, 0, 0, 0);
	1088	return(0);
	1089	}
	1090	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	1091	size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	1092	else
	1093	size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1094
	1095	if ((off_t)size > (filesize - f_offset))
	1096	size = filesize - f_offset;
	1097	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1098
	1099	advisory_read(vp, filesize, f_offset, size);
	1100
	1101	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	1102	(int)f_offset + size, pages_in_prefetch, 0, 1, 0);
	1103
	1104	return (pages_in_prefetch);
	1105	}
	1106
	1107
	1108
	1109	static void
	1110	cluster_rd_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead rap)
	1111	{
	1112	daddr64_t r_addr;
	1113	off_t f_offset;
	1114	int size_of_prefetch;
	1115
	1116
	1117	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_START,
	1118	(int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
	1119
	1120	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
	1121	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1122	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
	1123	return;
	1124	}
	1125	if (rap->cl_lastr == -1 \|\| (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
	1126	(extent->b_addr != (rap->cl_maxra + 1) \|\| rap->cl_ralen == 0))) {
	1127	rap->cl_ralen = 0;
	1128	rap->cl_maxra = 0;
	1129
	1130	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1131	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
	1132
	1133	return;
	1134	}
	1135	if (extent->e_addr < rap->cl_maxra) {
	1136	if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
	1137
	1138	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1139	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
	1140	return;
	1141	}
	1142	}
	1143	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	1144	f_offset = (off_t)(r_addr * PAGE_SIZE_64);
	1145
	1146	size_of_prefetch = 0;
	1147
	1148	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
	1149
	1150	if (size_of_prefetch) {
	1151	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1152	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
	1153	return;
	1154	}
	1155	if (f_offset < filesize) {
	1156	daddr64_t read_size;
	1157
	1158	rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
	1159
	1160	read_size = (extent->e_addr + 1) - extent->b_addr;
	1161
	1162	if (read_size > rap->cl_ralen) {
	1163	if (read_size > MAX_UPL_TRANSFER)
	1164	rap->cl_ralen = MAX_UPL_TRANSFER;
	1165	else
	1166	rap->cl_ralen = read_size;
	1167	}
	1168	size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
	1169
	1170	if (size_of_prefetch)
	1171	rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	1172	}
	1173	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1174	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
	1175	}
	1176
	1177	int
	1178	cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
	1179	int size, off_t filesize, int flags)
	1180	{
	1181	int io_size;
	1182	int rounded_size;
	1183	off_t max_size;
	1184	int local_flags;
	1185	struct cl_writebehind *wbp;
	1186
	1187	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
	1188	/*
	1189	* if we know we're issuing this I/O to a virtual device (i.e. disk image)
	1190	* then we don't want to enforce this throttle... if we do, we can
	1191	* potentially deadlock since we're stalling the pageout thread at a time
	1192	* when the disk image might need additional memory (which won't be available
	1193	* if the pageout thread can't run)... instead we'll just depend on the throttle
	1194	* that the pageout thread now has in place to deal with external files
	1195	*/
	1196	local_flags = CL_PAGEOUT;
	1197	else
	1198	local_flags = CL_PAGEOUT \| CL_THROTTLE;
	1199
	1200	if ((flags & UPL_IOSYNC) == 0)
	1201	local_flags \|= CL_ASYNC;
	1202	if ((flags & UPL_NOCOMMIT) == 0)
	1203	local_flags \|= CL_COMMIT;
	1204	if ((flags & UPL_KEEPCACHED))
	1205	local_flags \|= CL_KEEPCACHED;
	1206
	1207
	1208	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) \| DBG_FUNC_NONE,
	1209	(int)f_offset, size, (int)filesize, local_flags, 0);
	1210
	1211	/*
	1212	* If they didn't specify any I/O, then we are done...
	1213	* we can't issue an abort because we don't know how
	1214	* big the upl really is
	1215	*/
	1216	if (size <= 0)
	1217	return (EINVAL);
	1218
	1219	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
	1220	if (local_flags & CL_COMMIT)
	1221	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	1222	return (EROFS);
	1223	}
	1224	/*
	1225	* can't page-in from a negative offset
	1226	* or if we're starting beyond the EOF
	1227	* or if the file offset isn't page aligned
	1228	* or the size requested isn't a multiple of PAGE_SIZE
	1229	*/
	1230	if (f_offset < 0 \|\| f_offset >= filesize \|\|
	1231	(f_offset & PAGE_MASK_64) \|\| (size & PAGE_MASK)) {
	1232	if (local_flags & CL_COMMIT)
	1233	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	1234	return (EINVAL);
	1235	}
	1236	max_size = filesize - f_offset;
	1237
	1238	if (size < max_size)
	1239	io_size = size;
	1240	else
	1241	io_size = max_size;
	1242
	1243	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1244
	1245	if (size > rounded_size) {
	1246	if (local_flags & CL_COMMIT)
	1247	ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
	1248	UPL_ABORT_FREE_ON_EMPTY);
	1249	}
	1250	if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
	1251	wbp->cl_hasbeenpaged = 1;
	1252
	1253	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
	1254	local_flags, (buf_t)NULL, (struct clios *)NULL));
	1255	}
	1256
	1257	int
	1258	cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
	1259	int size, off_t filesize, int flags)
	1260	{
	1261	u_int io_size;
	1262	int rounded_size;
	1263	off_t max_size;
	1264	int retval;
	1265	int local_flags = 0;
	1266
	1267	if (upl == NULL \|\| size < 0)
	1268	panic("cluster_pagein: NULL upl passed in");
	1269
	1270	if ((flags & UPL_IOSYNC) == 0)
	1271	local_flags \|= CL_ASYNC;
	1272	if ((flags & UPL_NOCOMMIT) == 0)
	1273	local_flags \|= CL_COMMIT;
	1274
	1275
	1276	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) \| DBG_FUNC_NONE,
	1277	(int)f_offset, size, (int)filesize, local_flags, 0);
	1278
	1279	/*
	1280	* can't page-in from a negative offset
	1281	* or if we're starting beyond the EOF
	1282	* or if the file offset isn't page aligned
	1283	* or the size requested isn't a multiple of PAGE_SIZE
	1284	*/
	1285	if (f_offset < 0 \|\| f_offset >= filesize \|\|
	1286	(f_offset & PAGE_MASK_64) \|\| (size & PAGE_MASK) \|\| (upl_offset & PAGE_MASK)) {
	1287	if (local_flags & CL_COMMIT)
	1288	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR);
	1289	return (EINVAL);
	1290	}
	1291	max_size = filesize - f_offset;
	1292
	1293	if (size < max_size)
	1294	io_size = size;
	1295	else
	1296	io_size = max_size;
	1297
	1298	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1299
	1300	if (size > rounded_size && (local_flags & CL_COMMIT))
	1301	ubc_upl_abort_range(upl, upl_offset + rounded_size,
	1302	size - rounded_size, UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR);
	1303
	1304	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
	1305	local_flags \| CL_READ \| CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
	1306
	1307	if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
	1308	struct cl_readahead *rap;
	1309
	1310	rap = cluster_get_rap(vp);
	1311
	1312	if (rap != NULL) {
	1313	struct cl_extent extent;
	1314
	1315	extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
	1316	extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
	1317
	1318	if (rounded_size == PAGE_SIZE) {
	1319	/*
	1320	* we haven't read the last page in of the file yet
	1321	* so let's try to read ahead if we're in
	1322	* a sequential access pattern
	1323	*/
	1324	cluster_rd_ahead(vp, &extent, filesize, rap);
	1325	}
	1326	rap->cl_lastr = extent.e_addr;
	1327
	1328	lck_mtx_unlock(&rap->cl_lockr);
	1329	}
	1330	}
	1331	return (retval);
	1332	}
	1333
	1334	int
	1335	cluster_bp(buf_t bp)
	1336	{
	1337	off_t f_offset;
	1338	int flags;
	1339
	1340	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) \| DBG_FUNC_START,
	1341	(int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	1342
	1343	if (bp->b_flags & B_READ)
	1344	flags = CL_ASYNC \| CL_READ;
	1345	else
	1346	flags = CL_ASYNC;
	1347
	1348	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
	1349
	1350	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
	1351	}
	1352
	1353	int
	1354	cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
	1355	{
	1356	int prev_resid;
	1357	u_int clip_size;
	1358	off_t max_io_size;
	1359	int upl_size;
	1360	int upl_flags;
	1361	upl_t upl;
	1362	int retval = 0;
	1363	int flags;
	1364
	1365	flags = xflags;
	1366
	1367	if (vp->v_flag & VNOCACHE_DATA)
	1368	flags \|= IO_NOCACHE;
	1369
	1370	if ( (!(flags & IO_NOCACHE)) \|\| (!uio) \|\| (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
	1371	/*
	1372	* go do a write through the cache if one of the following is true....
	1373	* NOCACHE is not true
	1374	* there is no uio structure or it doesn't target USERSPACE
	1375	*/
	1376	return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
	1377	}
	1378
	1379	#if LP64_DEBUG
	1380	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
	1381	panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	1382	}
	1383	#endif /* LP64_DEBUG */
	1384
	1385	while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
	1386	user_size_t iov_len;
	1387	user_addr_t iov_base;
	1388
	1389	/*
	1390	* we know we have a resid, so this is safe
	1391	* skip over any emtpy vectors
	1392	*/
	1393	uio_update(uio, (user_size_t)0);
	1394
	1395	iov_len = uio_curriovlen(uio);
	1396	iov_base = uio_curriovbase(uio);
	1397
	1398	upl_size = PAGE_SIZE;
	1399	upl_flags = UPL_QUERY_OBJECT_TYPE;
	1400
	1401	// LP64todo - fix this!
	1402	if ((vm_map_get_upl(current_map(),
	1403	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	1404	&upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
	1405	/*
	1406	* the user app must have passed in an invalid address
	1407	*/
	1408	return (EFAULT);
	1409	}
	1410
	1411	/*
	1412	* We check every vector target but if it is physically
	1413	* contiguous space, we skip the sanity checks.
	1414	*/
	1415	if (upl_flags & UPL_PHYS_CONTIG) {
	1416	int zflags;
	1417
	1418	zflags = flags & ~IO_TAILZEROFILL;
	1419	zflags \|= IO_HEADZEROFILL;
	1420
	1421	if (flags & IO_HEADZEROFILL) {
	1422	/*
	1423	* in case we have additional vectors, we don't want to do this again
	1424	*/
	1425	flags &= ~IO_HEADZEROFILL;
	1426
	1427	if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
	1428	return(retval);
	1429	}
	1430	retval = cluster_phys_write(vp, uio, newEOF);
	1431
	1432	if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
	1433	return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
	1434	}
	1435	}
	1436	else if ((uio_resid(uio) < PAGE_SIZE) \|\| (flags & (IO_TAILZEROFILL \| IO_HEADZEROFILL))) {
	1437	/*
	1438	* we're here because we're don't have a physically contiguous target buffer
	1439	* go do a write through the cache if one of the following is true....
	1440	* the total xfer size is less than a page...
	1441	* we're being asked to ZEROFILL either the head or the tail of the I/O...
	1442	*/
	1443	return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
	1444	}
	1445	// LP64todo - fix this!
	1446	else if (((int)uio->uio_offset & PAGE_MASK) \|\| (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
	1447	if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
	1448	/*
	1449	* Bring the file offset write up to a pagesize boundary
	1450	* this will also bring the base address to a page boundary
	1451	* since they both are currently on the same offset within a page
	1452	* note: if we get here, uio->uio_resid is greater than PAGE_SIZE
	1453	* so the computed clip_size must always be less than the current uio_resid
	1454	*/
	1455	clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
	1456
	1457	/*
	1458	* Fake the resid going into the cluster_write_x call
	1459	* and restore it on the way out.
	1460	*/
	1461	// LP64todo - fix this
	1462	prev_resid = uio_resid(uio);
	1463	uio_setresid(uio, clip_size);
	1464
	1465	retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
	1466
	1467	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	1468	} else {
	1469	/*
	1470	* can't get both the file offset and the buffer offset aligned to a page boundary
	1471	* so fire an I/O through the cache for this entire vector
	1472	*/
	1473	// LP64todo - fix this
	1474	clip_size = iov_len;
	1475	// LP64todo - fix this
	1476	prev_resid = uio_resid(uio);
	1477	uio_setresid(uio, clip_size);
	1478
	1479	retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
	1480
	1481	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	1482	}
	1483	} else {
	1484	/*
	1485	* If we come in here, we know the offset into
	1486	* the file is on a pagesize boundary and the
	1487	* target buffer address is also on a page boundary
	1488	*/
	1489	max_io_size = newEOF - uio->uio_offset;
	1490	// LP64todo - fix this
	1491	clip_size = uio_resid(uio);
	1492	if (iov_len < clip_size)
	1493	// LP64todo - fix this!
	1494	clip_size = iov_len;
	1495	if (max_io_size < clip_size)
	1496	clip_size = max_io_size;
	1497
	1498	if (clip_size < PAGE_SIZE) {
	1499	/*
	1500	* Take care of tail end of write in this vector
	1501	*/
	1502	// LP64todo - fix this
	1503	prev_resid = uio_resid(uio);
	1504	uio_setresid(uio, clip_size);
	1505
	1506	retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
	1507
	1508	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	1509	} else {
	1510	/* round clip_size down to a multiple of pagesize */
	1511	clip_size = clip_size & ~(PAGE_MASK);
	1512	// LP64todo - fix this
	1513	prev_resid = uio_resid(uio);
	1514	uio_setresid(uio, clip_size);
	1515
	1516	retval = cluster_nocopy_write(vp, uio, newEOF);
	1517
	1518	if ((retval == 0) && uio_resid(uio))
	1519	retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
	1520
	1521	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	1522	}
	1523	} /* end else */
	1524	} /* end while */
	1525
	1526	return(retval);
	1527	}
	1528
	1529
	1530	static int
	1531	cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
	1532	{
	1533	upl_t upl;
	1534	upl_page_info_t *pl;
	1535	vm_offset_t upl_offset;
	1536	int io_size;
	1537	int io_flag;
	1538	int upl_size;
	1539	int upl_needed_size;
	1540	int pages_in_pl;
	1541	int upl_flags;
	1542	kern_return_t kret;
	1543	int i;
	1544	int force_data_sync;
	1545	int error = 0;
	1546	struct clios iostate;
	1547	struct cl_writebehind *wbp;
	1548
	1549
	1550	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_START,
	1551	(int)uio->uio_offset, (int)uio_resid(uio),
	1552	(int)newEOF, 0, 0);
	1553
	1554	/*
	1555	* When we enter this routine, we know
	1556	* -- the offset into the file is on a pagesize boundary
	1557	* -- the resid is a page multiple
	1558	* -- the resid will not exceed iov_len
	1559	*/
	1560
	1561	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
	1562
	1563	cluster_try_push(wbp, vp, newEOF, 0, 1);
	1564
	1565	lck_mtx_unlock(&wbp->cl_lockw);
	1566	}
	1567	iostate.io_completed = 0;
	1568	iostate.io_issued = 0;
	1569	iostate.io_error = 0;
	1570	iostate.io_wanted = 0;
	1571
	1572	while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
	1573	user_addr_t iov_base;
	1574
	1575	io_size = uio_resid(uio);
	1576
	1577	if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	1578	io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	1579
	1580	iov_base = uio_curriovbase(uio);
	1581
	1582	// LP64todo - fix this!
	1583	upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
	1584
	1585	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
	1586
	1587	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_START,
	1588	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	1589
	1590	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	1591	pages_in_pl = 0;
	1592	upl_size = upl_needed_size;
	1593	upl_flags = UPL_FILE_IO \| UPL_COPYOUT_FROM \| UPL_NO_SYNC \|
	1594	UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	1595
	1596	// LP64todo - fix this!
	1597	kret = vm_map_get_upl(current_map(),
	1598	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	1599	&upl_size,
	1600	&upl,
	1601	NULL,
	1602	&pages_in_pl,
	1603	&upl_flags,
	1604	force_data_sync);
	1605
	1606	if (kret != KERN_SUCCESS) {
	1607	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	1608	0, 0, 0, kret, 0);
	1609	/*
	1610	* cluster_nocopy_write: failed to get pagelist
	1611	*
	1612	* we may have already spun some portion of this request
	1613	* off as async requests... we need to wait for the I/O
	1614	* to complete before returning
	1615	*/
	1616	goto wait_for_writes;
	1617	}
	1618	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	1619	pages_in_pl = upl_size / PAGE_SIZE;
	1620
	1621	for (i = 0; i < pages_in_pl; i++) {
	1622	if (!upl_valid_page(pl, i))
	1623	break;
	1624	}
	1625	if (i == pages_in_pl)
	1626	break;
	1627
	1628	/*
	1629	* didn't get all the pages back that we
	1630	* needed... release this upl and try again
	1631	*/
	1632	ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
	1633	UPL_ABORT_FREE_ON_EMPTY);
	1634	}
	1635	if (force_data_sync >= 3) {
	1636	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	1637	i, pages_in_pl, upl_size, kret, 0);
	1638	/*
	1639	* for some reason, we couldn't acquire a hold on all
	1640	* the pages needed in the user's address space
	1641	*
	1642	* we may have already spun some portion of this request
	1643	* off as async requests... we need to wait for the I/O
	1644	* to complete before returning
	1645	*/
	1646	goto wait_for_writes;
	1647	}
	1648
	1649	/*
	1650	* Consider the possibility that upl_size wasn't satisfied.
	1651	*/
	1652	if (upl_size != upl_needed_size)
	1653	io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
	1654
	1655	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	1656	(int)upl_offset, upl_size, (int)iov_base, io_size, 0);
	1657
	1658	if (io_size == 0) {
	1659	ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
	1660	UPL_ABORT_FREE_ON_EMPTY);
	1661	/*
	1662	* we may have already spun some portion of this request
	1663	* off as async requests... we need to wait for the I/O
	1664	* to complete before returning
	1665	*/
	1666	goto wait_for_writes;
	1667	}
	1668	/*
	1669	* Now look for pages already in the cache
	1670	* and throw them away.
	1671	* uio->uio_offset is page aligned within the file
	1672	* io_size is a multiple of PAGE_SIZE
	1673	*/
	1674	ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
	1675
	1676	/*
	1677	* we want push out these writes asynchronously so that we can overlap
	1678	* the preparation of the next I/O
	1679	* if there are already too many outstanding writes
	1680	* wait until some complete before issuing the next
	1681	*/
	1682	lck_mtx_lock(cl_mtxp);
	1683
	1684	while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
	1685	iostate.io_wanted = 1;
	1686	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
	1687	}
	1688	lck_mtx_unlock(cl_mtxp);
	1689
	1690	if (iostate.io_error) {
	1691	/*
	1692	* one of the earlier writes we issued ran into a hard error
	1693	* don't issue any more writes, cleanup the UPL
	1694	* that was just created but not used, then
	1695	* go wait for all writes that are part of this stream
	1696	* to complete before returning the error to the caller
	1697	*/
	1698	ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
	1699	UPL_ABORT_FREE_ON_EMPTY);
	1700
	1701	goto wait_for_writes;
	1702	}
	1703	io_flag = CL_ASYNC \| CL_PRESERVE \| CL_COMMIT \| CL_THROTTLE;
	1704
	1705	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) \| DBG_FUNC_START,
	1706	(int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
	1707
	1708	error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
	1709	io_size, io_flag, (buf_t)NULL, &iostate);
	1710
	1711	uio_update(uio, (user_size_t)io_size);
	1712
	1713	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) \| DBG_FUNC_END,
	1714	(int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
	1715
	1716	} /* end while */
	1717
	1718	wait_for_writes:
	1719	/*
	1720	* make sure all async writes issued as part of this stream
	1721	* have completed before we return
	1722	*/
	1723	lck_mtx_lock(cl_mtxp);
	1724
	1725	while (iostate.io_issued != iostate.io_completed) {
	1726	iostate.io_wanted = 1;
	1727	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
	1728	}
	1729	lck_mtx_unlock(cl_mtxp);
	1730
	1731	if (iostate.io_error)
	1732	error = iostate.io_error;
	1733
	1734	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_END,
	1735	(int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
	1736
	1737	return (error);
	1738	}
	1739
	1740
	1741	static int
	1742	cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
	1743	{
	1744	upl_page_info_t *pl;
	1745	addr64_t src_paddr;
	1746	upl_t upl;
	1747	vm_offset_t upl_offset;
	1748	int tail_size;
	1749	int io_size;
	1750	int upl_size;
	1751	int upl_needed_size;
	1752	int pages_in_pl;
	1753	int upl_flags;
	1754	kern_return_t kret;
	1755	int error = 0;
	1756	user_addr_t iov_base;
	1757	int devblocksize;
	1758	struct cl_writebehind *wbp;
	1759
	1760	devblocksize = vp->v_mount->mnt_devblocksize;
	1761	/*
	1762	* When we enter this routine, we know
	1763	* -- the resid will not exceed iov_len
	1764	* -- the vector target address is physcially contiguous
	1765	*/
	1766	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
	1767
	1768	cluster_try_push(wbp, vp, newEOF, 0, 1);
	1769
	1770	lck_mtx_unlock(&wbp->cl_lockw);
	1771	}
	1772	#if LP64_DEBUG
	1773	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
	1774	panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	1775	}
	1776	#endif /* LP64_DEBUG */
	1777
	1778	// LP64todo - fix this!
	1779	io_size = (int)uio_curriovlen(uio);
	1780	iov_base = uio_curriovbase(uio);
	1781
	1782	upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
	1783	upl_needed_size = upl_offset + io_size;
	1784
	1785	pages_in_pl = 0;
	1786	upl_size = upl_needed_size;
	1787	upl_flags = UPL_FILE_IO \| UPL_COPYOUT_FROM \| UPL_NO_SYNC \|
	1788	UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	1789
	1790	// LP64todo - fix this!
	1791	kret = vm_map_get_upl(current_map(),
	1792	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	1793	&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
	1794
	1795	if (kret != KERN_SUCCESS) {
	1796	/*
	1797	* cluster_phys_write: failed to get pagelist
	1798	* note: return kret here
	1799	*/
	1800	return(EINVAL);
	1801	}
	1802	/*
	1803	* Consider the possibility that upl_size wasn't satisfied.
	1804	* This is a failure in the physical memory case.
	1805	*/
	1806	if (upl_size < upl_needed_size) {
	1807	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	1808	return(EINVAL);
	1809	}
	1810	pl = ubc_upl_pageinfo(upl);
	1811
	1812	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
	1813
	1814	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	1815	int head_size;
	1816
	1817	head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
	1818
	1819	if (head_size > io_size)
	1820	head_size = io_size;
	1821
	1822	error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
	1823
	1824	if (error) {
	1825	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	1826
	1827	return(EINVAL);
	1828	}
	1829	upl_offset += head_size;
	1830	src_paddr += head_size;
	1831	io_size -= head_size;
	1832	}
	1833	tail_size = io_size & (devblocksize - 1);
	1834	io_size -= tail_size;
	1835
	1836	if (io_size) {
	1837	/*
	1838	* issue a synchronous write to cluster_io
	1839	*/
	1840	error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
	1841	io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
	1842	}
	1843	if (error == 0) {
	1844	/*
	1845	* The cluster_io write completed successfully,
	1846	* update the uio structure
	1847	*/
	1848	uio_update(uio, (user_size_t)io_size);
	1849
	1850	src_paddr += io_size;
	1851
	1852	if (tail_size)
	1853	error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
	1854	}
	1855	/*
	1856	* just release our hold on the physically contiguous
	1857	* region without changing any state
	1858	*/
	1859	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	1860
	1861	return (error);
	1862	}
	1863
	1864
	1865	static int
	1866	cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
	1867	{
	1868	upl_page_info_t *pl;
	1869	upl_t upl;
	1870	vm_offset_t upl_offset = 0;
	1871	int upl_size;
	1872	off_t upl_f_offset;
	1873	int pages_in_upl;
	1874	int start_offset;
	1875	int xfer_resid;
	1876	int io_size;
	1877	int io_offset;
	1878	int bytes_to_zero;
	1879	int bytes_to_move;
	1880	kern_return_t kret;
	1881	int retval = 0;
	1882	int io_resid;
	1883	long long total_size;
	1884	long long zero_cnt;
	1885	off_t zero_off;
	1886	long long zero_cnt1;
	1887	off_t zero_off1;
	1888	struct cl_extent cl;
	1889	int intersection;
	1890	struct cl_writebehind *wbp;
	1891
	1892	if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
	1893	{
	1894	if (wbp->cl_hasbeenpaged) {
	1895	/*
	1896	* this vnode had pages cleaned to it by
	1897	* the pager which indicates that either
	1898	* it's not very 'hot', or the system is
	1899	* being overwhelmed by a lot of dirty
	1900	* data being delayed in the VM cache...
	1901	* in either event, we'll push our remaining
	1902	* delayed data at this point... this will
	1903	* be more efficient than paging out 1 page at
	1904	* a time, and will also act as a throttle
	1905	* by delaying this client from writing any
	1906	* more data until all his delayed data has
	1907	* at least been queued to the uderlying driver.
	1908	*/
	1909	if (wbp->cl_number \|\| wbp->cl_scmap)
	1910	cluster_push_EOF(vp, newEOF);
	1911
	1912	wbp->cl_hasbeenpaged = 0;
	1913	}
	1914	}
	1915	if (uio) {
	1916	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	1917	(int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
	1918
	1919	// LP64todo - fix this
	1920	io_resid = uio_resid(uio);
	1921	} else {
	1922	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	1923	0, 0, (int)oldEOF, (int)newEOF, 0);
	1924
	1925	io_resid = 0;
	1926	}
	1927	zero_cnt = 0;
	1928	zero_cnt1 = 0;
	1929	zero_off = 0;
	1930	zero_off1 = 0;
	1931
	1932	if (flags & IO_HEADZEROFILL) {
	1933	/*
	1934	* some filesystems (HFS is one) don't support unallocated holes within a file...
	1935	* so we zero fill the intervening space between the old EOF and the offset
	1936	* where the next chunk of real data begins.... ftruncate will also use this
	1937	* routine to zero fill to the new EOF when growing a file... in this case, the
	1938	* uio structure will not be provided
	1939	*/
	1940	if (uio) {
	1941	if (headOff < uio->uio_offset) {
	1942	zero_cnt = uio->uio_offset - headOff;
	1943	zero_off = headOff;
	1944	}
	1945	} else if (headOff < newEOF) {
	1946	zero_cnt = newEOF - headOff;
	1947	zero_off = headOff;
	1948	}
	1949	}
	1950	if (flags & IO_TAILZEROFILL) {
	1951	if (uio) {
	1952	// LP64todo - fix this
	1953	zero_off1 = uio->uio_offset + uio_resid(uio);
	1954
	1955	if (zero_off1 < tailOff)
	1956	zero_cnt1 = tailOff - zero_off1;
	1957	}
	1958	}
	1959	if (zero_cnt == 0 && uio == (struct uio *) 0) {
	1960	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END,
	1961	retval, 0, 0, 0, 0);
	1962	return (0);
	1963	}
	1964
	1965	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
	1966	/*
	1967	* for this iteration of the loop, figure out where our starting point is
	1968	*/
	1969	if (zero_cnt) {
	1970	start_offset = (int)(zero_off & PAGE_MASK_64);
	1971	upl_f_offset = zero_off - start_offset;
	1972	} else if (io_resid) {
	1973	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	1974	upl_f_offset = uio->uio_offset - start_offset;
	1975	} else {
	1976	start_offset = (int)(zero_off1 & PAGE_MASK_64);
	1977	upl_f_offset = zero_off1 - start_offset;
	1978	}
	1979	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) \| DBG_FUNC_NONE,
	1980	(int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
	1981
	1982	if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	1983	total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	1984
	1985	cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
	1986
	1987	if (uio && ((flags & (IO_NOCACHE \| IO_SYNC \| IO_HEADZEROFILL \| IO_TAILZEROFILL)) == 0)) {
	1988	/*
	1989	* assumption... total_size <= io_resid
	1990	* because IO_HEADZEROFILL and IO_TAILZEROFILL not set
	1991	*/
	1992	if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
	1993	total_size -= start_offset;
	1994	xfer_resid = total_size;
	1995
	1996	retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
	1997
	1998	if (retval)
	1999	break;
	2000
	2001	io_resid -= (total_size - xfer_resid);
	2002	total_size = xfer_resid;
	2003	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	2004	upl_f_offset = uio->uio_offset - start_offset;
	2005
	2006	if (total_size == 0) {
	2007	if (start_offset) {
	2008	/*
	2009	* the write did not finish on a page boundary
	2010	* which will leave upl_f_offset pointing to the
	2011	* beginning of the last page written instead of
	2012	* the page beyond it... bump it in this case
	2013	* so that the cluster code records the last page
	2014	* written as dirty
	2015	*/
	2016	upl_f_offset += PAGE_SIZE_64;
	2017	}
	2018	upl_size = 0;
	2019
	2020	goto check_cluster;
	2021	}
	2022	}
	2023	/*
	2024	* compute the size of the upl needed to encompass
	2025	* the requested write... limit each call to cluster_io
	2026	* to the maximum UPL size... cluster_io will clip if
	2027	* this exceeds the maximum io_size for the device,
	2028	* make sure to account for
	2029	* a starting offset that's not page aligned
	2030	*/
	2031	upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2032
	2033	if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	2034	upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	2035
	2036	pages_in_upl = upl_size / PAGE_SIZE;
	2037	io_size = upl_size - start_offset;
	2038
	2039	if ((long long)io_size > total_size)
	2040	io_size = total_size;
	2041
	2042	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
	2043
	2044
	2045	/*
	2046	* Gather the pages from the buffer cache.
	2047	* The UPL_WILL_MODIFY flag lets the UPL subsystem know
	2048	* that we intend to modify these pages.
	2049	*/
	2050	kret = ubc_create_upl(vp,
	2051	upl_f_offset,
	2052	upl_size,
	2053	&upl,
	2054	&pl,
	2055	UPL_SET_LITE \| UPL_WILL_MODIFY);
	2056	if (kret != KERN_SUCCESS)
	2057	panic("cluster_write: failed to get pagelist");
	2058
	2059	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END,
	2060	(int)upl, (int)upl_f_offset, start_offset, 0, 0);
	2061
	2062	if (start_offset && !upl_valid_page(pl, 0)) {
	2063	int read_size;
	2064
	2065	/*
	2066	* we're starting in the middle of the first page of the upl
	2067	* and the page isn't currently valid, so we're going to have
	2068	* to read it in first... this is a synchronous operation
	2069	*/
	2070	read_size = PAGE_SIZE;
	2071
	2072	if ((upl_f_offset + read_size) > newEOF)
	2073	read_size = newEOF - upl_f_offset;
	2074
	2075	retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
	2076	CL_READ, (buf_t)NULL, (struct clios *)NULL);
	2077	if (retval) {
	2078	/*
	2079	* we had an error during the read which causes us to abort
	2080	* the current cluster_write request... before we do, we need
	2081	* to release the rest of the pages in the upl without modifying
	2082	* there state and mark the failed page in error
	2083	*/
	2084	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
	2085
	2086	if (upl_size > PAGE_SIZE)
	2087	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	2088
	2089	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	2090	(int)upl, 0, 0, retval, 0);
	2091	break;
	2092	}
	2093	}
	2094	if ((start_offset == 0 \|\| upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
	2095	/*
	2096	* the last offset we're writing to in this upl does not end on a page
	2097	* boundary... if it's not beyond the old EOF, then we'll also need to
	2098	* pre-read this page in if it isn't already valid
	2099	*/
	2100	upl_offset = upl_size - PAGE_SIZE;
	2101
	2102	if ((upl_f_offset + start_offset + io_size) < oldEOF &&
	2103	!upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
	2104	int read_size;
	2105
	2106	read_size = PAGE_SIZE;
	2107
	2108	if ((upl_f_offset + upl_offset + read_size) > newEOF)
	2109	read_size = newEOF - (upl_f_offset + upl_offset);
	2110
	2111	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
	2112	CL_READ, (buf_t)NULL, (struct clios *)NULL);
	2113	if (retval) {
	2114	/*
	2115	* we had an error during the read which causes us to abort
	2116	* the current cluster_write request... before we do, we
	2117	* need to release the rest of the pages in the upl without
	2118	* modifying there state and mark the failed page in error
	2119	*/
	2120	ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
	2121
	2122	if (upl_size > PAGE_SIZE)
	2123	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	2124
	2125	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	2126	(int)upl, 0, 0, retval, 0);
	2127	break;
	2128	}
	2129	}
	2130	}
	2131	xfer_resid = io_size;
	2132	io_offset = start_offset;
	2133
	2134	while (zero_cnt && xfer_resid) {
	2135
	2136	if (zero_cnt < (long long)xfer_resid)
	2137	bytes_to_zero = zero_cnt;
	2138	else
	2139	bytes_to_zero = xfer_resid;
	2140
	2141	if ( !(flags & (IO_NOZEROVALID \| IO_NOZERODIRTY))) {
	2142	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2143	} else {
	2144	int zero_pg_index;
	2145
	2146	bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
	2147	zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
	2148
	2149	if ( !upl_valid_page(pl, zero_pg_index)) {
	2150	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2151
	2152	} else if ((flags & (IO_NOZERODIRTY \| IO_NOZEROVALID)) == IO_NOZERODIRTY &&
	2153	!upl_dirty_page(pl, zero_pg_index)) {
	2154	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2155	}
	2156	}
	2157	xfer_resid -= bytes_to_zero;
	2158	zero_cnt -= bytes_to_zero;
	2159	zero_off += bytes_to_zero;
	2160	io_offset += bytes_to_zero;
	2161	}
	2162	if (xfer_resid && io_resid) {
	2163	bytes_to_move = min(io_resid, xfer_resid);
	2164
	2165	retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
	2166
	2167	if (retval) {
	2168
	2169	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	2170
	2171	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	2172	(int)upl, 0, 0, retval, 0);
	2173	} else {
	2174	io_resid -= bytes_to_move;
	2175	xfer_resid -= bytes_to_move;
	2176	io_offset += bytes_to_move;
	2177	}
	2178	}
	2179	while (xfer_resid && zero_cnt1 && retval == 0) {
	2180
	2181	if (zero_cnt1 < (long long)xfer_resid)
	2182	bytes_to_zero = zero_cnt1;
	2183	else
	2184	bytes_to_zero = xfer_resid;
	2185
	2186	if ( !(flags & (IO_NOZEROVALID \| IO_NOZERODIRTY))) {
	2187	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2188	} else {
	2189	int zero_pg_index;
	2190
	2191	bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
	2192	zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
	2193
	2194	if ( !upl_valid_page(pl, zero_pg_index)) {
	2195	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2196	} else if ((flags & (IO_NOZERODIRTY \| IO_NOZEROVALID)) == IO_NOZERODIRTY &&
	2197	!upl_dirty_page(pl, zero_pg_index)) {
	2198	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2199	}
	2200	}
	2201	xfer_resid -= bytes_to_zero;
	2202	zero_cnt1 -= bytes_to_zero;
	2203	zero_off1 += bytes_to_zero;
	2204	io_offset += bytes_to_zero;
	2205	}
	2206
	2207	if (retval == 0) {
	2208	int cl_index;
	2209	int can_delay;
	2210
	2211	io_size += start_offset;
	2212
	2213	if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
	2214	/*
	2215	* if we're extending the file with this write
	2216	* we'll zero fill the rest of the page so that
	2217	* if the file gets extended again in such a way as to leave a
	2218	* hole starting at this EOF, we'll have zero's in the correct spot
	2219	*/
	2220	cluster_zero(upl, io_size, upl_size - io_size, NULL);
	2221	}
	2222	if (flags & IO_SYNC)
	2223	/*
	2224	* if the IO_SYNC flag is set than we need to
	2225	* bypass any clusters and immediately issue
	2226	* the I/O
	2227	*/
	2228	goto issue_io;
	2229	check_cluster:
	2230	/*
	2231	* take the lock to protect our accesses
	2232	* of the writebehind and sparse cluster state
	2233	*/
	2234	wbp = cluster_get_wbp(vp, CLW_ALLOCATE \| CLW_RETURNLOCKED);
	2235
	2236	/*
	2237	* calculate the last logical block number
	2238	* that this delayed I/O encompassed
	2239	*/
	2240	cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
	2241
	2242	if (wbp->cl_scmap) {
	2243
	2244	if ( !(flags & IO_NOCACHE)) {
	2245	/*
	2246	* we've fallen into the sparse
	2247	* cluster method of delaying dirty pages
	2248	* first, we need to release the upl if we hold one
	2249	* since pages in it may be present in the sparse cluster map
	2250	* and may span 2 separate buckets there... if they do and
	2251	* we happen to have to flush a bucket to make room and it intersects
	2252	* this upl, a deadlock may result on page BUSY
	2253	*/
	2254	if (upl_size)
	2255	ubc_upl_commit_range(upl, 0, upl_size,
	2256	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2257
	2258	sparse_cluster_add(wbp, vp, &cl, newEOF);
	2259
	2260	lck_mtx_unlock(&wbp->cl_lockw);
	2261
	2262	continue;
	2263	}
	2264	/*
	2265	* must have done cached writes that fell into
	2266	* the sparse cluster mechanism... we've switched
	2267	* to uncached writes on the file, so go ahead
	2268	* and push whatever's in the sparse map
	2269	* and switch back to normal clustering
	2270	*
	2271	* see the comment above concerning a possible deadlock...
	2272	*/
	2273	if (upl_size) {
	2274	ubc_upl_commit_range(upl, 0, upl_size,
	2275	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2276	/*
	2277	* setting upl_size to 0 keeps us from committing a
	2278	* second time in the start_new_cluster path
	2279	*/
	2280	upl_size = 0;
	2281	}
	2282	sparse_cluster_push(wbp, vp, newEOF, 1);
	2283
	2284	wbp->cl_number = 0;
	2285	/*
	2286	* no clusters of either type present at this point
	2287	* so just go directly to start_new_cluster since
	2288	* we know we need to delay this I/O since we've
	2289	* already released the pages back into the cache
	2290	* to avoid the deadlock with sparse_cluster_push
	2291	*/
	2292	goto start_new_cluster;
	2293	}
	2294	upl_offset = 0;
	2295
	2296	if (wbp->cl_number == 0)
	2297	/*
	2298	* no clusters currently present
	2299	*/
	2300	goto start_new_cluster;
	2301
	2302	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	2303	/*
	2304	* check each cluster that we currently hold
	2305	* try to merge some or all of this write into
	2306	* one or more of the existing clusters... if
	2307	* any portion of the write remains, start a
	2308	* new cluster
	2309	*/
	2310	if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
	2311	/*
	2312	* the current write starts at or after the current cluster
	2313	*/
	2314	if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
	2315	/*
	2316	* we have a write that fits entirely
	2317	* within the existing cluster limits
	2318	*/
	2319	if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
	2320	/*
	2321	* update our idea of where the cluster ends
	2322	*/
	2323	wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
	2324	break;
	2325	}
	2326	if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
	2327	/*
	2328	* we have a write that starts in the middle of the current cluster
	2329	* but extends beyond the cluster's limit... we know this because
	2330	* of the previous checks
	2331	* we'll extend the current cluster to the max
	2332	* and update the b_addr for the current write to reflect that
	2333	* the head of it was absorbed into this cluster...
	2334	* note that we'll always have a leftover tail in this case since
	2335	* full absorbtion would have occurred in the clause above
	2336	*/
	2337	wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
	2338
	2339	if (upl_size) {
	2340	daddr64_t start_pg_in_upl;
	2341
	2342	start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
	2343
	2344	if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
	2345	intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
	2346
	2347	ubc_upl_commit_range(upl, upl_offset, intersection,
	2348	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2349	upl_f_offset += intersection;
	2350	upl_offset += intersection;
	2351	upl_size -= intersection;
	2352	}
	2353	}
	2354	cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
	2355	}
	2356	/*
	2357	* we come here for the case where the current write starts
	2358	* beyond the limit of the existing cluster or we have a leftover
	2359	* tail after a partial absorbtion
	2360	*
	2361	* in either case, we'll check the remaining clusters before
	2362	* starting a new one
	2363	*/
	2364	} else {
	2365	/*
	2366	* the current write starts in front of the cluster we're currently considering
	2367	*/
	2368	if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
	2369	/*
	2370	* we can just merge the new request into
	2371	* this cluster and leave it in the cache
	2372	* since the resulting cluster is still
	2373	* less than the maximum allowable size
	2374	*/
	2375	wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
	2376
	2377	if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
	2378	/*
	2379	* the current write completely
	2380	* envelops the existing cluster and since
	2381	* each write is limited to at most MAX_UPL_TRANSFER bytes
	2382	* we can just use the start and last blocknos of the write
	2383	* to generate the cluster limits
	2384	*/
	2385	wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
	2386	}
	2387	break;
	2388	}
	2389
	2390	/*
	2391	* if we were to combine this write with the current cluster
	2392	* we would exceed the cluster size limit.... so,
	2393	* let's see if there's any overlap of the new I/O with
	2394	* the cluster we're currently considering... in fact, we'll
	2395	* stretch the cluster out to it's full limit and see if we
	2396	* get an intersection with the current write
	2397	*
	2398	*/
	2399	if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
	2400	/*
	2401	* the current write extends into the proposed cluster
	2402	* clip the length of the current write after first combining it's
	2403	* tail with the newly shaped cluster
	2404	*/
	2405	wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
	2406
	2407	if (upl_size) {
	2408	intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
	2409
	2410	if (intersection > upl_size)
	2411	/*
	2412	* because the current write may consist of a number of pages found in the cache
	2413	* which are not part of the UPL, we may have an intersection that exceeds
	2414	* the size of the UPL that is also part of this write
	2415	*/
	2416	intersection = upl_size;
	2417
	2418	ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
	2419	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2420	upl_size -= intersection;
	2421	}
	2422	cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
	2423	}
	2424	/*
	2425	* if we get here, there was no way to merge
	2426	* any portion of this write with this cluster
	2427	* or we could only merge part of it which
	2428	* will leave a tail...
	2429	* we'll check the remaining clusters before starting a new one
	2430	*/
	2431	}
	2432	}
	2433	if (cl_index < wbp->cl_number)
	2434	/*
	2435	* we found an existing cluster(s) that we
	2436	* could entirely merge this I/O into
	2437	*/
	2438	goto delay_io;
	2439
	2440	if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
	2441	/*
	2442	* we didn't find an existing cluster to
	2443	* merge into, but there's room to start
	2444	* a new one
	2445	*/
	2446	goto start_new_cluster;
	2447
	2448	/*
	2449	* no exisitng cluster to merge with and no
	2450	* room to start a new one... we'll try
	2451	* pushing one of the existing ones... if none of
	2452	* them are able to be pushed, we'll switch
	2453	* to the sparse cluster mechanism
	2454	* cluster_try_push updates cl_number to the
	2455	* number of remaining clusters... and
	2456	* returns the number of currently unused clusters
	2457	*/
	2458	int ret_cluster_try_push = 0;
	2459	/* if writes are not deferred, call cluster push immediately */
	2460	if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
	2461	if (flags & IO_NOCACHE)
	2462	can_delay = 0;
	2463	else
	2464	can_delay = 1;
	2465
	2466	ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
	2467	}
	2468
	2469	/* execute following regardless writes are deferred or not */
	2470	if (ret_cluster_try_push == 0) {
	2471	/*
	2472	* no more room in the normal cluster mechanism
	2473	* so let's switch to the more expansive but expensive
	2474	* sparse mechanism....
	2475	* first, we need to release the upl if we hold one
	2476	* since pages in it may be present in the sparse cluster map (after the cluster_switch)
	2477	* and may span 2 separate buckets there... if they do and
	2478	* we happen to have to flush a bucket to make room and it intersects
	2479	* this upl, a deadlock may result on page BUSY
	2480	*/
	2481	if (upl_size)
	2482	ubc_upl_commit_range(upl, upl_offset, upl_size,
	2483	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2484
	2485	sparse_cluster_switch(wbp, vp, newEOF);
	2486	sparse_cluster_add(wbp, vp, &cl, newEOF);
	2487
	2488	lck_mtx_unlock(&wbp->cl_lockw);
	2489
	2490	continue;
	2491	}
	2492	/*
	2493	* we pushed one cluster successfully, so we must be sequentially writing this file
	2494	* otherwise, we would have failed and fallen into the sparse cluster support
	2495	* so let's take the opportunity to push out additional clusters as long as we
	2496	* remain below the throttle... this will give us better I/O locality if we're
	2497	* in a copy loop (i.e. we won't jump back and forth between the read and write points
	2498	* however, we don't want to push so much out that the write throttle kicks in and
	2499	* hangs this thread up until some of the I/O completes...
	2500	*/
	2501	if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
	2502	while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
	2503	cluster_try_push(wbp, vp, newEOF, 0, 0);
	2504	}
	2505
	2506	start_new_cluster:
	2507	wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
	2508	wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
	2509
	2510	if (flags & IO_NOCACHE)
	2511	wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
	2512	else
	2513	wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
	2514	wbp->cl_number++;
	2515	delay_io:
	2516	if (upl_size)
	2517	ubc_upl_commit_range(upl, upl_offset, upl_size,
	2518	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2519
	2520	lck_mtx_unlock(&wbp->cl_lockw);
	2521
	2522	continue;
	2523	issue_io:
	2524	/*
	2525	* we don't hold the vnode lock at this point
	2526	*
	2527	* because we had to ask for a UPL that provides currenty non-present pages, the
	2528	* UPL has been automatically set to clear the dirty flags (both software and hardware)
	2529	* upon committing it... this is not the behavior we want since it's possible for
	2530	* pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
	2531	* in order to maintain some semblance of coherency with mapped writes
	2532	* we need to drop the current upl and pick it back up with COPYOUT_FROM set
	2533	* so that we correctly deal with a change in state of the hardware modify bit...
	2534	* we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
	2535	* cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
	2536	* responsible for generating the correct sized I/O(s)
	2537	*/
	2538	ubc_upl_commit_range(upl, 0, upl_size,
	2539	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	2540
	2541	cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
	2542
	2543	retval = cluster_push_x(vp, &cl, newEOF, flags);
	2544	}
	2545	}
	2546	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END,
	2547	retval, 0, io_resid, 0, 0);
	2548
	2549	return (retval);
	2550	}
	2551
	2552	int
	2553	cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
	2554	{
	2555	int prev_resid;
	2556	u_int clip_size;
	2557	off_t max_io_size;
	2558	int upl_size;
	2559	int upl_flags;
	2560	upl_t upl;
	2561	int retval = 0;
	2562	int flags;
	2563
	2564	flags = xflags;
	2565
	2566	if (vp->v_flag & VNOCACHE_DATA)
	2567	flags \|= IO_NOCACHE;
	2568	if (vp->v_flag & VRAOFF)
	2569	flags \|= IO_RAOFF;
	2570
	2571	if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
	2572	/*
	2573	* go do a read through the cache if one of the following is true....
	2574	* NOCACHE is not true
	2575	* the uio request doesn't target USERSPACE
	2576	*/
	2577	return (cluster_read_x(vp, uio, filesize, flags));
	2578	}
	2579
	2580	#if LP64_DEBUG
	2581	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
	2582	panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	2583	}
	2584	#endif /* LP64_DEBUG */
	2585
	2586	while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
	2587	user_size_t iov_len;
	2588	user_addr_t iov_base;
	2589
	2590	/*
	2591	* we know we have a resid, so this is safe
	2592	* skip over any emtpy vectors
	2593	*/
	2594	uio_update(uio, (user_size_t)0);
	2595
	2596	iov_len = uio_curriovlen(uio);
	2597	iov_base = uio_curriovbase(uio);
	2598
	2599	upl_size = PAGE_SIZE;
	2600	upl_flags = UPL_QUERY_OBJECT_TYPE;
	2601
	2602	// LP64todo - fix this!
	2603	if ((vm_map_get_upl(current_map(),
	2604	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	2605	&upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
	2606	/*
	2607	* the user app must have passed in an invalid address
	2608	*/
	2609	return (EFAULT);
	2610	}
	2611
	2612	/*
	2613	* We check every vector target but if it is physically
	2614	* contiguous space, we skip the sanity checks.
	2615	*/
	2616	if (upl_flags & UPL_PHYS_CONTIG) {
	2617	retval = cluster_phys_read(vp, uio, filesize);
	2618	}
	2619	else if (uio_resid(uio) < PAGE_SIZE) {
	2620	/*
	2621	* we're here because we're don't have a physically contiguous target buffer
	2622	* go do a read through the cache if
	2623	* the total xfer size is less than a page...
	2624	*/
	2625	return (cluster_read_x(vp, uio, filesize, flags));
	2626	}
	2627	// LP64todo - fix this!
	2628	else if (((int)uio->uio_offset & PAGE_MASK) \|\| (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
	2629	if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
	2630	/*
	2631	* Bring the file offset read up to a pagesize boundary
	2632	* this will also bring the base address to a page boundary
	2633	* since they both are currently on the same offset within a page
	2634	* note: if we get here, uio->uio_resid is greater than PAGE_SIZE
	2635	* so the computed clip_size must always be less than the current uio_resid
	2636	*/
	2637	clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
	2638
	2639	/*
	2640	* Fake the resid going into the cluster_read_x call
	2641	* and restore it on the way out.
	2642	*/
	2643	prev_resid = uio_resid(uio);
	2644	// LP64todo - fix this
	2645	uio_setresid(uio, clip_size);
	2646
	2647	retval = cluster_read_x(vp, uio, filesize, flags);
	2648
	2649	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	2650	} else {
	2651	/*
	2652	* can't get both the file offset and the buffer offset aligned to a page boundary
	2653	* so fire an I/O through the cache for this entire vector
	2654	*/
	2655	// LP64todo - fix this!
	2656	clip_size = iov_len;
	2657	prev_resid = uio_resid(uio);
	2658	uio_setresid(uio, clip_size);
	2659
	2660	retval = cluster_read_x(vp, uio, filesize, flags);
	2661
	2662	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	2663	}
	2664	} else {
	2665	/*
	2666	* If we come in here, we know the offset into
	2667	* the file is on a pagesize boundary
	2668	*/
	2669	max_io_size = filesize - uio->uio_offset;
	2670	// LP64todo - fix this
	2671	clip_size = uio_resid(uio);
	2672	if (iov_len < clip_size)
	2673	clip_size = iov_len;
	2674	if (max_io_size < clip_size)
	2675	clip_size = (int)max_io_size;
	2676
	2677	if (clip_size < PAGE_SIZE) {
	2678	/*
	2679	* Take care of the tail end of the read in this vector.
	2680	*/
	2681	// LP64todo - fix this
	2682	prev_resid = uio_resid(uio);
	2683	uio_setresid(uio, clip_size);
	2684
	2685	retval = cluster_read_x(vp, uio, filesize, flags);
	2686
	2687	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	2688	} else {
	2689	/* round clip_size down to a multiple of pagesize */
	2690	clip_size = clip_size & ~(PAGE_MASK);
	2691	// LP64todo - fix this
	2692	prev_resid = uio_resid(uio);
	2693	uio_setresid(uio, clip_size);
	2694
	2695	retval = cluster_nocopy_read(vp, uio, filesize);
	2696
	2697	if ((retval==0) && uio_resid(uio))
	2698	retval = cluster_read_x(vp, uio, filesize, flags);
	2699
	2700	uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
	2701	}
	2702	} /* end else */
	2703	} /* end while */
	2704
	2705	return(retval);
	2706	}
	2707
	2708	static int
	2709	cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
	2710	{
	2711	upl_page_info_t *pl;
	2712	upl_t upl;
	2713	vm_offset_t upl_offset;
	2714	int upl_size;
	2715	off_t upl_f_offset;
	2716	int start_offset;
	2717	int start_pg;
	2718	int last_pg;
	2719	int uio_last = 0;
	2720	int pages_in_upl;
	2721	off_t max_size;
	2722	off_t last_ioread_offset;
	2723	off_t last_request_offset;
	2724	u_int size_of_prefetch;
	2725	u_int io_size;
	2726	kern_return_t kret;
	2727	int error = 0;
	2728	int retval = 0;
	2729	u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	2730	u_int rd_ahead_enabled = 1;
	2731	u_int prefetch_enabled = 1;
	2732	struct cl_readahead * rap;
	2733	struct clios iostate;
	2734	struct cl_extent extent;
	2735
	2736	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_START,
	2737	(int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
	2738
	2739	// LP64todo - fix this
	2740	last_request_offset = uio->uio_offset + uio_resid(uio);
	2741
	2742	if ((flags & (IO_RAOFF\|IO_NOCACHE)) \|\|
	2743	((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
	2744	rd_ahead_enabled = 0;
	2745	rap = NULL;
	2746	} else {
	2747	if (cluster_hard_throttle_on(vp)) {
	2748	rd_ahead_enabled = 0;
	2749	prefetch_enabled = 0;
	2750
	2751	max_rd_size = HARD_THROTTLE_MAXSIZE;
	2752	}
	2753	if ((rap = cluster_get_rap(vp)) == NULL)
	2754	rd_ahead_enabled = 0;
	2755	}
	2756	if (last_request_offset > filesize)
	2757	last_request_offset = filesize;
	2758	extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
	2759	extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
	2760
	2761	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr \|\| (rap->cl_lastr + 1) == extent.b_addr)) {
	2762	/*
	2763	* determine if we already have a read-ahead in the pipe courtesy of the
	2764	* last read systemcall that was issued...
	2765	* if so, pick up it's extent to determine where we should start
	2766	* with respect to any read-ahead that might be necessary to
	2767	* garner all the data needed to complete this read systemcall
	2768	*/
	2769	last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
	2770
	2771	if (last_ioread_offset < uio->uio_offset)
	2772	last_ioread_offset = (off_t)0;
	2773	else if (last_ioread_offset > last_request_offset)
	2774	last_ioread_offset = last_request_offset;
	2775	} else
	2776	last_ioread_offset = (off_t)0;
	2777
	2778	while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
	2779	/*
	2780	* compute the size of the upl needed to encompass
	2781	* the requested read... limit each call to cluster_io
	2782	* to the maximum UPL size... cluster_io will clip if
	2783	* this exceeds the maximum io_size for the device,
	2784	* make sure to account for
	2785	* a starting offset that's not page aligned
	2786	*/
	2787	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	2788	upl_f_offset = uio->uio_offset - (off_t)start_offset;
	2789	max_size = filesize - uio->uio_offset;
	2790
	2791	// LP64todo - fix this!
	2792	if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
	2793	io_size = uio_resid(uio);
	2794	else
	2795	io_size = max_size;
	2796
	2797	if (!(flags & IO_NOCACHE)) {
	2798
	2799	while (io_size) {
	2800	u_int io_resid;
	2801	u_int io_requested;
	2802
	2803	/*
	2804	* if we keep finding the pages we need already in the cache, then
	2805	* don't bother to call cluster_rd_prefetch since it costs CPU cycles
	2806	* to determine that we have all the pages we need... once we miss in
	2807	* the cache and have issued an I/O, than we'll assume that we're likely
	2808	* to continue to miss in the cache and it's to our advantage to try and prefetch
	2809	*/
	2810	if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
	2811	if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
	2812	/*
	2813	* we've already issued I/O for this request and
	2814	* there's still work to do and
	2815	* our prefetch stream is running dry, so issue a
	2816	* pre-fetch I/O... the I/O latency will overlap
	2817	* with the copying of the data
	2818	*/
	2819	if (size_of_prefetch > max_rd_size)
	2820	size_of_prefetch = max_rd_size;
	2821
	2822	size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
	2823
	2824	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	2825
	2826	if (last_ioread_offset > last_request_offset)
	2827	last_ioread_offset = last_request_offset;
	2828	}
	2829	}
	2830	/*
	2831	* limit the size of the copy we're about to do so that
	2832	* we can notice that our I/O pipe is running dry and
	2833	* get the next I/O issued before it does go dry
	2834	*/
	2835	if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
	2836	io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
	2837	else
	2838	io_resid = io_size;
	2839
	2840	io_requested = io_resid;
	2841
	2842	retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
	2843
	2844	io_size -= (io_requested - io_resid);
	2845
	2846	if (retval \|\| io_resid)
	2847	/*
	2848	* if we run into a real error or
	2849	* a page that is not in the cache
	2850	* we need to leave streaming mode
	2851	*/
	2852	break;
	2853
	2854	if ((io_size == 0 \|\| last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
	2855	/*
	2856	* we're already finished the I/O for this read request
	2857	* let's see if we should do a read-ahead
	2858	*/
	2859	cluster_rd_ahead(vp, &extent, filesize, rap);
	2860	}
	2861	}
	2862	if (retval)
	2863	break;
	2864	if (io_size == 0) {
	2865	if (rap != NULL) {
	2866	if (extent.e_addr < rap->cl_lastr)
	2867	rap->cl_maxra = 0;
	2868	rap->cl_lastr = extent.e_addr;
	2869	}
	2870	break;
	2871	}
	2872	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	2873	upl_f_offset = uio->uio_offset - (off_t)start_offset;
	2874	max_size = filesize - uio->uio_offset;
	2875	}
	2876	if (io_size > max_rd_size)
	2877	io_size = max_rd_size;
	2878
	2879	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2880
	2881	if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
	2882	upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
	2883	pages_in_upl = upl_size / PAGE_SIZE;
	2884
	2885	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_START,
	2886	(int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
	2887
	2888	kret = ubc_create_upl(vp,
	2889	upl_f_offset,
	2890	upl_size,
	2891	&upl,
	2892	&pl,
	2893	UPL_SET_LITE);
	2894	if (kret != KERN_SUCCESS)
	2895	panic("cluster_read: failed to get pagelist");
	2896
	2897	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_END,
	2898	(int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
	2899
	2900	/*
	2901	* scan from the beginning of the upl looking for the first
	2902	* non-valid page.... this will become the first page in
	2903	* the request we're going to make to 'cluster_io'... if all
	2904	* of the pages are valid, we won't call through to 'cluster_io'
	2905	*/
	2906	for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
	2907	if (!upl_valid_page(pl, start_pg))
	2908	break;
	2909	}
	2910
	2911	/*
	2912	* scan from the starting invalid page looking for a valid
	2913	* page before the end of the upl is reached, if we
	2914	* find one, then it will be the last page of the request to
	2915	* 'cluster_io'
	2916	*/
	2917	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	2918	if (upl_valid_page(pl, last_pg))
	2919	break;
	2920	}
	2921	iostate.io_completed = 0;
	2922	iostate.io_issued = 0;
	2923	iostate.io_error = 0;
	2924	iostate.io_wanted = 0;
	2925
	2926	if (start_pg < last_pg) {
	2927	/*
	2928	* we found a range of 'invalid' pages that must be filled
	2929	* if the last page in this range is the last page of the file
	2930	* we may have to clip the size of it to keep from reading past
	2931	* the end of the last physical block associated with the file
	2932	*/
	2933	upl_offset = start_pg * PAGE_SIZE;
	2934	io_size = (last_pg - start_pg) * PAGE_SIZE;
	2935
	2936	if ((upl_f_offset + upl_offset + io_size) > filesize)
	2937	io_size = filesize - (upl_f_offset + upl_offset);
	2938
	2939	/*
	2940	* issue an asynchronous read to cluster_io
	2941	*/
	2942
	2943	error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
	2944	io_size, CL_READ \| CL_ASYNC, (buf_t)NULL, &iostate);
	2945	}
	2946	if (error == 0) {
	2947	/*
	2948	* if the read completed successfully, or there was no I/O request
	2949	* issued, than copy the data into user land via 'cluster_upl_copy_data'
	2950	* we'll first add on any 'valid'
	2951	* pages that were present in the upl when we acquired it.
	2952	*/
	2953	u_int val_size;
	2954
	2955	for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
	2956	if (!upl_valid_page(pl, uio_last))
	2957	break;
	2958	}
	2959	/*
	2960	* compute size to transfer this round, if uio->uio_resid is
	2961	* still non-zero after this attempt, we'll loop around and
	2962	* set up for another I/O.
	2963	*/
	2964	val_size = (uio_last * PAGE_SIZE) - start_offset;
	2965
	2966	if (val_size > max_size)
	2967	val_size = max_size;
	2968
	2969	if (val_size > uio_resid(uio))
	2970	// LP64todo - fix this
	2971	val_size = uio_resid(uio);
	2972
	2973	if (last_ioread_offset == 0)
	2974	last_ioread_offset = uio->uio_offset + val_size;
	2975
	2976	if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
	2977	/*
	2978	* if there's still I/O left to do for this request, and...
	2979	* we're not in hard throttle mode, then issue a
	2980	* pre-fetch I/O... the I/O latency will overlap
	2981	* with the copying of the data
	2982	*/
	2983	size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
	2984
	2985	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	2986
	2987	if (last_ioread_offset > last_request_offset)
	2988	last_ioread_offset = last_request_offset;
	2989
	2990	} else if ((uio->uio_offset + val_size) == last_request_offset) {
	2991	/*
	2992	* this transfer will finish this request, so...
	2993	* let's try to read ahead if we're in
	2994	* a sequential access pattern and we haven't
	2995	* explicitly disabled it
	2996	*/
	2997	if (rd_ahead_enabled)
	2998	cluster_rd_ahead(vp, &extent, filesize, rap);
	2999
	3000	if (rap != NULL) {
	3001	if (extent.e_addr < rap->cl_lastr)
	3002	rap->cl_maxra = 0;
	3003	rap->cl_lastr = extent.e_addr;
	3004	}
	3005	}
	3006	lck_mtx_lock(cl_mtxp);
	3007
	3008	while (iostate.io_issued != iostate.io_completed) {
	3009	iostate.io_wanted = 1;
	3010	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
	3011	}
	3012	lck_mtx_unlock(cl_mtxp);
	3013
	3014	if (iostate.io_error)
	3015	error = iostate.io_error;
	3016	else
	3017	retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
	3018	}
	3019	if (start_pg < last_pg) {
	3020	/*
	3021	* compute the range of pages that we actually issued an I/O for
	3022	* and either commit them as valid if the I/O succeeded
	3023	* or abort them if the I/O failed
	3024	*/
	3025	io_size = (last_pg - start_pg) * PAGE_SIZE;
	3026
	3027	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START,
	3028	(int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
	3029
	3030	if (error \|\| (flags & IO_NOCACHE))
	3031	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
	3032	UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	3033	else
	3034	ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
	3035	UPL_COMMIT_CLEAR_DIRTY \|
	3036	UPL_COMMIT_FREE_ON_EMPTY \|
	3037	UPL_COMMIT_INACTIVATE);
	3038
	3039	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END,
	3040	(int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
	3041	}
	3042	if ((last_pg - start_pg) < pages_in_upl) {
	3043	int cur_pg;
	3044	int commit_flags;
	3045
	3046	/*
	3047	* the set of pages that we issued an I/O for did not encompass
	3048	* the entire upl... so just release these without modifying
	3049	* their state
	3050	*/
	3051	if (error)
	3052	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3053	else {
	3054	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START,
	3055	(int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
	3056
	3057	if (start_pg) {
	3058	/*
	3059	* we found some already valid pages at the beginning of
	3060	* the upl commit these back to the inactive list with
	3061	* reference cleared
	3062	*/
	3063	for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
	3064	commit_flags = UPL_COMMIT_FREE_ON_EMPTY
	3065	\| UPL_COMMIT_INACTIVATE;
	3066
	3067	if (upl_dirty_page(pl, cur_pg))
	3068	commit_flags \|= UPL_COMMIT_SET_DIRTY;
	3069
	3070	if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
	3071	ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
	3072	UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	3073	else
	3074	ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
	3075	PAGE_SIZE, commit_flags);
	3076	}
	3077	}
	3078	if (last_pg < uio_last) {
	3079	/*
	3080	* we found some already valid pages immediately after the
	3081	* pages we issued I/O for, commit these back to the
	3082	* inactive list with reference cleared
	3083	*/
	3084	for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
	3085	commit_flags = UPL_COMMIT_FREE_ON_EMPTY
	3086	\| UPL_COMMIT_INACTIVATE;
	3087
	3088	if (upl_dirty_page(pl, cur_pg))
	3089	commit_flags \|= UPL_COMMIT_SET_DIRTY;
	3090
	3091	if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
	3092	ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
	3093	UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	3094	else
	3095	ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
	3096	PAGE_SIZE, commit_flags);
	3097	}
	3098	}
	3099	if (uio_last < pages_in_upl) {
	3100	/*
	3101	* there were some invalid pages beyond the valid pages
	3102	* that we didn't issue an I/O for, just release them
	3103	* unchanged
	3104	*/
	3105	ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
	3106	(pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	3107	}
	3108
	3109	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END,
	3110	(int)upl, -1, -1, 0, 0);
	3111	}
	3112	}
	3113	if (retval == 0)
	3114	retval = error;
	3115
	3116	if ( uio_resid(uio) ) {
	3117	if (cluster_hard_throttle_on(vp)) {
	3118	rd_ahead_enabled = 0;
	3119	prefetch_enabled = 0;
	3120
	3121	max_rd_size = HARD_THROTTLE_MAXSIZE;
	3122	} else {
	3123	if (rap != NULL)
	3124	rd_ahead_enabled = 1;
	3125	prefetch_enabled = 1;
	3126
	3127	max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	3128	}
	3129	}
	3130	}
	3131	if (rap != NULL) {
	3132	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	3133	(int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
	3134
	3135	lck_mtx_unlock(&rap->cl_lockr);
	3136	} else {
	3137	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	3138	(int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
	3139	}
	3140
	3141	return (retval);
	3142	}
	3143
	3144
	3145	static int
	3146	cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
	3147	{
	3148	upl_t upl;
	3149	upl_page_info_t *pl;
	3150	vm_offset_t upl_offset;
	3151	off_t max_io_size;
	3152	int io_size;
	3153	int upl_size;
	3154	int upl_needed_size;
	3155	int pages_in_pl;
	3156	int upl_flags;
	3157	kern_return_t kret;
	3158	int i;
	3159	int force_data_sync;
	3160	int retval = 0;
	3161	int no_zero_fill = 0;
	3162	int abort_flag = 0;
	3163	struct clios iostate;
	3164	u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	3165	u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
	3166
	3167
	3168	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_START,
	3169	(int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
	3170
	3171	/*
	3172	* When we enter this routine, we know
	3173	* -- the offset into the file is on a pagesize boundary
	3174	* -- the resid is a page multiple
	3175	* -- the resid will not exceed iov_len
	3176	*/
	3177
	3178	iostate.io_completed = 0;
	3179	iostate.io_issued = 0;
	3180	iostate.io_error = 0;
	3181	iostate.io_wanted = 0;
	3182
	3183	while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
	3184	user_addr_t iov_base;
	3185
	3186	if (cluster_hard_throttle_on(vp)) {
	3187	max_rd_size = HARD_THROTTLE_MAXSIZE;
	3188	max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
	3189	} else {
	3190	max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	3191	max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
	3192	}
	3193	max_io_size = filesize - uio->uio_offset;
	3194
	3195	// LP64todo - fix this
	3196	if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
	3197	io_size = max_io_size;
	3198	else
	3199	io_size = uio_resid(uio);
	3200
	3201	/*
	3202	* First look for pages already in the cache
	3203	* and move them to user space.
	3204	*/
	3205	retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
	3206
	3207	if (retval) {
	3208	/*
	3209	* we may have already spun some portion of this request
	3210	* off as async requests... we need to wait for the I/O
	3211	* to complete before returning
	3212	*/
	3213	goto wait_for_reads;
	3214	}
	3215	/*
	3216	* If we are already finished with this read, then return
	3217	*/
	3218	if (io_size == 0) {
	3219	/*
	3220	* we may have already spun some portion of this request
	3221	* off as async requests... we need to wait for the I/O
	3222	* to complete before returning
	3223	*/
	3224	goto wait_for_reads;
	3225	}
	3226	max_io_size = io_size;
	3227
	3228	if (max_io_size > max_rd_size)
	3229	max_io_size = max_rd_size;
	3230
	3231	io_size = 0;
	3232
	3233	ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
	3234
	3235	if (io_size == 0)
	3236	/*
	3237	* we may have already spun some portion of this request
	3238	* off as async requests... we need to wait for the I/O
	3239	* to complete before returning
	3240	*/
	3241	goto wait_for_reads;
	3242
	3243	iov_base = uio_curriovbase(uio);
	3244
	3245	// LP64todo - fix this!
	3246	upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
	3247	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
	3248
	3249	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_START,
	3250	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	3251
	3252	if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
	3253	no_zero_fill = 1;
	3254	abort_flag = UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY;
	3255	} else {
	3256	no_zero_fill = 0;
	3257	abort_flag = UPL_ABORT_FREE_ON_EMPTY;
	3258	}
	3259	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	3260	pages_in_pl = 0;
	3261	upl_size = upl_needed_size;
	3262	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	3263
	3264	if (no_zero_fill)
	3265	upl_flags \|= UPL_NOZEROFILL;
	3266	if (force_data_sync)
	3267	upl_flags \|= UPL_FORCE_DATA_SYNC;
	3268
	3269	// LP64todo - fix this!
	3270	kret = vm_map_create_upl(current_map(),
	3271	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	3272	&upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
	3273
	3274	if (kret != KERN_SUCCESS) {
	3275	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	3276	(int)upl_offset, upl_size, io_size, kret, 0);
	3277	/*
	3278	* cluster_nocopy_read: failed to get pagelist
	3279	*
	3280	* we may have already spun some portion of this request
	3281	* off as async requests... we need to wait for the I/O
	3282	* to complete before returning
	3283	*/
	3284	goto wait_for_reads;
	3285	}
	3286	pages_in_pl = upl_size / PAGE_SIZE;
	3287	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	3288
	3289	for (i = 0; i < pages_in_pl; i++) {
	3290	if (!upl_valid_page(pl, i))
	3291	break;
	3292	}
	3293	if (i == pages_in_pl)
	3294	break;
	3295
	3296	ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
	3297	}
	3298	if (force_data_sync >= 3) {
	3299	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	3300	(int)upl_offset, upl_size, io_size, kret, 0);
	3301
	3302	goto wait_for_reads;
	3303	}
	3304	/*
	3305	* Consider the possibility that upl_size wasn't satisfied.
	3306	*/
	3307	if (upl_size != upl_needed_size)
	3308	io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
	3309
	3310	if (io_size == 0) {
	3311	ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
	3312	goto wait_for_reads;
	3313	}
	3314	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	3315	(int)upl_offset, upl_size, io_size, kret, 0);
	3316
	3317	/*
	3318	* request asynchronously so that we can overlap
	3319	* the preparation of the next I/O
	3320	* if there are already too many outstanding reads
	3321	* wait until some have completed before issuing the next read
	3322	*/
	3323	lck_mtx_lock(cl_mtxp);
	3324
	3325	while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
	3326	iostate.io_wanted = 1;
	3327	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
	3328	}
	3329	lck_mtx_unlock(cl_mtxp);
	3330
	3331	if (iostate.io_error) {
	3332	/*
	3333	* one of the earlier reads we issued ran into a hard error
	3334	* don't issue any more reads, cleanup the UPL
	3335	* that was just created but not used, then
	3336	* go wait for any other reads to complete before
	3337	* returning the error to the caller
	3338	*/
	3339	ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
	3340
	3341	goto wait_for_reads;
	3342	}
	3343	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_START,
	3344	(int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
	3345
	3346	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
	3347	CL_PRESERVE \| CL_COMMIT \| CL_READ \| CL_ASYNC \| CL_NOZERO,
	3348	(buf_t)NULL, &iostate);
	3349
	3350	/*
	3351	* update the uio structure
	3352	*/
	3353	uio_update(uio, (user_size_t)io_size);
	3354
	3355	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_END,
	3356	(int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
	3357
	3358	} /* end while */
	3359
	3360	wait_for_reads:
	3361	/*
	3362	* make sure all async reads that are part of this stream
	3363	* have completed before we return
	3364	*/
	3365	lck_mtx_lock(cl_mtxp);
	3366
	3367	while (iostate.io_issued != iostate.io_completed) {
	3368	iostate.io_wanted = 1;
	3369	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
	3370	}
	3371	lck_mtx_unlock(cl_mtxp);
	3372
	3373	if (iostate.io_error)
	3374	retval = iostate.io_error;
	3375
	3376	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_END,
	3377	(int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
	3378
	3379	return (retval);
	3380	}
	3381
	3382
	3383	static int
	3384	cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
	3385	{
	3386	upl_page_info_t *pl;
	3387	upl_t upl;
	3388	vm_offset_t upl_offset;
	3389	addr64_t dst_paddr;
	3390	off_t max_size;
	3391	int io_size;
	3392	user_size_t iov_len;
	3393	user_addr_t iov_base;
	3394	int tail_size;
	3395	int upl_size;
	3396	int upl_needed_size;
	3397	int pages_in_pl;
	3398	int upl_flags;
	3399	kern_return_t kret;
	3400	struct clios iostate;
	3401	int error;
	3402	int devblocksize;
	3403
	3404	devblocksize = vp->v_mount->mnt_devblocksize;
	3405	/*
	3406	* When we enter this routine, we know
	3407	* -- the resid will not exceed iov_len
	3408	* -- the target address is physically contiguous
	3409	*/
	3410
	3411	#if LP64_DEBUG
	3412	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
	3413	panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	3414	}
	3415	#endif /* LP64_DEBUG */
	3416
	3417	iov_len = uio_curriovlen(uio);
	3418	iov_base = uio_curriovbase(uio);
	3419
	3420	max_size = filesize - uio->uio_offset;
	3421
	3422	// LP64todo - fix this!
	3423	if (max_size < 0 \|\| (u_int64_t)max_size > iov_len)
	3424	io_size = iov_len;
	3425	else
	3426	io_size = max_size;
	3427
	3428	// LP64todo - fix this!
	3429	upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
	3430	upl_needed_size = upl_offset + io_size;
	3431
	3432	error = 0;
	3433	pages_in_pl = 0;
	3434	upl_size = upl_needed_size;
	3435	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	3436
	3437	kret = vm_map_get_upl(current_map(),
	3438	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	3439	&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
	3440
	3441	if (kret != KERN_SUCCESS) {
	3442	/*
	3443	* cluster_phys_read: failed to get pagelist
	3444	*/
	3445	return(EINVAL);
	3446	}
	3447	if (upl_size < upl_needed_size) {
	3448	/*
	3449	* The upl_size wasn't satisfied.
	3450	*/
	3451	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3452
	3453	return(EINVAL);
	3454	}
	3455	pl = ubc_upl_pageinfo(upl);
	3456
	3457	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
	3458
	3459	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	3460	int head_size;
	3461
	3462	head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
	3463
	3464	if (head_size > io_size)
	3465	head_size = io_size;
	3466
	3467	error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
	3468
	3469	if (error) {
	3470	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3471
	3472	return(EINVAL);
	3473	}
	3474	upl_offset += head_size;
	3475	dst_paddr += head_size;
	3476	io_size -= head_size;
	3477	}
	3478	tail_size = io_size & (devblocksize - 1);
	3479	io_size -= tail_size;
	3480
	3481	iostate.io_completed = 0;
	3482	iostate.io_issued = 0;
	3483	iostate.io_error = 0;
	3484	iostate.io_wanted = 0;
	3485
	3486	while (io_size && error == 0) {
	3487	int xsize;
	3488
	3489	if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	3490	xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
	3491	else
	3492	xsize = io_size;
	3493	/*
	3494	* request asynchronously so that we can overlap
	3495	* the preparation of the next I/O... we'll do
	3496	* the commit after all the I/O has completed
	3497	* since its all issued against the same UPL
	3498	* if there are already too many outstanding reads
	3499	* wait until some have completed before issuing the next
	3500	*/
	3501	lck_mtx_lock(cl_mtxp);
	3502
	3503	while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
	3504	iostate.io_wanted = 1;
	3505	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
	3506	}
	3507	lck_mtx_unlock(cl_mtxp);
	3508
	3509	error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
	3510	CL_READ \| CL_NOZERO \| CL_DEV_MEMORY \| CL_ASYNC,
	3511	(buf_t)NULL, &iostate);
	3512	/*
	3513	* The cluster_io read was issued successfully,
	3514	* update the uio structure
	3515	*/
	3516	if (error == 0) {
	3517	uio_update(uio, (user_size_t)xsize);
	3518
	3519	dst_paddr += xsize;
	3520	upl_offset += xsize;
	3521	io_size -= xsize;
	3522	}
	3523	}
	3524	/*
	3525	* make sure all async reads that are part of this stream
	3526	* have completed before we proceed
	3527	*/
	3528	lck_mtx_lock(cl_mtxp);
	3529
	3530	while (iostate.io_issued != iostate.io_completed) {
	3531	iostate.io_wanted = 1;
	3532	msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
	3533	}
	3534	lck_mtx_unlock(cl_mtxp);
	3535
	3536	if (iostate.io_error)
	3537	error = iostate.io_error;
	3538
	3539	if (error == 0 && tail_size)
	3540	error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
	3541
	3542	/*
	3543	* just release our hold on the physically contiguous
	3544	* region without changing any state
	3545	*/
	3546	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3547
	3548	return (error);
	3549	}
	3550
	3551
	3552	/*
	3553	* generate advisory I/O's in the largest chunks possible
	3554	* the completed pages will be released into the VM cache
	3555	*/
	3556	int
	3557	advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
	3558	{
	3559	upl_page_info_t *pl;
	3560	upl_t upl;
	3561	vm_offset_t upl_offset;
	3562	int upl_size;
	3563	off_t upl_f_offset;
	3564	int start_offset;
	3565	int start_pg;
	3566	int last_pg;
	3567	int pages_in_upl;
	3568	off_t max_size;
	3569	int io_size;
	3570	kern_return_t kret;
	3571	int retval = 0;
	3572	int issued_io;
	3573	int skip_range;
	3574
	3575	if ( !UBCINFOEXISTS(vp))
	3576	return(EINVAL);
	3577
	3578	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_START,
	3579	(int)f_offset, resid, (int)filesize, 0, 0);
	3580
	3581	while (resid && f_offset < filesize && retval == 0) {
	3582	/*
	3583	* compute the size of the upl needed to encompass
	3584	* the requested read... limit each call to cluster_io
	3585	* to the maximum UPL size... cluster_io will clip if
	3586	* this exceeds the maximum io_size for the device,
	3587	* make sure to account for
	3588	* a starting offset that's not page aligned
	3589	*/
	3590	start_offset = (int)(f_offset & PAGE_MASK_64);
	3591	upl_f_offset = f_offset - (off_t)start_offset;
	3592	max_size = filesize - f_offset;
	3593
	3594	if (resid < max_size)
	3595	io_size = resid;
	3596	else
	3597	io_size = max_size;
	3598
	3599	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	3600	if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
	3601	upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	3602
	3603	skip_range = 0;
	3604	/*
	3605	* return the number of contiguously present pages in the cache
	3606	* starting at upl_f_offset within the file
	3607	*/
	3608	ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
	3609
	3610	if (skip_range) {
	3611	/*
	3612	* skip over pages already present in the cache
	3613	*/
	3614	io_size = skip_range - start_offset;
	3615
	3616	f_offset += io_size;
	3617	resid -= io_size;
	3618
	3619	if (skip_range == upl_size)
	3620	continue;
	3621	/*
	3622	* have to issue some real I/O
	3623	* at this point, we know it's starting on a page boundary
	3624	* because we've skipped over at least the first page in the request
	3625	*/
	3626	start_offset = 0;
	3627	upl_f_offset += skip_range;
	3628	upl_size -= skip_range;
	3629	}
	3630	pages_in_upl = upl_size / PAGE_SIZE;
	3631
	3632	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_START,
	3633	(int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
	3634
	3635	kret = ubc_create_upl(vp,
	3636	upl_f_offset,
	3637	upl_size,
	3638	&upl,
	3639	&pl,
	3640	UPL_RET_ONLY_ABSENT \| UPL_SET_LITE);
	3641	if (kret != KERN_SUCCESS)
	3642	return(retval);
	3643	issued_io = 0;
	3644
	3645	/*
	3646	* before we start marching forward, we must make sure we end on
	3647	* a present page, otherwise we will be working with a freed
	3648	* upl
	3649	*/
	3650	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	3651	if (upl_page_present(pl, last_pg))
	3652	break;
	3653	}
	3654	pages_in_upl = last_pg + 1;
	3655
	3656
	3657	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_END,
	3658	(int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
	3659
	3660
	3661	for (last_pg = 0; last_pg < pages_in_upl; ) {
	3662	/*
	3663	* scan from the beginning of the upl looking for the first
	3664	* page that is present.... this will become the first page in
	3665	* the request we're going to make to 'cluster_io'... if all
	3666	* of the pages are absent, we won't call through to 'cluster_io'
	3667	*/
	3668	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	3669	if (upl_page_present(pl, start_pg))
	3670	break;
	3671	}
	3672
	3673	/*
	3674	* scan from the starting present page looking for an absent
	3675	* page before the end of the upl is reached, if we
	3676	* find one, then it will terminate the range of pages being
	3677	* presented to 'cluster_io'
	3678	*/
	3679	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	3680	if (!upl_page_present(pl, last_pg))
	3681	break;
	3682	}
	3683
	3684	if (last_pg > start_pg) {
	3685	/*
	3686	* we found a range of pages that must be filled
	3687	* if the last page in this range is the last page of the file
	3688	* we may have to clip the size of it to keep from reading past
	3689	* the end of the last physical block associated with the file
	3690	*/
	3691	upl_offset = start_pg * PAGE_SIZE;
	3692	io_size = (last_pg - start_pg) * PAGE_SIZE;
	3693
	3694	if ((upl_f_offset + upl_offset + io_size) > filesize)
	3695	io_size = filesize - (upl_f_offset + upl_offset);
	3696
	3697	/*
	3698	* issue an asynchronous read to cluster_io
	3699	*/
	3700	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	3701	CL_ASYNC \| CL_READ \| CL_COMMIT \| CL_AGE, (buf_t)NULL, (struct clios *)NULL);
	3702
	3703	issued_io = 1;
	3704	}
	3705	}
	3706	if (issued_io == 0)
	3707	ubc_upl_abort(upl, 0);
	3708
	3709	io_size = upl_size - start_offset;
	3710
	3711	if (io_size > resid)
	3712	io_size = resid;
	3713	f_offset += io_size;
	3714	resid -= io_size;
	3715	}
	3716
	3717	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_END,
	3718	(int)f_offset, resid, retval, 0, 0);
	3719
	3720	return(retval);
	3721	}
	3722
	3723
	3724	int
	3725	cluster_push(vnode_t vp, int flags)
	3726	{
	3727	int retval;
	3728	struct cl_writebehind *wbp;
	3729
	3730	if ( !UBCINFOEXISTS(vp)) {
	3731	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
	3732	return (0);
	3733	}
	3734	/* return if deferred write is set */
	3735	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
	3736	return (0);
	3737	}
	3738	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
	3739	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
	3740	return (0);
	3741	}
	3742	if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
	3743	lck_mtx_unlock(&wbp->cl_lockw);
	3744
	3745	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
	3746	return(0);
	3747	}
	3748	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_START,
	3749	(int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
	3750
	3751	if (wbp->cl_scmap) {
	3752	sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
	3753
	3754	retval = 1;
	3755	} else
	3756	retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
	3757
	3758	lck_mtx_unlock(&wbp->cl_lockw);
	3759
	3760	if (flags & IO_SYNC)
	3761	(void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
	3762
	3763	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_END,
	3764	(int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
	3765
	3766	return (retval);
	3767	}
	3768
	3769
	3770	__private_extern__ void
	3771	cluster_release(struct ubc_info *ubc)
	3772	{
	3773	struct cl_writebehind *wbp;
	3774	struct cl_readahead *rap;
	3775
	3776	if ((wbp = ubc->cl_wbehind)) {
	3777
	3778	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
	3779
	3780	if (wbp->cl_scmap)
	3781	vfs_drt_control(&(wbp->cl_scmap), 0);
	3782	} else {
	3783	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
	3784	}
	3785
	3786	rap = ubc->cl_rahead;
	3787
	3788	if (wbp != NULL) {
	3789	lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
	3790	FREE_ZONE((void )wbp, sizeof wbp, M_CLWRBEHIND);
	3791	}
	3792	if ((rap = ubc->cl_rahead)) {
	3793	lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
	3794	FREE_ZONE((void )rap, sizeof rap, M_CLRDAHEAD);
	3795	}
	3796	ubc->cl_rahead = NULL;
	3797	ubc->cl_wbehind = NULL;
	3798
	3799	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
	3800	}
	3801
	3802
	3803	static void
	3804	cluster_push_EOF(vnode_t vp, off_t EOF)
	3805	{
	3806	struct cl_writebehind *wbp;
	3807
	3808	wbp = cluster_get_wbp(vp, CLW_ALLOCATE \| CLW_RETURNLOCKED);
	3809
	3810	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_START,
	3811	(int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
	3812
	3813	if (wbp->cl_scmap)
	3814	sparse_cluster_push(wbp, vp, EOF, 1);
	3815	else
	3816	cluster_try_push(wbp, vp, EOF, 0, 1);
	3817
	3818	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_END,
	3819	(int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
	3820
	3821	lck_mtx_unlock(&wbp->cl_lockw);
	3822	}
	3823
	3824
	3825	static int
	3826	cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
	3827	{
	3828	int cl_index;
	3829	int cl_index1;
	3830	int min_index;
	3831	int cl_len;
	3832	int cl_pushed = 0;
	3833	struct cl_wextent l_clusters[MAX_CLUSTERS];
	3834
	3835	/*
	3836	* the write behind context exists and has
	3837	* already been locked...
	3838	*
	3839	* make a local 'sorted' copy of the clusters
	3840	* and clear wbp->cl_number so that new clusters can
	3841	* be developed
	3842	*/
	3843	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	3844	for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
	3845	if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
	3846	continue;
	3847	if (min_index == -1)
	3848	min_index = cl_index1;
	3849	else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
	3850	min_index = cl_index1;
	3851	}
	3852	if (min_index == -1)
	3853	break;
	3854	l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
	3855	l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
	3856	l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
	3857
	3858	wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	3859	}
	3860	wbp->cl_number = 0;
	3861
	3862	cl_len = cl_index;
	3863
	3864	if (can_delay && cl_len == MAX_CLUSTERS) {
	3865	int i;
	3866
	3867	/*
	3868	* determine if we appear to be writing the file sequentially
	3869	* if not, by returning without having pushed any clusters
	3870	* we will cause this vnode to be pushed into the sparse cluster mechanism
	3871	* used for managing more random I/O patterns
	3872	*
	3873	* we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
	3874	* that's why we're in try_push with can_delay true...
	3875	*
	3876	* check to make sure that all the clusters except the last one are 'full'... and that each cluster
	3877	* is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
	3878	* so we can just make a simple pass through, up to, but not including the last one...
	3879	* note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
	3880	* are sequential
	3881	*
	3882	* we let the last one be partial as long as it was adjacent to the previous one...
	3883	* we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
	3884	* of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
	3885	*/
	3886	for (i = 0; i < MAX_CLUSTERS - 1; i++) {
	3887	if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
	3888	goto dont_try;
	3889	if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
	3890	goto dont_try;
	3891	}
	3892	}
	3893	/*
	3894	* drop the lock while we're firing off the I/Os...
	3895	* this is safe since I'm working off of a private sorted copy
	3896	* of the clusters, and I'm going to re-evaluate the public
	3897	* state after I retake the lock
	3898	*/
	3899	lck_mtx_unlock(&wbp->cl_lockw);
	3900
	3901	for (cl_index = 0; cl_index < cl_len; cl_index++) {
	3902	int flags;
	3903	struct cl_extent cl;
	3904
	3905	/*
	3906	* try to push each cluster in turn...
	3907	*/
	3908	if (l_clusters[cl_index].io_nocache)
	3909	flags = IO_NOCACHE;
	3910	else
	3911	flags = 0;
	3912	cl.b_addr = l_clusters[cl_index].b_addr;
	3913	cl.e_addr = l_clusters[cl_index].e_addr;
	3914
	3915	cluster_push_x(vp, &cl, EOF, flags);
	3916
	3917	l_clusters[cl_index].b_addr = 0;
	3918	l_clusters[cl_index].e_addr = 0;
	3919
	3920	cl_pushed++;
	3921
	3922	if (push_all == 0)
	3923	break;
	3924	}
	3925	lck_mtx_lock(&wbp->cl_lockw);
	3926
	3927	dont_try:
	3928	if (cl_len > cl_pushed) {
	3929	/*
	3930	* we didn't push all of the clusters, so
	3931	* lets try to merge them back in to the vnode
	3932	*/
	3933	if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
	3934	/*
	3935	* we picked up some new clusters while we were trying to
	3936	* push the old ones... this can happen because I've dropped
	3937	* the vnode lock... the sum of the
	3938	* leftovers plus the new cluster count exceeds our ability
	3939	* to represent them, so switch to the sparse cluster mechanism
	3940	*
	3941	* collect the active public clusters...
	3942	*/
	3943	sparse_cluster_switch(wbp, vp, EOF);
	3944
	3945	for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
	3946	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
	3947	continue;
	3948	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	3949	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	3950	wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
	3951
	3952	cl_index1++;
	3953	}
	3954	/*
	3955	* update the cluster count
	3956	*/
	3957	wbp->cl_number = cl_index1;
	3958
	3959	/*
	3960	* and collect the original clusters that were moved into the
	3961	* local storage for sorting purposes
	3962	*/
	3963	sparse_cluster_switch(wbp, vp, EOF);
	3964
	3965	} else {
	3966	/*
	3967	* we've got room to merge the leftovers back in
	3968	* just append them starting at the next 'hole'
	3969	* represented by wbp->cl_number
	3970	*/
	3971	for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
	3972	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
	3973	continue;
	3974
	3975	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	3976	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	3977	wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
	3978
	3979	cl_index1++;
	3980	}
	3981	/*
	3982	* update the cluster count
	3983	*/
	3984	wbp->cl_number = cl_index1;
	3985	}
	3986	}
	3987	return(MAX_CLUSTERS - wbp->cl_number);
	3988	}
	3989
	3990
	3991
	3992	static int
	3993	cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
	3994	{
	3995	upl_page_info_t *pl;
	3996	upl_t upl;
	3997	vm_offset_t upl_offset;
	3998	int upl_size;
	3999	off_t upl_f_offset;
	4000	int pages_in_upl;
	4001	int start_pg;
	4002	int last_pg;
	4003	int io_size;
	4004	int io_flags;
	4005	int upl_flags;
	4006	int size;
	4007	int error = 0;
	4008	int retval;
	4009	kern_return_t kret;
	4010
	4011
	4012	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_START,
	4013	(int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
	4014
	4015	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
	4016	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 0, 0, 0, 0);
	4017
	4018	return (0);
	4019	}
	4020	upl_size = pages_in_upl * PAGE_SIZE;
	4021	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	4022
	4023	if (upl_f_offset + upl_size >= EOF) {
	4024
	4025	if (upl_f_offset >= EOF) {
	4026	/*
	4027	* must have truncated the file and missed
	4028	* clearing a dangling cluster (i.e. it's completely
	4029	* beyond the new EOF
	4030	*/
	4031	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 1, 0, 0, 0);
	4032
	4033	return(0);
	4034	}
	4035	size = EOF - upl_f_offset;
	4036
	4037	upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	4038	pages_in_upl = upl_size / PAGE_SIZE;
	4039	} else
	4040	size = upl_size;
	4041
	4042	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, size, 0, 0, 0);
	4043
	4044	/*
	4045	* by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	4046	*
	4047	* - only pages that are currently dirty are returned... these are the ones we need to clean
	4048	* - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	4049	* - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	4050	* - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	4051	* someone dirties this page while the I/O is in progress, we don't lose track of the new state
	4052	*
	4053	* when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	4054	*/
	4055
	4056	if ((vp->v_flag & VNOCACHE_DATA) \|\| (flags & IO_NOCACHE))
	4057	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE \| UPL_WILL_BE_DUMPED;
	4058	else
	4059	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE;
	4060
	4061	kret = ubc_create_upl(vp,
	4062	upl_f_offset,
	4063	upl_size,
	4064	&upl,
	4065	&pl,
	4066	upl_flags);
	4067	if (kret != KERN_SUCCESS)
	4068	panic("cluster_push: failed to get pagelist");
	4069
	4070	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
	4071
	4072	/*
	4073	* since we only asked for the dirty pages back
	4074	* it's possible that we may only get a few or even none, so...
	4075	* before we start marching forward, we must make sure we know
	4076	* where the last present page is in the UPL, otherwise we could
	4077	* end up working with a freed upl due to the FREE_ON_EMPTY semantics
	4078	* employed by commit_range and abort_range.
	4079	*/
	4080	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	4081	if (upl_page_present(pl, last_pg))
	4082	break;
	4083	}
	4084	pages_in_upl = last_pg + 1;
	4085
	4086	if (pages_in_upl == 0) {
	4087	ubc_upl_abort(upl, 0);
	4088
	4089	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 2, 0, 0, 0);
	4090	return(0);
	4091	}
	4092
	4093	for (last_pg = 0; last_pg < pages_in_upl; ) {
	4094	/*
	4095	* find the next dirty page in the UPL
	4096	* this will become the first page in the
	4097	* next I/O to generate
	4098	*/
	4099	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	4100	if (upl_dirty_page(pl, start_pg))
	4101	break;
	4102	if (upl_page_present(pl, start_pg))
	4103	/*
	4104	* RET_ONLY_DIRTY will return non-dirty 'precious' pages
	4105	* just release these unchanged since we're not going
	4106	* to steal them or change their state
	4107	*/
	4108	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	4109	}
	4110	if (start_pg >= pages_in_upl)
	4111	/*
	4112	* done... no more dirty pages to push
	4113	*/
	4114	break;
	4115	if (start_pg > last_pg)
	4116	/*
	4117	* skipped over some non-dirty pages
	4118	*/
	4119	size -= ((start_pg - last_pg) * PAGE_SIZE);
	4120
	4121	/*
	4122	* find a range of dirty pages to write
	4123	*/
	4124	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	4125	if (!upl_dirty_page(pl, last_pg))
	4126	break;
	4127	}
	4128	upl_offset = start_pg * PAGE_SIZE;
	4129
	4130	io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
	4131
	4132	io_flags = CL_THROTTLE \| CL_COMMIT;
	4133
	4134	if ( !(flags & IO_SYNC))
	4135	io_flags \|= CL_ASYNC;
	4136
	4137	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	4138	io_flags, (buf_t)NULL, (struct clios *)NULL);
	4139
	4140	if (error == 0 && retval)
	4141	error = retval;
	4142
	4143	size -= io_size;
	4144	}
	4145	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 3, 0, 0, 0);
	4146
	4147	return(error);
	4148	}
	4149
	4150
	4151	/*
	4152	* sparse_cluster_switch is called with the write behind lock held
	4153	*/
	4154	static void
	4155	sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
	4156	{
	4157	int cl_index;
	4158
	4159	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
	4160
	4161	if (wbp->cl_scmap == NULL)
	4162	wbp->cl_scdirty = 0;
	4163
	4164	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	4165	int flags;
	4166	struct cl_extent cl;
	4167
	4168	for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
	4169
	4170	if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
	4171	if (flags & UPL_POP_DIRTY) {
	4172	cl.e_addr = cl.b_addr + 1;
	4173
	4174	sparse_cluster_add(wbp, vp, &cl, EOF);
	4175	}
	4176	}
	4177	}
	4178	}
	4179	wbp->cl_number = 0;
	4180
	4181	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
	4182	}
	4183
	4184
	4185	/*
	4186	* sparse_cluster_push is called with the write behind lock held
	4187	*/
	4188	static void
	4189	sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
	4190	{
	4191	struct cl_extent cl;
	4192	off_t offset;
	4193	u_int length;
	4194
	4195	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
	4196
	4197	if (push_all)
	4198	vfs_drt_control(&(wbp->cl_scmap), 1);
	4199
	4200	for (;;) {
	4201	if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
	4202	break;
	4203
	4204	cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
	4205	cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
	4206
	4207	wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
	4208
	4209	cluster_push_x(vp, &cl, EOF, 0);
	4210
	4211	if (push_all == 0)
	4212	break;
	4213	}
	4214	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
	4215	}
	4216
	4217
	4218	/*
	4219	* sparse_cluster_add is called with the write behind lock held
	4220	*/
	4221	static void
	4222	sparse_cluster_add(struct cl_writebehind wbp, vnode_t vp, struct cl_extent cl, off_t EOF)
	4223	{
	4224	u_int new_dirty;
	4225	u_int length;
	4226	off_t offset;
	4227
	4228	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
	4229
	4230	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	4231	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
	4232
	4233	while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
	4234	/*
	4235	* no room left in the map
	4236	* only a partial update was done
	4237	* push out some pages and try again
	4238	*/
	4239	wbp->cl_scdirty += new_dirty;
	4240
	4241	sparse_cluster_push(wbp, vp, EOF, 0);
	4242
	4243	offset += (new_dirty * PAGE_SIZE_64);
	4244	length -= (new_dirty * PAGE_SIZE);
	4245	}
	4246	wbp->cl_scdirty += new_dirty;
	4247
	4248	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
	4249	}
	4250
	4251
	4252	static int
	4253	cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
	4254	{
	4255	upl_page_info_t *pl;
	4256	upl_t upl;
	4257	addr64_t ubc_paddr;
	4258	kern_return_t kret;
	4259	int error = 0;
	4260	int did_read = 0;
	4261	int abort_flags;
	4262	int upl_flags;
	4263
	4264	upl_flags = UPL_SET_LITE;
	4265	if (! (flags & CL_READ)) {
	4266	/*
	4267	* "write" operation: let the UPL subsystem know
	4268	* that we intend to modify the buffer cache pages
	4269	* we're gathering.
	4270	*/
	4271	upl_flags \|= UPL_WILL_MODIFY;
	4272	}
	4273
	4274	kret = ubc_create_upl(vp,
	4275	uio->uio_offset & ~PAGE_MASK_64,
	4276	PAGE_SIZE,
	4277	&upl,
	4278	&pl,
	4279	upl_flags);
	4280
	4281	if (kret != KERN_SUCCESS)
	4282	return(EINVAL);
	4283
	4284	if (!upl_valid_page(pl, 0)) {
	4285	/*
	4286	* issue a synchronous read to cluster_io
	4287	*/
	4288	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	4289	CL_READ, (buf_t)NULL, (struct clios *)NULL);
	4290	if (error) {
	4291	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	4292
	4293	return(error);
	4294	}
	4295	did_read = 1;
	4296	}
	4297	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
	4298
	4299	/*
	4300	* NOTE: There is no prototype for the following in BSD. It, and the definitions
	4301	* of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	4302	* osfmk/ppc/mappings.h. They are not included here because there appears to be no
	4303	* way to do so without exporting them to kexts as well.
	4304	*/
	4305	if (flags & CL_READ)
	4306	// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsnk); /* Copy physical to physical and flush the destination */
	4307	copypv(ubc_paddr, usr_paddr, xsize, 2 \| 1 \| 4); /* Copy physical to physical and flush the destination */
	4308	else
	4309	// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsrc); /* Copy physical to physical and flush the source */
	4310	copypv(usr_paddr, ubc_paddr, xsize, 2 \| 1 \| 8); /* Copy physical to physical and flush the source */
	4311
	4312	if ( !(flags & CL_READ) \|\| (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
	4313	/*
	4314	* issue a synchronous write to cluster_io
	4315	*/
	4316	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	4317	0, (buf_t)NULL, (struct clios *)NULL);
	4318	}
	4319	if (error == 0)
	4320	uio_update(uio, (user_size_t)xsize);
	4321
	4322	if (did_read)
	4323	abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	4324	else
	4325	abort_flags = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	4326
	4327	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
	4328
	4329	return (error);
	4330	}
	4331
	4332
	4333
	4334	int
	4335	cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
	4336	{
	4337	int pg_offset;
	4338	int pg_index;
	4339	int csize;
	4340	int segflg;
	4341	int retval = 0;
	4342	upl_page_info_t *pl;
	4343
	4344	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	4345	(int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
	4346
	4347	segflg = uio->uio_segflg;
	4348
	4349	switch(segflg) {
	4350
	4351	case UIO_USERSPACE32:
	4352	case UIO_USERISPACE32:
	4353	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	4354	break;
	4355
	4356	case UIO_USERSPACE:
	4357	case UIO_USERISPACE:
	4358	uio->uio_segflg = UIO_PHYS_USERSPACE;
	4359	break;
	4360
	4361	case UIO_USERSPACE64:
	4362	case UIO_USERISPACE64:
	4363	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	4364	break;
	4365
	4366	case UIO_SYSSPACE32:
	4367	uio->uio_segflg = UIO_PHYS_SYSSPACE32;
	4368	break;
	4369
	4370	case UIO_SYSSPACE:
	4371	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	4372	break;
	4373
	4374	case UIO_SYSSPACE64:
	4375	uio->uio_segflg = UIO_PHYS_SYSSPACE64;
	4376	break;
	4377	}
	4378	pl = ubc_upl_pageinfo(upl);
	4379
	4380	pg_index = upl_offset / PAGE_SIZE;
	4381	pg_offset = upl_offset & PAGE_MASK;
	4382	csize = min(PAGE_SIZE - pg_offset, xsize);
	4383
	4384	while (xsize && retval == 0) {
	4385	addr64_t paddr;
	4386
	4387	paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
	4388
	4389	retval = uiomove64(paddr, csize, uio);
	4390
	4391	pg_index += 1;
	4392	pg_offset = 0;
	4393	xsize -= csize;
	4394	csize = min(PAGE_SIZE, xsize);
	4395	}
	4396	uio->uio_segflg = segflg;
	4397
	4398	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	4399	(int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
	4400
	4401	return (retval);
	4402	}
	4403
	4404
	4405	int
	4406	cluster_copy_ubc_data(vnode_t vp, struct uio uio, int io_resid, int mark_dirty)
	4407	{
	4408	int segflg;
	4409	int io_size;
	4410	int xsize;
	4411	int start_offset;
	4412	int retval = 0;
	4413	memory_object_control_t control;
	4414
	4415
	4416	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	4417	(int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
	4418
	4419	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	4420	if (control == MEMORY_OBJECT_CONTROL_NULL) {
	4421	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	4422	(int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
	4423
	4424	return(0);
	4425	}
	4426	segflg = uio->uio_segflg;
	4427
	4428	switch(segflg) {
	4429
	4430	case UIO_USERSPACE32:
	4431	case UIO_USERISPACE32:
	4432	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	4433	break;
	4434
	4435	case UIO_USERSPACE64:
	4436	case UIO_USERISPACE64:
	4437	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	4438	break;
	4439
	4440	case UIO_SYSSPACE32:
	4441	uio->uio_segflg = UIO_PHYS_SYSSPACE32;
	4442	break;
	4443
	4444	case UIO_SYSSPACE64:
	4445	uio->uio_segflg = UIO_PHYS_SYSSPACE64;
	4446	break;
	4447
	4448	case UIO_USERSPACE:
	4449	case UIO_USERISPACE:
	4450	uio->uio_segflg = UIO_PHYS_USERSPACE;
	4451	break;
	4452
	4453	case UIO_SYSSPACE:
	4454	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	4455	break;
	4456	}
	4457
	4458	if ( (io_size = *io_resid) ) {
	4459	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	4460	xsize = uio_resid(uio);
	4461
	4462	retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
	4463	uio, start_offset, io_size, mark_dirty);
	4464	xsize -= uio_resid(uio);
	4465	io_size -= xsize;
	4466	}
	4467	uio->uio_segflg = segflg;
	4468	*io_resid = io_size;
	4469
	4470	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	4471	(int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 \| segflg, 0);
	4472
	4473	return(retval);
	4474	}
	4475
	4476
	4477	int
	4478	is_file_clean(vnode_t vp, off_t filesize)
	4479	{
	4480	off_t f_offset;
	4481	int flags;
	4482	int total_dirty = 0;
	4483
	4484	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
	4485	if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
	4486	if (flags & UPL_POP_DIRTY) {
	4487	total_dirty++;
	4488	}
	4489	}
	4490	}
	4491	if (total_dirty)
	4492	return(EINVAL);
	4493
	4494	return (0);
	4495	}
	4496
	4497
	4498
	4499	/*
	4500	* Dirty region tracking/clustering mechanism.
	4501	*
	4502	* This code (vfs_drt_*) provides a mechanism for tracking and clustering
	4503	* dirty regions within a larger space (file). It is primarily intended to
	4504	* support clustering in large files with many dirty areas.
	4505	*
	4506	* The implementation assumes that the dirty regions are pages.
	4507	*
	4508	* To represent dirty pages within the file, we store bit vectors in a
	4509	* variable-size circular hash.
	4510	*/
	4511
	4512	/*
	4513	* Bitvector size. This determines the number of pages we group in a
	4514	* single hashtable entry. Each hashtable entry is aligned to this
	4515	* size within the file.
	4516	*/
	4517	#define DRT_BITVECTOR_PAGES 256
	4518
	4519	/*
	4520	* File offset handling.
	4521	*
	4522	* DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
	4523	* the correct formula is (~(DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
	4524	*/
	4525	#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
	4526	#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
	4527
	4528	/*
	4529	* Hashtable address field handling.
	4530	*
	4531	* The low-order bits of the hashtable address are used to conserve
	4532	* space.
	4533	*
	4534	* DRT_HASH_COUNT_MASK must be large enough to store the range
	4535	* 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
	4536	* to indicate that the bucket is actually unoccupied.
	4537	*/
	4538	#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
	4539	#define DRT_HASH_SET_ADDRESS(scm, i, a) \
	4540	do { \
	4541	(scm)->scm_hashtable[(i)].dhe_control = \
	4542	((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) \| DRT_ALIGN_ADDRESS(a); \
	4543	} while (0)
	4544	#define DRT_HASH_COUNT_MASK 0x1ff
	4545	#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
	4546	#define DRT_HASH_SET_COUNT(scm, i, c) \
	4547	do { \
	4548	(scm)->scm_hashtable[(i)].dhe_control = \
	4549	((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) \| ((c) & DRT_HASH_COUNT_MASK); \
	4550	} while (0)
	4551	#define DRT_HASH_CLEAR(scm, i) \
	4552	do { \
	4553	(scm)->scm_hashtable[(i)].dhe_control = 0; \
	4554	} while (0)
	4555	#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
	4556	#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
	4557	#define DRT_HASH_COPY(oscm, oi, scm, i) \
	4558	do { \
	4559	(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
	4560	DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	4561	} while(0);
	4562
	4563
	4564	/*
	4565	* Hash table moduli.
	4566	*
	4567	* Since the hashtable entry's size is dependent on the size of
	4568	* the bitvector, and since the hashtable size is constrained to
	4569	* both being prime and fitting within the desired allocation
	4570	* size, these values need to be manually determined.
	4571	*
	4572	* For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
	4573	*
	4574	* The small hashtable allocation is 1024 bytes, so the modulus is 23.
	4575	* The large hashtable allocation is 16384 bytes, so the modulus is 401.
	4576	*/
	4577	#define DRT_HASH_SMALL_MODULUS 23
	4578	#define DRT_HASH_LARGE_MODULUS 401
	4579
	4580	#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
	4581	#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
	4582
	4583	/* * nothing below here has secret dependencies on DRT_BITVECTOR_PAGES * */
	4584
	4585	/*
	4586	* Hashtable bitvector handling.
	4587	*
	4588	* Bitvector fields are 32 bits long.
	4589	*/
	4590
	4591	#define DRT_HASH_SET_BIT(scm, i, bit) \
	4592	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] \|= (1 << ((bit) % 32))
	4593
	4594	#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
	4595	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
	4596
	4597	#define DRT_HASH_TEST_BIT(scm, i, bit) \
	4598	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
	4599
	4600	#define DRT_BITVECTOR_CLEAR(scm, i) \
	4601	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	4602
	4603	#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
	4604	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
	4605	&(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
	4606	(DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	4607
	4608
	4609
	4610	/*
	4611	* Hashtable entry.
	4612	*/
	4613	struct vfs_drt_hashentry {
	4614	u_int64_t dhe_control;
	4615	u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
	4616	};
	4617
	4618	/*
	4619	* Dirty Region Tracking structure.
	4620	*
	4621	* The hashtable is allocated entirely inside the DRT structure.
	4622	*
	4623	* The hash is a simple circular prime modulus arrangement, the structure
	4624	* is resized from small to large if it overflows.
	4625	*/
	4626
	4627	struct vfs_drt_clustermap {
	4628	u_int32_t scm_magic; /* sanity/detection */
	4629	#define DRT_SCM_MAGIC 0x12020003
	4630	u_int32_t scm_modulus; /* current ring size */
	4631	u_int32_t scm_buckets; /* number of occupied buckets */
	4632	u_int32_t scm_lastclean; /* last entry we cleaned */
	4633	u_int32_t scm_iskips; /* number of slot skips */
	4634
	4635	struct vfs_drt_hashentry scm_hashtable[0];
	4636	};
	4637
	4638
	4639	#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
	4640	#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
	4641
	4642	/*
	4643	* Debugging codes and arguments.
	4644	*/
	4645	#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
	4646	#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
	4647	#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
	4648	#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
	4649	#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
	4650	* dirty */
	4651	/* 0, setcount */
	4652	/* 1 (clean, no map) */
	4653	/* 2 (map alloc fail) */
	4654	/* 3, resid (partial) */
	4655	#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
	4656	#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
	4657	* lastclean, iskips */
	4658
	4659
	4660	static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
	4661	static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
	4662	static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	4663	u_int64_t offset, int *indexp);
	4664	static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	4665	u_int64_t offset,
	4666	int *indexp,
	4667	int recursed);
	4668	static kern_return_t vfs_drt_do_mark_pages(
	4669	void **cmapp,
	4670	u_int64_t offset,
	4671	u_int length,
	4672	int *setcountp,
	4673	int dirty);
	4674	static void vfs_drt_trace(
	4675	struct vfs_drt_clustermap *cmap,
	4676	int code,
	4677	int arg1,
	4678	int arg2,
	4679	int arg3,
	4680	int arg4);
	4681
	4682
	4683	/*
	4684	* Allocate and initialise a sparse cluster map.
	4685	*
	4686	* Will allocate a new map, resize or compact an existing map.
	4687	*
	4688	* XXX we should probably have at least one intermediate map size,
	4689	* as the 1:16 ratio seems a bit drastic.
	4690	*/
	4691	static kern_return_t
	4692	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
	4693	{
	4694	struct vfs_drt_clustermap cmap, ocmap;
	4695	kern_return_t kret;
	4696	u_int64_t offset;
	4697	int nsize, i, active_buckets, index, copycount;
	4698
	4699	ocmap = NULL;
	4700	if (cmapp != NULL)
	4701	ocmap = *cmapp;
	4702
	4703	/*
	4704	* Decide on the size of the new map.
	4705	*/
	4706	if (ocmap == NULL) {
	4707	nsize = DRT_HASH_SMALL_MODULUS;
	4708	} else {
	4709	/* count the number of active buckets in the old map */
	4710	active_buckets = 0;
	4711	for (i = 0; i < ocmap->scm_modulus; i++) {
	4712	if (!DRT_HASH_VACANT(ocmap, i) &&
	4713	(DRT_HASH_GET_COUNT(ocmap, i) != 0))
	4714	active_buckets++;
	4715	}
	4716	/*
	4717	* If we're currently using the small allocation, check to
	4718	* see whether we should grow to the large one.
	4719	*/
	4720	if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
	4721	/* if the ring is nearly full */
	4722	if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
	4723	nsize = DRT_HASH_LARGE_MODULUS;
	4724	} else {
	4725	nsize = DRT_HASH_SMALL_MODULUS;
	4726	}
	4727	} else {
	4728	/* already using the large modulus */
	4729	nsize = DRT_HASH_LARGE_MODULUS;
	4730	/*
	4731	* If the ring is completely full, there's
	4732	* nothing useful for us to do. Behave as
	4733	* though we had compacted into the new
	4734	* array and return.
	4735	*/
	4736	if (active_buckets >= DRT_HASH_LARGE_MODULUS)
	4737	return(KERN_SUCCESS);
	4738	}
	4739	}
	4740
	4741	/*
	4742	* Allocate and initialise the new map.
	4743	*/
	4744
	4745	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	4746	(nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	4747	if (kret != KERN_SUCCESS)
	4748	return(kret);
	4749	cmap->scm_magic = DRT_SCM_MAGIC;
	4750	cmap->scm_modulus = nsize;
	4751	cmap->scm_buckets = 0;
	4752	cmap->scm_lastclean = 0;
	4753	cmap->scm_iskips = 0;
	4754	for (i = 0; i < cmap->scm_modulus; i++) {
	4755	DRT_HASH_CLEAR(cmap, i);
	4756	DRT_HASH_VACATE(cmap, i);
	4757	DRT_BITVECTOR_CLEAR(cmap, i);
	4758	}
	4759
	4760	/*
	4761	* If there's an old map, re-hash entries from it into the new map.
	4762	*/
	4763	copycount = 0;
	4764	if (ocmap != NULL) {
	4765	for (i = 0; i < ocmap->scm_modulus; i++) {
	4766	/* skip empty buckets */
	4767	if (DRT_HASH_VACANT(ocmap, i) \|\|
	4768	(DRT_HASH_GET_COUNT(ocmap, i) == 0))
	4769	continue;
	4770	/* get new index */
	4771	offset = DRT_HASH_GET_ADDRESS(ocmap, i);
	4772	kret = vfs_drt_get_index(&cmap, offset, &index, 1);
	4773	if (kret != KERN_SUCCESS) {
	4774	/* XXX need to bail out gracefully here */
	4775	panic("vfs_drt: new cluster map mysteriously too small");
	4776	}
	4777	/* copy */
	4778	DRT_HASH_COPY(ocmap, i, cmap, index);
	4779	copycount++;
	4780	}
	4781	}
	4782
	4783	/* log what we've done */
	4784	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
	4785
	4786	/*
	4787	* It's important to ensure that *cmapp always points to
	4788	* a valid map, so we must overwrite it before freeing
	4789	* the old map.
	4790	*/
	4791	*cmapp = cmap;
	4792	if (ocmap != NULL) {
	4793	/* emit stats into trace buffer */
	4794	vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
	4795	ocmap->scm_modulus,
	4796	ocmap->scm_buckets,
	4797	ocmap->scm_lastclean,
	4798	ocmap->scm_iskips);
	4799
	4800	vfs_drt_free_map(ocmap);
	4801	}
	4802	return(KERN_SUCCESS);
	4803	}
	4804
	4805
	4806	/*
	4807	* Free a sparse cluster map.
	4808	*/
	4809	static kern_return_t
	4810	vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
	4811	{
	4812	kmem_free(kernel_map, (vm_offset_t)cmap,
	4813	(cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	4814	return(KERN_SUCCESS);
	4815	}
	4816
	4817
	4818	/*
	4819	* Find the hashtable slot currently occupied by an entry for the supplied offset.
	4820	*/
	4821	static kern_return_t
	4822	vfs_drt_search_index(struct vfs_drt_clustermap cmap, u_int64_t offset, int indexp)
	4823	{
	4824	int index, i;
	4825
	4826	offset = DRT_ALIGN_ADDRESS(offset);
	4827	index = DRT_HASH(cmap, offset);
	4828
	4829	/* traverse the hashtable */
	4830	for (i = 0; i < cmap->scm_modulus; i++) {
	4831
	4832	/*
	4833	* If the slot is vacant, we can stop.
	4834	*/
	4835	if (DRT_HASH_VACANT(cmap, index))
	4836	break;
	4837
	4838	/*
	4839	* If the address matches our offset, we have success.
	4840	*/
	4841	if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
	4842	*indexp = index;
	4843	return(KERN_SUCCESS);
	4844	}
	4845
	4846	/*
	4847	* Move to the next slot, try again.
	4848	*/
	4849	index = DRT_HASH_NEXT(cmap, index);
	4850	}
	4851	/*
	4852	* It's not there.
	4853	*/
	4854	return(KERN_FAILURE);
	4855	}
	4856
	4857	/*
	4858	* Find the hashtable slot for the supplied offset. If we haven't allocated
	4859	* one yet, allocate one and populate the address field. Note that it will
	4860	* not have a nonzero page count and thus will still technically be free, so
	4861	* in the case where we are called to clean pages, the slot will remain free.
	4862	*/
	4863	static kern_return_t
	4864	vfs_drt_get_index(struct vfs_drt_clustermap *cmapp, u_int64_t offset, int indexp, int recursed)
	4865	{
	4866	struct vfs_drt_clustermap *cmap;
	4867	kern_return_t kret;
	4868	int index, i;
	4869
	4870	cmap = *cmapp;
	4871
	4872	/* look for an existing entry */
	4873	kret = vfs_drt_search_index(cmap, offset, indexp);
	4874	if (kret == KERN_SUCCESS)
	4875	return(kret);
	4876
	4877	/* need to allocate an entry */
	4878	offset = DRT_ALIGN_ADDRESS(offset);
	4879	index = DRT_HASH(cmap, offset);
	4880
	4881	/* scan from the index forwards looking for a vacant slot */
	4882	for (i = 0; i < cmap->scm_modulus; i++) {
	4883	/* slot vacant? */
	4884	if (DRT_HASH_VACANT(cmap, index) \|\| DRT_HASH_GET_COUNT(cmap,index) == 0) {
	4885	cmap->scm_buckets++;
	4886	if (index < cmap->scm_lastclean)
	4887	cmap->scm_lastclean = index;
	4888	DRT_HASH_SET_ADDRESS(cmap, index, offset);
	4889	DRT_HASH_SET_COUNT(cmap, index, 0);
	4890	DRT_BITVECTOR_CLEAR(cmap, index);
	4891	*indexp = index;
	4892	vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
	4893	return(KERN_SUCCESS);
	4894	}
	4895	cmap->scm_iskips += i;
	4896	index = DRT_HASH_NEXT(cmap, index);
	4897	}
	4898
	4899	/*
	4900	* We haven't found a vacant slot, so the map is full. If we're not
	4901	* already recursed, try reallocating/compacting it.
	4902	*/
	4903	if (recursed)
	4904	return(KERN_FAILURE);
	4905	kret = vfs_drt_alloc_map(cmapp);
	4906	if (kret == KERN_SUCCESS) {
	4907	/* now try to insert again */
	4908	kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	4909	}
	4910	return(kret);
	4911	}
	4912
	4913	/*
	4914	* Implementation of set dirty/clean.
	4915	*
	4916	* In the 'clean' case, not finding a map is OK.
	4917	*/
	4918	static kern_return_t
	4919	vfs_drt_do_mark_pages(
	4920	void **private,
	4921	u_int64_t offset,
	4922	u_int length,
	4923	int *setcountp,
	4924	int dirty)
	4925	{
	4926	struct vfs_drt_clustermap cmap, *cmapp;
	4927	kern_return_t kret;
	4928	int i, index, pgoff, pgcount, setcount, ecount;
	4929
	4930	cmapp = (struct vfs_drt_clustermap **)private;
	4931	cmap = *cmapp;
	4932
	4933	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
	4934
	4935	if (setcountp != NULL)
	4936	*setcountp = 0;
	4937
	4938	/* allocate a cluster map if we don't already have one */
	4939	if (cmap == NULL) {
	4940	/* no cluster map, nothing to clean */
	4941	if (!dirty) {
	4942	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 1, 0, 0, 0);
	4943	return(KERN_SUCCESS);
	4944	}
	4945	kret = vfs_drt_alloc_map(cmapp);
	4946	if (kret != KERN_SUCCESS) {
	4947	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 2, 0, 0, 0);
	4948	return(kret);
	4949	}
	4950	}
	4951	setcount = 0;
	4952
	4953	/*
	4954	* Iterate over the length of the region.
	4955	*/
	4956	while (length > 0) {
	4957	/*
	4958	* Get the hashtable index for this offset.
	4959	*
	4960	* XXX this will add blank entries if we are clearing a range
	4961	* that hasn't been dirtied.
	4962	*/
	4963	kret = vfs_drt_get_index(cmapp, offset, &index, 0);
	4964	cmap = cmapp; / may have changed! */
	4965	/* this may be a partial-success return */
	4966	if (kret != KERN_SUCCESS) {
	4967	if (setcountp != NULL)
	4968	*setcountp = setcount;
	4969	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 3, (int)length, 0, 0);
	4970
	4971	return(kret);
	4972	}
	4973
	4974	/*
	4975	* Work out how many pages we're modifying in this
	4976	* hashtable entry.
	4977	*/
	4978	pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
	4979	pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
	4980
	4981	/*
	4982	* Iterate over pages, dirty/clearing as we go.
	4983	*/
	4984	ecount = DRT_HASH_GET_COUNT(cmap, index);
	4985	for (i = 0; i < pgcount; i++) {
	4986	if (dirty) {
	4987	if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	4988	DRT_HASH_SET_BIT(cmap, index, pgoff + i);
	4989	ecount++;
	4990	setcount++;
	4991	}
	4992	} else {
	4993	if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	4994	DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
	4995	ecount--;
	4996	setcount++;
	4997	}
	4998	}
	4999	}
	5000	DRT_HASH_SET_COUNT(cmap, index, ecount);
	5001
	5002	offset += pgcount * PAGE_SIZE;
	5003	length -= pgcount * PAGE_SIZE;
	5004	}
	5005	if (setcountp != NULL)
	5006	*setcountp = setcount;
	5007
	5008	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 0, setcount, 0, 0);
	5009
	5010	return(KERN_SUCCESS);
	5011	}
	5012
	5013	/*
	5014	* Mark a set of pages as dirty/clean.
	5015	*
	5016	* This is a public interface.
	5017	*
	5018	* cmapp
	5019	* Pointer to storage suitable for holding a pointer. Note that
	5020	* this must either be NULL or a value set by this function.
	5021	*
	5022	* size
	5023	* Current file size in bytes.
	5024	*
	5025	* offset
	5026	* Offset of the first page to be marked as dirty, in bytes. Must be
	5027	* page-aligned.
	5028	*
	5029	* length
	5030	* Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
	5031	*
	5032	* setcountp
	5033	* Number of pages newly marked dirty by this call (optional).
	5034	*
	5035	* Returns KERN_SUCCESS if all the pages were successfully marked.
	5036	*/
	5037	static kern_return_t
	5038	vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, int setcountp)
	5039	{
	5040	/* XXX size unused, drop from interface */
	5041	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
	5042	}
	5043
	#if 0
	/*
	 * Mark a set of pages as clean.  Currently compiled out.
	 *
	 * Forwards to vfs_drt_do_mark_pages with no set-count pointer and the
	 * 'dirty' flag cleared.
	 */
	static kern_return_t
	vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
	{
		kern_return_t	kret;

		kret = vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		return(kret);
	}
	#endif
	5051
	5052	/*
	5053	* Get a cluster of dirty pages.
	5054	*
	5055	* This is a public interface.
	5056	*
	5057	* cmapp
	5058	* Pointer to storage managed by drt_mark_pages. Note that this must
	5059	* be NULL or a value set by drt_mark_pages.
	5060	*
	5061	* offsetp
	5062	* Returns the byte offset into the file of the first page in the cluster.
	5063	*
	5064	* lengthp
	5065	* Returns the length in bytes of the cluster of dirty pages.
	5066	*
	5067	* Returns success if a cluster was found. If KERN_FAILURE is returned, there
	5068	* are no dirty pages meeting the minmum size criteria. Private storage will
	5069	* be released if there are no more dirty pages left in the map
	5070	*
	5071	*/
	5072	static kern_return_t
	5073	vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp)
	5074	{
	5075	struct vfs_drt_clustermap *cmap;
	5076	u_int64_t offset;
	5077	u_int length;
	5078	int index, i, j, fs, ls;
	5079
	5080	/* sanity */
	5081	if ((cmapp == NULL) \|\| (*cmapp == NULL))
	5082	return(KERN_FAILURE);
	5083	cmap = *cmapp;
	5084
	5085	/* walk the hashtable */
	5086	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
	5087	index = DRT_HASH(cmap, offset);
	5088
	5089	if (DRT_HASH_VACANT(cmap, index) \|\| (DRT_HASH_GET_COUNT(cmap, index) == 0))
	5090	continue;
	5091
	5092	/* scan the bitfield for a string of bits */
	5093	fs = -1;
	5094
	5095	for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
	5096	if (DRT_HASH_TEST_BIT(cmap, index, i)) {
	5097	fs = i;
	5098	break;
	5099	}
	5100	}
	5101	if (fs == -1) {
	5102	/* didn't find any bits set */
	5103	panic("vfs_drt: entry summary count > 0 but no bits set in map");
	5104	}
	5105	for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
	5106	if (!DRT_HASH_TEST_BIT(cmap, index, i))
	5107	break;
	5108	}
	5109
	5110	/* compute offset and length, mark pages clean */
	5111	offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
	5112	length = ls * PAGE_SIZE;
	5113	vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
	5114	cmap->scm_lastclean = index;
	5115
	5116	/* return successful */
	5117	*offsetp = (off_t)offset;
	5118	*lengthp = length;
	5119
	5120	vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
	5121	return(KERN_SUCCESS);
	5122	}
	5123	/*
	5124	* We didn't find anything... hashtable is empty
	5125	* emit stats into trace buffer and
	5126	* then free it
	5127	*/
	5128	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	5129	cmap->scm_modulus,
	5130	cmap->scm_buckets,
	5131	cmap->scm_lastclean,
	5132	cmap->scm_iskips);
	5133
	5134	vfs_drt_free_map(cmap);
	5135	*cmapp = NULL;
	5136
	5137	return(KERN_FAILURE);
	5138	}
	5139
	5140
	5141	static kern_return_t
	5142	vfs_drt_control(void **cmapp, int op_type)
	5143	{
	5144	struct vfs_drt_clustermap *cmap;
	5145
	5146	/* sanity */
	5147	if ((cmapp == NULL) \|\| (*cmapp == NULL))
	5148	return(KERN_FAILURE);
	5149	cmap = *cmapp;
	5150
	5151	switch (op_type) {
	5152	case 0:
	5153	/* emit stats into trace buffer */
	5154	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	5155	cmap->scm_modulus,
	5156	cmap->scm_buckets,
	5157	cmap->scm_lastclean,
	5158	cmap->scm_iskips);
	5159
	5160	vfs_drt_free_map(cmap);
	5161	*cmapp = NULL;
	5162	break;
	5163
	5164	case 1:
	5165	cmap->scm_lastclean = 0;
	5166	break;
	5167	}
	5168	return(KERN_SUCCESS);
	5169	}
	5170
	5171
	5172
	5173	/*
	5174	* Emit a summary of the state of the clustermap into the trace buffer
	5175	* along with some caller-provided data.
	5176	*/
	5177	#if KDEBUG
	5178	static void
	5179	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
	5180	{
	5181	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
	5182	}
	5183	#else
	5184	static void
	5185	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	5186	__unused int arg1, __unused int arg2, __unused int arg3,
	5187	__unused int arg4)
	5188	{
	5189	}
	5190	#endif
	5191
	#if 0
	/*
	 * Debugging aid, currently compiled out: for every non-vacant slot,
	 * recount the bits set in its bitvector and panic if the total disagrees
	 * with the summary count stored in the slot.
	 */
	static void
	vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
	{
		int slot;

		for (slot = 0; slot < cmap->scm_modulus; slot++) {
			int page;
			int nset = 0;

			if (!DRT_HASH_VACANT(cmap, slot)) {
				/* tally the pages marked in this slot */
				page = 0;
				while (page < DRT_BITVECTOR_PAGES) {
					if (DRT_HASH_TEST_BIT(cmap, slot, page))
						nset++;
					page++;
				}
				if (nset != DRT_HASH_GET_COUNT(cmap, slot))
					panic("bits_on = %d, index = %d\n", nset, slot);
			}
		}
	}
	#endif