/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <sys/sdt.h>

#include <stdbool.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif


#define CL_READ		0x01
#define CL_WRITE	0x02
#define CL_ASYNC	0x04
#define CL_COMMIT	0x08
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_NOZERO	0x40
#define CL_PAGEIN	0x80
#define CL_DEV_MEMORY	0x100
#define CL_PRESERVE	0x200
#define CL_THROTTLE	0x400
#define CL_KEEPCACHED	0x800
#define CL_DIRECT_IO	0x1000
#define CL_PASSIVE	0x2000
#define CL_IOSTREAMING	0x4000
#define CL_CLOSE	0x8000
#define CL_ENCRYPTED	0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE	0x40000

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	(2 * MAX_UPL_SIZE_BYTES)

#define CLUSTER_IO_WAITING	((buf_t)1)

extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);

struct clios {
	lck_mtx_t io_mtxp;
	u_int  io_completed;	/* amount of io that has currently completed */
	u_int  io_issued;	/* amount of io that was successfully issued */
	int    io_error;	/* error code of first error encountered */
	int    io_wanted;	/* someone is sleeping waiting for a change in state */
};
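
/*
 * Usage sketch (illustrative note, not from the original sources): the
 * fields above implement a simple producer/consumer protocol for streamed
 * I/O.  The issuing thread advances io_issued as each buf is handed to
 * VNOP_STRATEGY; completion handlers advance io_completed under io_mtxp
 * and wake anyone who set io_wanted; cluster_iostate_wait() below then
 * sleeps until (io_issued - io_completed) drains down to its target.
 */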

struct cl_direct_read_lock {
	LIST_ENTRY(cl_direct_read_lock)	chain;
	int32_t				ref_count;
	vnode_t				vp;
	lck_rw_t			rw_lock;
};

#define CL_DIRECT_READ_LOCK_BUCKETS 61

static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
	cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];

static lck_spin_t cl_direct_read_spin_lock;

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;

#define	IO_UNKNOWN	0
#define	IO_DIRECT	1
#define IO_CONTIG	2
#define IO_COPY		3

#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define PUSH_SYNC	0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg, int *err);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static int sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);


/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;


/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	MAX_UPL_SIZE_BYTES
#define MAX_VECTS		16
/*
 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
 * allowing the caller to bypass the buffer cache.  For small I/Os (less than 16k),
 * we have not historically allowed the write to bypass the UBC.
 */
#define MIN_DIRECT_WRITE_SIZE	(16384)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#define PREFETCH		3
#define PREFETCH_SSD		2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);	/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead on SSDs*/


#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))
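/*
 * Worked example (illustrative numbers, not from the original sources):
 * with a 1 MB 'size', an mnt_ioscale of 2 and a non-SSD device,
 * MAX_PREFETCH evaluates to 1 MB * (2 * PREFETCH) = 6 MB of read-ahead.
 */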

int	ignore_is_ssd = 0;
int	speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT	0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");

	lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);

	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
		LIST_INIT(&cl_direct_read_locks[i]);
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch(type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = MAX_UPL_TRANSFER_BYTES;
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
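/*
 * Worked example (illustrative numbers, not from the original sources):
 * a device reporting mnt_segreadcnt = 32 and mnt_maxreadcnt = 512 KB
 * yields, for CL_READ, min(32 * 4 KB, 512 KB) = 128 KB; if that falls
 * below MAX_UPL_TRANSFER_BYTES it is raised to that old fixed limit,
 * otherwise it is simply truncated down to a PAGE_SIZE multiple.
 */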




#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info *ubc;
	struct cl_writebehind *wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}


static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}


static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	  io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
			return(0);

		if (io_size == 0)
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return(1);
	}
	return(0);
}


static int
cluster_is_throttled(vnode_t vp)
{
	return (throttle_io_will_be_throttled(-1, vp->v_mount));
}


static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{

	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}

static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
					  upl_offset_t upl_offset, upl_size_t size)
{
	if (!size)
		return;

	upl_t associated_upl = upl_associated_upl(upl);

	if (!associated_upl)
		return;

#if 0
	printf("1: %d %d\n", upl_offset, upl_offset + size);
#endif

	/*
	 * The associated UPL is page aligned to file offsets whereas the
	 * UPL it's attached to has different alignment requirements.  The
	 * upl_offset that we have refers to @upl.  The code that follows
	 * has to deal with the first and last pages in this transaction
	 * which might straddle pages in the associated UPL.  To keep
	 * track of these pages, we use the mark bits: if the mark bit is
	 * set, we know another transaction has completed its part of that
	 * page and so we can unlock that page here.
	 *
	 * The following illustrates what we have to deal with:
	 *
	 *    MEM u <------------ 1 PAGE ------------> e
	 *        +-------------+----------------------+-----------------
	 *        |             |######################|#################
	 *        +-------------+----------------------+-----------------
	 *   FILE | <--- a ---> o <------------ 1 PAGE ------------>
	 *
	 * So here we show a write to offset @o.  The data that is to be
	 * written is in a buffer that is not page aligned; it has offset
	 * @a in the page.  The upl that carries the data starts in memory
	 * at @u.  The associated upl starts in the file at offset @o.  A
	 * transaction will always end on a page boundary (like @e above)
	 * except for the very last transaction in the group.  We cannot
	 * unlock the page at @o in the associated upl until both the
	 * transaction ending at @e and the following transaction (that
	 * starts at @e) have completed.
	 */

	/*
	 * We record whether or not the two UPLs are aligned as the mark
	 * bit in the first page of @upl.
	 */
	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	bool is_unaligned = upl_page_get_mark(pl, 0);

	if (is_unaligned) {
		upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);

		upl_offset_t upl_end = upl_offset + size;
		assert(upl_end >= PAGE_SIZE);

		upl_size_t assoc_upl_size = upl_get_size(associated_upl);

		/*
		 * In the very first transaction in the group, upl_offset will
		 * not be page aligned, but after that it will be and in that
		 * case we want the preceding page in the associated UPL hence
		 * the minus one.
		 */
		assert(upl_offset);
		if (upl_offset)
			upl_offset = trunc_page_32(upl_offset - 1);

		lck_mtx_lock_spin(&iostate->io_mtxp);

		// Look at the first page...
		if (upl_offset
		    && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
			/*
			 * The first page isn't marked so let another transaction
			 * completion handle it.
			 */
			upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
			upl_offset += PAGE_SIZE;
		}

		// And now the last page...

		/*
		 * This needs to be > rather than >= because if it's equal, it
		 * means there's another transaction that is sharing the last
		 * page.
		 */
		if (upl_end > assoc_upl_size)
			upl_end = assoc_upl_size;
		else {
			upl_end = trunc_page_32(upl_end);
			const int last_pg = (upl_end >> PAGE_SHIFT) - 1;

			if (!upl_page_get_mark(assoc_pl, last_pg)) {
				/*
				 * The last page isn't marked so mark the page and let another
				 * transaction completion handle it.
				 */
				upl_page_set_mark(assoc_pl, last_pg, true);
				upl_end -= PAGE_SIZE;
			}
		}

		lck_mtx_unlock(&iostate->io_mtxp);

#if 0
		printf("2: %d %d\n", upl_offset, upl_end);
#endif

		if (upl_end <= upl_offset)
			return;

		size = upl_end - upl_offset;
	} else {
		assert(!(upl_offset & PAGE_MASK));
		assert(!(size & PAGE_MASK));
	}

	boolean_t empty;

	/*
	 * We can unlock these pages now and as this is for a
	 * direct/uncached write, we want to dump the pages too.
	 */
	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
					   UPL_ABORT_DUMP_PAGES, &empty);

	assert(!kr);

	if (!kr && empty) {
		upl_set_associated_upl(upl, NULL);
		upl_deallocate(associated_upl);
	}
}

static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}


static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	vnode_t	vp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				return 0;
			}

			if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);
				wakeup(cbp);

				return 0;
			}

			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	vp          = cbp->b_vp;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_upl;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}

	if (ISSET(b_flags, B_COMMIT_UPL)) {
		cluster_handle_associated_upl(iostate,
					      cbp_head->b_upl,
					      upl_offset,
					      transaction_size);
	}

	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
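		/*
		 * e.g. (illustrative numbers, 4 KB pages): pg_offset = 512 and
		 * transaction_size = 10000 gives commit_size =
		 * (512 + 10000 + 4095) & ~4095 = 12288, i.e. the 3 full pages
		 * the transaction touched.
		 */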

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}


uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}


void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t	*pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}


static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}

static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * Async callback completion will not normally generate a
		 * wakeup upon I/O completion.  To get woken up, we set
		 * b_trans_next (which is safe for us to modify) on the last
		 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
		 * to wake us up when all buffers as part of this transaction
		 * are completed.  This is done under the umbrella of
		 * cl_transaction_mtxp which is also taken in cluster_iodone.
		 */
		bool done = true;
		buf_t last = NULL;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
			if (!ISSET(cbp->b_flags, B_TDONE))
				done = false;
		}

		if (!done) {
			last->b_trans_next = CLUSTER_IO_WAITING;

			DTRACE_IO1(wait__start, buf_t, last);
			do {
				msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO+1), "cluster_wait_IO", NULL);

				/*
				 * We should only have been woken up if all the
				 * buffers are completed, but just in case...
				 */
				done = true;
				for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
					if (!ISSET(cbp->b_flags, B_TDONE)) {
						done = false;
						break;
					}
				}
			} while (!done);
			DTRACE_IO1(wait__done, buf_t, last);

			last->b_trans_next = NULL;
		}

		lck_mtx_unlock(cl_transaction_mtxp);
	} else { // !async
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
}

static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
		isswapout = TRUE;

	error = cluster_iodone(cbp, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
		else if (isswapout == TRUE)
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}


static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t	upl_end_offset;
	boolean_t	need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
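		/*
		 * e.g. (illustrative numbers, 4 KB pages): non_rounded_size = 1000
		 * and pg_offset = 512 gives size = ((1000 + 512 + 4095) & ~4095) - 512
		 * = 3584, so upl_offset + size (512 + 3584) lands on a page boundary.
		 */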
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size...
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				if (vp->v_mount->mnt_minsaturationbytecount) {
					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;

					scale = 1;
				} else {
					max_cluster_size = MAX_CLUSTER_SIZE(vp);

					if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
						scale = WRITE_THROTTLE_SSD;
					else
						scale = WRITE_THROTTLE;
				}
				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
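				/*
				 * e.g. (illustrative numbers): with scale = WRITE_THROTTLE (6),
				 * a 1 MB max_cluster_size and max_cluster = 256 KB, this
				 * permits (6 * 1 MB / 256 KB) - 1 = 23 async I/Os in flight,
				 * unless IO_SCALE(vp, VNODE_ASYNC_THROTTLE) caps it lower.
				 */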
			}
		}
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;

	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
		assert(ISSET(flags, CL_COMMIT));

		// For a direct/uncached write, we need to lock pages...

		upl_t cached_upl;

		/*
		 * Create a UPL to lock the pages in the cache whilst the
		 * write is in progress.
		 */
		ubc_create_upl(vp, f_offset, non_rounded_size, &cached_upl,
			       NULL, UPL_SET_LITE);

		/*
		 * Attach this UPL to the other UPL so that we can find it
		 * later.
		 */
		upl_set_associated_upl(upl, cached_upl);

		if (upl_offset & PAGE_MASK) {
			/*
			 * The two UPLs are not aligned, so mark the first page in
			 * @upl so that cluster_handle_associated_upl can handle
			 * it accordingly.
			 */
			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			upl_page_set_mark(pl, 0, true);
		}
	}

	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t	prev_cbp;
				int	bytes_in_last_page;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					bytes_in_last_page += cbp->b_bcount;
				bytes_in_last_page &= PAGE_MASK;

				while (bytes_in_last_page) {
					/*
					 * we've got a transaction that
					 * includes the page we're about to push out through vnode_pageout...
					 * find the bp's in the list which intersect this page and either
					 * remove them entirely from the transaction (there could be multiple bp's), or
					 * round its iosize down to the page boundary (there can only be one)...
					 *
					 * find the last bp in the list and act on it
					 */
					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
						prev_cbp = cbp;

					if (bytes_in_last_page >= cbp->b_bcount) {
						/*
						 * this buf no longer has any I/O associated with it
						 */
						bytes_in_last_page -= cbp->b_bcount;
						cbp->b_bcount = 0;

						free_io_buf(cbp);

						if (cbp == cbp_head) {
							assert(bytes_in_last_page == 0);
							/*
							 * the buf we just freed was the only buf in
							 * this transaction... so there's no I/O to do
							 */
							cbp_head = NULL;
							cbp_tail = NULL;
						} else {
							/*
							 * remove the buf we just freed from
							 * the transaction list
							 */
							prev_cbp->b_trans_next = NULL;
							cbp_tail = prev_cbp;
						}
					} else {
						/*
						 * this is the last bp that has I/O
						 * intersecting the page of interest
						 * only some of the I/O is in the intersection
						 * so clip the size but keep it in the transaction list
						 */
						cbp->b_bcount -= bytes_in_last_page;
						cbp_tail = cbp;
						bytes_in_last_page = 0;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0)
					flags &= ~CL_COMMIT;
				break;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / 0x1000);
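		/*
		 * note: lblkno is expressed in 4 KB (0x1000) block units; the
		 * CL_PAGEOUT path below scales its buf_invalblkno() loop by
		 * the same unit.
		 */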
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
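		/*
		 * e.g. (illustrative numbers, 4 KB pages): io_size = 10000 and
		 * pg_offset = 512 spans (10000 + 512 + 4095) / 4096 = 3 pages.
		 */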
1494
1495 if ((flags & CL_READ) && blkno == -1) {
1496 vm_offset_t commit_offset;
1497 int bytes_to_zero;
1498 int complete_transaction_now = 0;
1499
1500 /*
1501 * if we're reading and blkno == -1, then we've got a
1502 * 'hole' in the file that we need to deal with by zeroing
1503 * out the affected area in the upl
1504 */
1505 if (io_size >= (u_int)non_rounded_size) {
1506 /*
1507 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
1508 * than 'zero_offset' will be non-zero
1509 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
1510 * (indicated by the io_size finishing off the I/O request for this UPL)
1511 * than we're not going to issue an I/O for the
1512 * last page in this upl... we need to zero both the hole and the tail
1513 * of the page beyond the EOF, since the delayed zero-fill won't kick in
1514 */
1515 bytes_to_zero = non_rounded_size;
1516 if (!(flags & CL_NOZERO))
1517 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1518
1519 zero_offset = 0;
1520 } else
1521 bytes_to_zero = io_size;
1522
1523 pg_count = 0;
1524
1525 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
1526
1527 if (cbp_head) {
1528 int pg_resid;
1529
1530 /*
1531 * if there is a current I/O chain pending
1532 * then the first page of the group we just zero'd
1533 * will be handled by the I/O completion if the zero
1534 * fill started in the middle of the page
1535 */
1536 commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1537
1538 pg_resid = commit_offset - upl_offset;
1539
1540 if (bytes_to_zero >= pg_resid) {
1541 /*
1542 * the last page of the current I/O
1543 * has been completed...
1544 * compute the number of fully zero'd
1545 * pages that are beyond it
1546 * plus the last page if its partial
1547 * and we have no more I/O to issue...
1548 * otherwise a partial page is left
1549 * to begin the next I/O
1550 */
1551 if ((int)io_size >= non_rounded_size)
1552 pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
1553 else
1554 pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
1555
1556 complete_transaction_now = 1;
1557 }
1558 } else {
1559 /*
1560 * no pending I/O to deal with
1561 * so, commit all of the fully zero'd pages
1562 * plus the last page if its partial
1563 * and we have no more I/O to issue...
1564 * otherwise a partial page is left
1565 * to begin the next I/O
1566 */
1567 if ((int)io_size >= non_rounded_size)
1568 pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
1569 else
1570 pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
1571
1572 commit_offset = upl_offset & ~PAGE_MASK;
1573 }
1574
1575 // Associated UPL is currently only used in the direct write path
1576 assert(!upl_associated_upl(upl));
1577
1578 if ( (flags & CL_COMMIT) && pg_count) {
1579 ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
1580 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1581 }
1582 upl_offset += io_size;
1583 f_offset += io_size;
1584 size -= io_size;
1585
1586 /*
1587 * keep track of how much of the original request
1588 * that we've actually completed... non_rounded_size
1589 * may go negative due to us rounding the request
1590 * to a page size multiple (i.e. size > non_rounded_size)
1591 */
1592 non_rounded_size -= io_size;
1593
1594 if (non_rounded_size <= 0) {
1595 /*
1596 * we've transferred all of the data in the original
1597 * request, but we were unable to complete the tail
1598 * of the last page because the file didn't have
1599 * an allocation to back that portion... this is ok.
1600 */
1601 size = 0;
1602 }
1603 if (cbp_head && (complete_transaction_now || size == 0)) {
1604 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1605
1606 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1607
1608 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
1609
1610 trans_count = 0;
1611 }
1612 continue;
1613 }
1614 if (pg_count > max_vectors) {
1615 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
1616 io_size = PAGE_SIZE - pg_offset;
1617 pg_count = 1;
1618 } else {
1619 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
1620 pg_count = max_vectors;
1621 }
1622 }
1623 /*
1624 * If the transaction is going to reach the maximum number of
1625 * desired elements, truncate the i/o to the nearest page so
1626 * that the actual i/o is initiated after this buffer is
1627 * created and added to the i/o chain.
1628 *
1629 * I/O directed to physically contiguous memory
1630 * doesn't have a requirement to make sure we 'fill' a page
1631 */
1632 if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
1633 ((upl_offset + io_size) & PAGE_MASK)) {
1634 vm_offset_t aligned_ofs;
1635
1636 aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
1637 /*
1638 * If the io_size does not actually finish off even a
1639 * single page we have to keep adding buffers to the
1640 * transaction despite having reached the desired limit.
1641 *
1642 * Eventually we get here with the page being finished
1643 * off (and exceeded) and then we truncate the size of
1644 * this i/o request so that it is page aligned so that
1645 * we can finally issue the i/o on the transaction.
1646 */
1647 if (aligned_ofs > upl_offset) {
1648 io_size = aligned_ofs - upl_offset;
1649 pg_count--;
1650 }
1651 }
1652
1653 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
1654 /*
1655 * if we're not targeting a virtual device i.e. a disk image
1656 * it's safe to dip into the reserve pool since real devices
1657 * can complete this I/O request without requiring additional
1658 * bufs from the alloc_io_buf pool
1659 */
1660 priv = 1;
1661 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
1662 /*
1663 * Throttle the speculative IO
1664 */
1665 priv = 0;
1666 else
1667 priv = 1;
1668
1669 cbp = alloc_io_buf(vp, priv);
1670
1671 if (flags & CL_PAGEOUT) {
1672 u_int i;
1673
1674 /*
1675 * since blocks are in offsets of 0x1000, scale
1676 * iteration to (PAGE_SIZE * pg_count) of blks.
1677 */
1678 for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
1679 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
1680 panic("BUSY bp found in cluster_io");
1681 }
1682 }
1683 if (flags & CL_ASYNC) {
1684 if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
1685 panic("buf_setcallback failed\n");
1686 }
1687 cbp->b_cliodone = (void *)callback;
1688 cbp->b_flags |= io_flags;
1689 if (flags & CL_NOCACHE)
1690 cbp->b_attr.ba_flags |= BA_NOCACHE;
1691
1692 cbp->b_lblkno = lblkno;
1693 cbp->b_blkno = blkno;
1694 cbp->b_bcount = io_size;
1695
1696 if (buf_setupl(cbp, upl, upl_offset))
1697 panic("buf_setupl failed\n");
1698#if CONFIG_IOSCHED
1699 upl_set_blkno(upl, upl_offset, io_size, blkno);
1700#endif
1701 cbp->b_trans_next = (buf_t)NULL;
1702
1703 if ((cbp->b_iostate = (void *)iostate))
1704 /*
1705 * caller wants to track the state of this
1706 * io... bump the amount issued against this stream
1707 */
1708 iostate->io_issued += io_size;
1709
1710 if (flags & CL_READ) {
1711 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
1712 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1713 }
1714 else {
1715 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
1716 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
1717 }
1718
1719 if (cbp_head) {
1720 cbp_tail->b_trans_next = cbp;
1721 cbp_tail = cbp;
1722 } else {
1723 cbp_head = cbp;
1724 cbp_tail = cbp;
1725
1726 if ( (cbp_head->b_real_bp = real_bp) )
1727 real_bp = (buf_t)NULL;
1728 }
1729 *(buf_t *)(&cbp->b_trans_head) = cbp_head;
1730
1731 trans_count++;
1732
1733 upl_offset += io_size;
1734 f_offset += io_size;
1735 size -= io_size;
1736 /*
1737 * keep track of how much of the original request
1738 * that we've actually completed... non_rounded_size
1739 * may go negative due to us rounding the request
1740 * to a page size multiple (i.e. size > non_rounded_size)
1741 */
1742 non_rounded_size -= io_size;
1743
1744 if (non_rounded_size <= 0) {
1745 /*
1746 * we've transferred all of the data in the original
1747 * request, but we were unable to complete the tail
1748 * of the last page because the file didn't have
1749 * an allocation to back that portion... this is ok.
1750 */
1751 size = 0;
1752 }
1753 if (size == 0) {
1754 /*
1755 * we have no more I/O to issue, so go
1756 * finish the final transaction
1757 */
1758 need_EOT = TRUE;
1759 } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
1760 ((flags & CL_ASYNC) || trans_count > max_trans_count) ) {
1761 /*
1762 * I/O directed to physically contiguous memory...
1763 * which doesn't have a requirement to make sure we 'fill' a page
1764 * or...
1765 * the current I/O we've prepared fully
1766 * completes the last page in this request
1767 * and ...
1768 * it's either an ASYNC request or
1769 * we've already accumulated more than 8 I/O's into
1770 * this transaction so mark it as complete so that
1771 * it can finish asynchronously or via the cluster_complete_transaction
1772 * below if the request is synchronous
1773 */
1774 need_EOT = TRUE;
1775 }
1776 if (need_EOT == TRUE)
1777 cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
1778
1779 if (flags & CL_THROTTLE)
1780 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
1781
1782 if ( !(io_flags & B_READ))
1783 vnode_startwrite(vp);
1784
1785 if (flags & CL_RAW_ENCRYPTED) {
1786 /*
1787 * User requested raw encrypted bytes.
1788 * Twiddle the bit in the ba_flags for the buffer
1789 */
1790 cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
1791 }
1792
1793 (void) VNOP_STRATEGY(cbp);
1794
1795 if (need_EOT == TRUE) {
1796 if ( !(flags & CL_ASYNC))
1797 cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
1798
1799 need_EOT = FALSE;
1800 trans_count = 0;
1801 cbp_head = NULL;
1802 }
1803 }
1804 if (error) {
1805 int abort_size;
1806
1807 io_size = 0;
1808
1809 if (cbp_head) {
1810 /*
1811 * Wait until all of the outstanding I/O
1812 * for this partial transaction has completed
1813 */
1814 cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
1815
1816 /*
1817 * Rewind the upl offset to the beginning of the
1818 * transaction.
1819 */
1820 upl_offset = cbp_head->b_uploffset;
1821 }
1822
1823 if (ISSET(flags, CL_COMMIT)) {
1824 cluster_handle_associated_upl(iostate, upl, upl_offset,
1825 upl_end_offset - upl_offset);
1826 }
1827
1828 // Free all the IO buffers in this transaction
1829 for (cbp = cbp_head; cbp;) {
1830 buf_t cbp_next;
1831
1832 size += cbp->b_bcount;
1833 io_size += cbp->b_bcount;
1834
1835 cbp_next = cbp->b_trans_next;
1836 free_io_buf(cbp);
1837 cbp = cbp_next;
1838 }
1839
1840 if (iostate) {
1841 int need_wakeup = 0;
1842
1843 /*
1844 * update the error condition for this stream
1845 * since we never really issued the io
1846 * just go ahead and adjust it back
1847 */
1848 lck_mtx_lock_spin(&iostate->io_mtxp);
1849
1850 if (iostate->io_error == 0)
1851 iostate->io_error = error;
1852 iostate->io_issued -= io_size;
1853
1854 if (iostate->io_wanted) {
1855 /*
1856 * someone is waiting for the state of
1857 * this io stream to change
1858 */
1859 iostate->io_wanted = 0;
1860 need_wakeup = 1;
1861 }
1862 lck_mtx_unlock(&iostate->io_mtxp);
1863
1864 if (need_wakeup)
1865 wakeup((caddr_t)&iostate->io_wanted);
1866 }
1867
1868 if (flags & CL_COMMIT) {
1869 int upl_flags;
1870
1871 pg_offset = upl_offset & PAGE_MASK;
1872 abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
1873
1874 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
1875
1876 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1877 upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
1878 }
1879 if (retval == 0)
1880 retval = error;
1881 } else if (cbp_head)
1882 panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
1883
1884 if (real_bp) {
1885 /*
1886 * can get here if we either encountered an error
1887 * or we completely zero-filled the request and
1888 * no I/O was issued
1889 */
1890 if (error) {
1891 real_bp->b_flags |= B_ERROR;
1892 real_bp->b_error = error;
1893 }
1894 buf_biodone(real_bp);
1895 }
1896 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
1897
1898 return (retval);
1899}
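
/*
 * Editor's note: the sketch below is an illustrative model of the
 * transaction chaining performed by the I/O loop above, where each
 * buf_t is appended to a singly-linked transaction through
 * b_trans_next and records the head of its transaction in
 * b_trans_head.  It is not part of the original xnu source;
 * 'struct xbuf' and xbuf_append() are invented stand-ins for buf_t
 * and the inline list manipulation.
 */
#if 0	/* example only -- compiled out */
struct xbuf {
	struct xbuf	*next;	/* analogous to b_trans_next */
	struct xbuf	*head;	/* analogous to b_trans_head */
};

static void
xbuf_append(struct xbuf **headp, struct xbuf **tailp, struct xbuf *bp)
{
	bp->next = NULL;

	if (*headp == NULL)
		*headp = bp;		/* first buffer starts the transaction */
	else
		(*tailp)->next = bp;	/* otherwise chain off the current tail */

	*tailp = bp;
	bp->head = *headp;		/* every element can locate the head */
}
#endif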
1900
1901#define reset_vector_run_state() \
1902 issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
1903
1904static int
1905vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
1906 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1907{
1908 vector_upl_set_pagelist(vector_upl);
1909
1910 if(io_flag & CL_READ) {
1911 if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
1912 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
1913 else
1914 io_flag |= CL_PRESERVE; /*zero fill*/
1915 }
1916 return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
1917
1918}
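
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the CL_PRESERVE decision made in vector_cluster_io() above -- a read
 * may skip zero-filling only when it starts on a page boundary and covers
 * a whole number of pages; any partial page must be zero-filled.
 */
#if 0	/* example only -- compiled out */
static int
read_needs_zero_fill(vm_offset_t offset, int iosize)
{
	return (offset != 0 || (iosize & PAGE_MASK) != 0);
}
#endif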
1919
1920static int
1921cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
1922{
1923 int pages_in_prefetch;
1924
1925 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1926 (int)f_offset, size, (int)filesize, 0, 0);
1927
1928 if (f_offset >= filesize) {
1929 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1930 (int)f_offset, 0, 0, 0, 0);
1931 return(0);
1932 }
1933 if ((off_t)size > (filesize - f_offset))
1934 size = filesize - f_offset;
1935 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1936
1937 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
1938
1939 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1940 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1941
1942 return (pages_in_prefetch);
1943}
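
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the clamping in cluster_read_prefetch() above -- never prefetch past
 * EOF, and report the result as a count of whole pages, rounding up.
 */
#if 0	/* example only -- compiled out */
static int
prefetch_pages(off_t f_offset, u_int size, off_t filesize)
{
	if (f_offset >= filesize)
		return (0);
	if ((off_t)size > (filesize - f_offset))
		size = (u_int)(filesize - f_offset);

	return ((size + (PAGE_SIZE - 1)) / PAGE_SIZE);
}
#endif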
1944
1945
1946
1947static void
1948cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
1949 int bflag)
1950{
1951 daddr64_t r_addr;
1952 off_t f_offset;
1953 int size_of_prefetch;
1954 u_int max_prefetch;
1955
1956
1957 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1958 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1959
1960 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1961 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1962 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1963 return;
1964 }
1965 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
1966 rap->cl_ralen = 0;
1967 rap->cl_maxra = 0;
1968
1969 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1970 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1971
1972 return;
1973 }
1974 max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD));
1975
1976 if (max_prefetch > speculative_prefetch_max)
1977 max_prefetch = speculative_prefetch_max;
1978
1979 if (max_prefetch <= PAGE_SIZE) {
1980 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1981 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
1982 return;
1983 }
1984 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
1985 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
1986
1987 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1988 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1989 return;
1990 }
1991 }
1992 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1993 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1994
1995 size_of_prefetch = 0;
1996
1997 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1998
1999 if (size_of_prefetch) {
2000 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2001 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2002 return;
2003 }
2004 if (f_offset < filesize) {
2005 daddr64_t read_size;
2006
2007 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2008
2009 read_size = (extent->e_addr + 1) - extent->b_addr;
2010
2011 if (read_size > rap->cl_ralen) {
2012 if (read_size > max_prefetch / PAGE_SIZE)
2013 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2014 else
2015 rap->cl_ralen = read_size;
2016 }
2017 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2018
2019 if (size_of_prefetch)
2020 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2021 }
2022 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2023 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2024}
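
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the window growth in cluster_read_ahead() above -- the read-ahead
 * length starts at one page and doubles on each sequential hit, capped
 * at the prefetch maximum expressed in pages.
 */
#if 0	/* example only -- compiled out */
static int
grow_readahead(int cl_ralen, u_int max_prefetch)
{
	int cap = max_prefetch / PAGE_SIZE;

	if (cl_ralen == 0)
		return (1);
	return (((cl_ralen << 1) < cap) ? (cl_ralen << 1) : cap);
}
#endif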
2025
2026
2027int
2028cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2029 int size, off_t filesize, int flags)
2030{
2031 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2032
2033}
2034
2035
2036int
2037cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2038 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2039{
2040 int io_size;
2041 int rounded_size;
2042 off_t max_size;
2043 int local_flags;
2044
2045 local_flags = CL_PAGEOUT | CL_THROTTLE;
2046
2047 if ((flags & UPL_IOSYNC) == 0)
2048 local_flags |= CL_ASYNC;
2049 if ((flags & UPL_NOCOMMIT) == 0)
2050 local_flags |= CL_COMMIT;
2051 if ((flags & UPL_KEEPCACHED))
2052 local_flags |= CL_KEEPCACHED;
2053 if (flags & UPL_PAGING_ENCRYPTED)
2054 local_flags |= CL_ENCRYPTED;
2055
2056
2057 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2058 (int)f_offset, size, (int)filesize, local_flags, 0);
2059
2060 /*
2061 * If they didn't specify any I/O, then we are done...
2062 * we can't issue an abort because we don't know how
2063 * big the upl really is
2064 */
2065 if (size <= 0)
2066 return (EINVAL);
2067
2068 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2069 if (local_flags & CL_COMMIT)
2070 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2071 return (EROFS);
2072 }
2073 /*
2074 * can't page-out from a negative offset
2075 * or if we're starting beyond the EOF
2076 * or if the file offset isn't page aligned
2077 * or the size requested isn't a multiple of PAGE_SIZE
2078 */
2079 if (f_offset < 0 || f_offset >= filesize ||
2080 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2081 if (local_flags & CL_COMMIT)
2082 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2083 return (EINVAL);
2084 }
2085 max_size = filesize - f_offset;
2086
2087 if (size < max_size)
2088 io_size = size;
2089 else
2090 io_size = max_size;
2091
2092 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2093
2094 if (size > rounded_size) {
2095 if (local_flags & CL_COMMIT)
2096 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2097 UPL_ABORT_FREE_ON_EMPTY);
2098 }
2099 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
2100 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
2101}
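
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the EOF trimming and page rounding in cluster_pageout_ext() above.
 * It assumes f_offset < filesize, which the caller has already verified.
 */
#if 0	/* example only -- compiled out */
static void
trim_pageout(int size, off_t f_offset, off_t filesize,
	     int *io_size, int *rounded_size, int *tail_to_abort)
{
	off_t max_size = filesize - f_offset;

	/* clip the I/O to the bytes that actually exist in the file */
	*io_size = (size < max_size) ? size : (int)max_size;

	/* round what's left up to a whole number of pages */
	*rounded_size = (*io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	/* any requested pages beyond the rounded size must be aborted */
	*tail_to_abort = (size > *rounded_size) ? (size - *rounded_size) : 0;
}
#endif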
2102
2103
2104int
2105cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2106 int size, off_t filesize, int flags)
2107{
2108 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2109}
2110
2111
2112int
2113cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2114 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2115{
2116 u_int io_size;
2117 int rounded_size;
2118 off_t max_size;
2119 int retval;
2120 int local_flags = 0;
2121
2122 if (upl == NULL || size < 0)
2123 panic("cluster_pagein: NULL upl or negative size passed in");
2124
2125 if ((flags & UPL_IOSYNC) == 0)
2126 local_flags |= CL_ASYNC;
2127 if ((flags & UPL_NOCOMMIT) == 0)
2128 local_flags |= CL_COMMIT;
2129 if (flags & UPL_IOSTREAMING)
2130 local_flags |= CL_IOSTREAMING;
2131 if (flags & UPL_PAGING_ENCRYPTED)
2132 local_flags |= CL_ENCRYPTED;
2133
2134
2135 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2136 (int)f_offset, size, (int)filesize, local_flags, 0);
2137
2138 /*
2139 * can't page-in from a negative offset
2140 * or if we're starting beyond the EOF
2141 * or if the file offset isn't page aligned
2142 * or the size requested isn't a multiple of PAGE_SIZE
2143 */
2144 if (f_offset < 0 || f_offset >= filesize ||
2145 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2146 if (local_flags & CL_COMMIT)
2147 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2148 return (EINVAL);
2149 }
2150 max_size = filesize - f_offset;
2151
2152 if (size < max_size)
2153 io_size = size;
2154 else
2155 io_size = max_size;
2156
2157 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2158
2159 if (size > rounded_size && (local_flags & CL_COMMIT))
2160 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2161 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2162
2163 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2164 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2165
2166 return (retval);
2167}
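
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the argument validation in cluster_pagein_ext() above -- a page-in
 * is rejected when the file offset is negative or at/past EOF, or when
 * any of the offsets/sizes are not page aligned.
 */
#if 0	/* example only -- compiled out */
static int
pagein_args_valid(off_t f_offset, off_t filesize, int size, upl_offset_t upl_offset)
{
	if (f_offset < 0 || f_offset >= filesize)
		return (0);
	if ((f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK))
		return (0);
	return (1);
}
#endif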
2168
2169
2170int
2171cluster_bp(buf_t bp)
2172{
2173 return cluster_bp_ext(bp, NULL, NULL);
2174}
2175
2176
2177int
2178cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2179{
2180 off_t f_offset;
2181 int flags;
2182
2183 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2184 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2185
2186 if (bp->b_flags & B_READ)
2187 flags = CL_ASYNC | CL_READ;
2188 else
2189 flags = CL_ASYNC;
2190 if (bp->b_flags & B_PASSIVE)
2191 flags |= CL_PASSIVE;
2192
2193 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2194
2195 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
2196}
2197
2198
2199
2200int
2201cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2202{
2203 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2204}
2205
2206
2207int
2208cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2209 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2210{
2211 user_ssize_t cur_resid;
2212 int retval = 0;
2213 int flags;
2214 int zflags;
2215 int bflag;
2216 int write_type = IO_COPY;
2217 u_int32_t write_length;
2218
2219 flags = xflags;
2220
2221 if (flags & IO_PASSIVE)
2222 bflag = CL_PASSIVE;
2223 else
2224 bflag = 0;
2225
2226 if (vp->v_flag & VNOCACHE_DATA){
2227 flags |= IO_NOCACHE;
2228 bflag |= CL_NOCACHE;
2229 }
2230 if (uio == NULL) {
2231 /*
2232 * no user data...
2233 * this call is being made to zero-fill some range in the file
2234 */
2235 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2236
2237 return(retval);
2238 }
2239 /*
2240 * do a write through the cache if one of the following is true....
2241 * NOCACHE is not set, or NODIRECT is set, or
2242 * the uio request doesn't target USERSPACE
2243 * otherwise, find out if we want the direct or contig variant for
2244 * the first vector in the uio request
2245 */
2246 if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
2247 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2248
2249 if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
2250 /*
2251 * must go through the cached variant in this case
2252 */
2253 write_type = IO_COPY;
2254
2255 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2256
2257 switch (write_type) {
2258
2259 case IO_COPY:
2260 /*
2261 * make sure the uio_resid isn't too big...
2262 * internally, we want to handle all of the I/O in
2263 * chunk sizes that fit in a 32 bit int
2264 */
2265 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2266 /*
2267 * we're going to have to call cluster_write_copy
2268 * more than once...
2269 *
2270 * only want the last call to cluster_write_copy to
2271 * have the IO_TAILZEROFILL flag set and only the
2272 * first call should have IO_HEADZEROFILL
2273 */
2274 zflags = flags & ~IO_TAILZEROFILL;
2275 flags &= ~IO_HEADZEROFILL;
2276
2277 write_length = MAX_IO_REQUEST_SIZE;
2278 } else {
2279 /*
2280 * last call to cluster_write_copy
2281 */
2282 zflags = flags;
2283
2284 write_length = (u_int32_t)cur_resid;
2285 }
2286 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2287 break;
2288
2289 case IO_CONTIG:
2290 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2291
2292 if (flags & IO_HEADZEROFILL) {
2293 /*
2294 * only do this once per request
2295 */
2296 flags &= ~IO_HEADZEROFILL;
2297
2298 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2299 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2300 if (retval)
2301 break;
2302 }
2303 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2304
2305 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2306 /*
2307 * we're done with the data from the user specified buffer(s)
2308 * and we've been requested to zero fill at the tail
2309 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2310 * by rearranging the args and passing in IO_HEADZEROFILL
2311 */
2312 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2313 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2314 }
2315 break;
2316
2317 case IO_DIRECT:
2318 /*
2319 * cluster_write_direct is never called with IO_TAILZEROFILL or IO_HEADZEROFILL set
2320 */
2321 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2322 break;
2323
2324 case IO_UNKNOWN:
2325 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2326 break;
2327 }
2328 /*
2329 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2330 * multiple times to service a multi-vector request that is not aligned properly
2331 * we need to update the oldEOF so that we
2332 * don't zero-fill the head of a page if we've successfully written
2333 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2334 * page that is beyond the oldEOF if the write is unaligned... we only
2335 * want that to happen for the very first page of the cluster_write,
2336 * NOT the first page of each vector making up a multi-vector write.
2337 */
2338 if (uio->uio_offset > oldEOF)
2339 oldEOF = uio->uio_offset;
2340 }
2341 return (retval);
2342}
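
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the flag management in the IO_COPY case above -- when a request is
 * split into MAX_IO_REQUEST_SIZE chunks, only the first chunk keeps
 * IO_HEADZEROFILL and only the last keeps IO_TAILZEROFILL.  The flag
 * names are real; issue_chunk() is an invented stand-in for
 * cluster_write_copy().
 */
#if 0	/* example only -- compiled out */
static void
split_copy_write(user_ssize_t resid, int flags,
		 void (*issue_chunk)(u_int32_t len, int zflags))
{
	while (resid > 0) {
		u_int32_t len;
		int zflags;

		if (resid > (user_ssize_t)MAX_IO_REQUEST_SIZE) {
			zflags = flags & ~IO_TAILZEROFILL;	/* not the last chunk */
			flags &= ~IO_HEADZEROFILL;		/* only the first chunk head-fills */
			len = MAX_IO_REQUEST_SIZE;
		} else {
			zflags = flags;				/* last chunk keeps the tail-fill */
			len = (u_int32_t)resid;
		}
		issue_chunk(len, zflags);
		resid -= len;
	}
}
#endif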
2343
2344
2345static int
2346cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2347 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2348{
2349 upl_t upl;
2350 upl_page_info_t *pl;
2351 vm_offset_t upl_offset;
2352 vm_offset_t vector_upl_offset = 0;
2353 u_int32_t io_req_size;
2354 u_int32_t offset_in_file;
2355 u_int32_t offset_in_iovbase;
2356 u_int32_t io_size;
2357 int io_flag = 0;
2358 upl_size_t upl_size, vector_upl_size = 0;
2359 vm_size_t upl_needed_size;
2360 mach_msg_type_number_t pages_in_pl;
2361 upl_control_flags_t upl_flags;
2362 kern_return_t kret;
2363 mach_msg_type_number_t i;
2364 int force_data_sync;
2365 int retval = 0;
2366 int first_IO = 1;
2367 struct clios iostate;
2368 user_addr_t iov_base;
2369 u_int32_t mem_alignment_mask;
2370 u_int32_t devblocksize;
2371 u_int32_t max_io_size;
2372 u_int32_t max_upl_size;
2373 u_int32_t max_vector_size;
2374 u_int32_t bytes_outstanding_limit;
2375 boolean_t io_throttled = FALSE;
2376
2377 u_int32_t vector_upl_iosize = 0;
2378 int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
2379 off_t v_upl_uio_offset = 0;
2380 int vector_upl_index=0;
2381 upl_t vector_upl = NULL;
2382
2383
2384 /*
2385 * When we enter this routine, we know
2386 * -- the resid will not exceed iov_len
2387 */
2388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2389 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2390
2391 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2392
2393 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2394
2395 if (flags & IO_PASSIVE)
2396 io_flag |= CL_PASSIVE;
2397
2398 if (flags & IO_NOCACHE)
2399 io_flag |= CL_NOCACHE;
2400
2401 if (flags & IO_SKIP_ENCRYPTION)
2402 io_flag |= CL_ENCRYPTED;
2403
2404 iostate.io_completed = 0;
2405 iostate.io_issued = 0;
2406 iostate.io_error = 0;
2407 iostate.io_wanted = 0;
2408
2409 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2410
2411 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2412 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2413
2414 if (devblocksize == 1) {
2415 /*
2416 * the AFP client advertises a devblocksize of 1
2417 * however, its BLOCKMAP routine maps to physical
2418 * blocks that are PAGE_SIZE in size...
2419 * therefore we can't ask for I/Os that aren't page aligned
2420 * or aren't multiples of PAGE_SIZE in size
2421 * by setting devblocksize to PAGE_SIZE, we re-instate
2422 * the old behavior we had before the mem_alignment_mask
2423 * changes went in...
2424 */
2425 devblocksize = PAGE_SIZE;
2426 }
2427
2428next_dwrite:
2429 io_req_size = *write_length;
2430 iov_base = uio_curriovbase(uio);
2431
2432 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2433 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2434
2435 if (offset_in_file || offset_in_iovbase) {
2436 /*
2437 * one of the 2 important offsets is misaligned
2438 * so fire an I/O through the cache for this entire vector
2439 */
2440 goto wait_for_dwrites;
2441 }
2442 if (iov_base & (devblocksize - 1)) {
2443 /*
2444 * the offset in memory must be on a device block boundary
2445 * so that we can guarantee that we can generate an
2446 * I/O that ends on a page boundary in cluster_io
2447 */
2448 goto wait_for_dwrites;
2449 }
2450
2451 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
2452 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2453 int throttle_type;
2454
2455 if ( (throttle_type = cluster_is_throttled(vp)) ) {
2456 /*
2457 * we're in the throttle window, at the very least
2458 * we want to limit the size of the I/O we're about
2459 * to issue
2460 */
2461 if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2462 /*
2463 * we're in the throttle window and at least 1 I/O
2464 * has already been issued by a throttleable thread
2465 * in this window, so return with EAGAIN to indicate
2466 * to the FS issuing the cluster_write call that it
2467 * should now throttle after dropping any locks
2468 */
2469 throttle_info_update_by_mount(vp->v_mount);
2470
2471 io_throttled = TRUE;
2472 goto wait_for_dwrites;
2473 }
2474 max_vector_size = THROTTLE_MAX_IOSIZE;
2475 max_io_size = THROTTLE_MAX_IOSIZE;
2476 } else {
2477 max_vector_size = MAX_VECTOR_UPL_SIZE;
2478 max_io_size = max_upl_size;
2479 }
2480
2481 if (first_IO) {
2482 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2483 first_IO = 0;
2484 }
2485 io_size = io_req_size & ~PAGE_MASK;
2486 iov_base = uio_curriovbase(uio);
2487
2488 if (io_size > max_io_size)
2489 io_size = max_io_size;
2490
2491 if(useVectorUPL && (iov_base & PAGE_MASK)) {
2492 /*
2493 * We have an iov_base that's not page-aligned.
2494 * Issue all I/O's that have been collected within
2495 * this Vectored UPL.
2496 */
2497 if(vector_upl_index) {
2498 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2499 reset_vector_run_state();
2500 }
2501
2502 /*
2503 * After this point, if we are using the Vector UPL path and the base is
2504 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2505 */
2506 }
2507
2508 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2509 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2510
2511 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2512 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2513
2514 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2515 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2516 pages_in_pl = 0;
2517 upl_size = upl_needed_size;
2518 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2519 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
2520 | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
2521
2522 kret = vm_map_get_upl(map,
2523 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2524 &upl_size,
2525 &upl,
2526 NULL,
2527 &pages_in_pl,
2528 &upl_flags,
2529 force_data_sync);
2530
2531 if (kret != KERN_SUCCESS) {
2532 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2533 0, 0, 0, kret, 0);
2534 /*
2535 * failed to get pagelist
2536 *
2537 * we may have already spun some portion of this request
2538 * off as async requests... we need to wait for the I/O
2539 * to complete before returning
2540 */
2541 goto wait_for_dwrites;
2542 }
2543 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2544 pages_in_pl = upl_size / PAGE_SIZE;
2545
2546 for (i = 0; i < pages_in_pl; i++) {
2547 if (!upl_valid_page(pl, i))
2548 break;
2549 }
2550 if (i == pages_in_pl)
2551 break;
2552
2553 /*
2554 * didn't get all the pages back that we
2555 * needed... release this upl and try again
2556 */
2557 ubc_upl_abort(upl, 0);
2558 }
2559 if (force_data_sync >= 3) {
2560 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2561 i, pages_in_pl, upl_size, kret, 0);
2562 /*
2563 * for some reason, we couldn't acquire a hold on all
2564 * the pages needed in the user's address space
2565 *
2566 * we may have already spun some portion of this request
2567 * off as async requests... we need to wait for the I/O
2568 * to complete before returning
2569 */
2570 goto wait_for_dwrites;
2571 }
2572
2573 /*
2574 * Consider the possibility that upl_size wasn't satisfied.
2575 */
2576 if (upl_size < upl_needed_size) {
2577 if (upl_size && upl_offset == 0)
2578 io_size = upl_size;
2579 else
2580 io_size = 0;
2581 }
2582 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2583 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2584
2585 if (io_size == 0) {
2586 ubc_upl_abort(upl, 0);
2587 /*
2588 * we may have already spun some portion of this request
2589 * off as async requests... we need to wait for the I/O
2590 * to complete before returning
2591 */
2592 goto wait_for_dwrites;
2593 }
2594
2595 if(useVectorUPL) {
2596 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2597 if(end_off)
2598 issueVectorUPL = 1;
2599 /*
2600 * After this point, if we are using a vector UPL, then
2601 * either all the UPL elements end on a page boundary OR
2602 * this UPL is the last element because it does not end
2603 * on a page boundary.
2604 */
2605 }
2606
2607 /*
2608 * we want to push out these writes asynchronously so that we can overlap
2609 * the preparation of the next I/O
2610 * if there are already too many outstanding writes
2611 * wait until some complete before issuing the next
2612 */
2613 if (vp->v_mount->mnt_minsaturationbytecount)
2614 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2615 else
2616 bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
2617
2618 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2619
2620 if (iostate.io_error) {
2621 /*
2622 * one of the earlier writes we issued ran into a hard error
2623 * don't issue any more writes, cleanup the UPL
2624 * that was just created but not used, then
2625 * go wait for all writes that are part of this stream
2626 * to complete before returning the error to the caller
2627 */
2628 ubc_upl_abort(upl, 0);
2629
2630 goto wait_for_dwrites;
2631 }
2632
2633 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2634 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2635
2636 if(!useVectorUPL)
2637 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2638 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2639
2640 else {
2641 if(!vector_upl_index) {
2642 vector_upl = vector_upl_create(upl_offset);
2643 v_upl_uio_offset = uio->uio_offset;
2644 vector_upl_offset = upl_offset;
2645 }
2646
2647 vector_upl_set_subupl(vector_upl,upl,upl_size);
2648 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2649 vector_upl_index++;
2650 vector_upl_iosize += io_size;
2651 vector_upl_size += upl_size;
2652
2653 if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2654 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2655 reset_vector_run_state();
2656 }
2657 }
2658
2659 /*
2660 * update the uio structure to
2661 * reflect the I/O that we just issued
2662 */
2663 uio_update(uio, (user_size_t)io_size);
2664
2665 /*
2666 * in case we end up calling through to cluster_write_copy to finish
2667 * the tail of this request, we need to update the oldEOF so that we
2668 * don't zero-fill the head of a page if we've successfully written
2669 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2670 * page that is beyond the oldEOF if the write is unaligned... we only
2671 * want that to happen for the very first page of the cluster_write,
2672 * NOT the first page of each vector making up a multi-vector write.
2673 */
2674 if (uio->uio_offset > oldEOF)
2675 oldEOF = uio->uio_offset;
2676
2677 io_req_size -= io_size;
2678
2679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2680 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2681
2682 } /* end while */
2683
2684 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2685
2686 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2687
2688 if (retval == 0 && *write_type == IO_DIRECT) {
2689
2690 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2691 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2692
2693 goto next_dwrite;
2694 }
2695 }
2696
2697wait_for_dwrites:
2698
2699 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2700 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2701 reset_vector_run_state();
2702 }
2703 /*
2704 * make sure all async writes issued as part of this stream
2705 * have completed before we return
2706 */
2707 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2708
2709 if (iostate.io_error)
2710 retval = iostate.io_error;
2711
2712 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2713
2714 if (io_throttled == TRUE && retval == 0)
2715 retval = EAGAIN;
2716
2717 if (io_req_size && retval == 0) {
2718 /*
2719 * we couldn't handle the tail of this request in DIRECT mode
2720 * so fire it through the copy path
2721 *
2722 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2723 * so we can just pass 0 in for the headOff and tailOff
2724 */
2725 if (uio->uio_offset > oldEOF)
2726 oldEOF = uio->uio_offset;
2727
2728 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2729
2730 *write_type = IO_UNKNOWN;
2731 }
2732 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2733 (int)uio->uio_offset, io_req_size, retval, 4, 0);
2734
2735 return (retval);
2736}
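
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the vectored-UPL batching in cluster_write_direct() above.  Sub-I/Os
 * accumulate into a run that is flushed when a sub-I/O ends off a page
 * boundary, when MAX_VECTOR_UPL_ELEMENTS entries have been collected, or
 * when the run reaches the maximum vector size.  'struct vrun' and
 * flush_run() are invented stand-ins for the vector UPL state and
 * vector_cluster_io().
 */
#if 0	/* example only -- compiled out */
struct vrun {
	int		elements;
	upl_size_t	size;
};

static void
vector_accumulate(struct vrun *run, user_addr_t iov_base, u_int32_t io_size,
		  upl_size_t upl_size, u_int32_t max_vector_size,
		  void (*flush_run)(struct vrun *))
{
	run->elements++;
	run->size += upl_size;

	if (((iov_base + io_size) & PAGE_MASK) ||	/* sub-I/O ends mid-page */
	    run->elements == MAX_VECTOR_UPL_ELEMENTS ||
	    run->size >= max_vector_size) {
		flush_run(run);
		run->elements = 0;
		run->size = 0;
	}
}
#endif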
2737
2738
2739static int
2740cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2741 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2742{
2743 upl_page_info_t *pl;
2744 addr64_t src_paddr = 0;
2745 upl_t upl[MAX_VECTS];
2746 vm_offset_t upl_offset;
2747 u_int32_t tail_size = 0;
2748 u_int32_t io_size;
2749 u_int32_t xsize;
2750 upl_size_t upl_size;
2751 vm_size_t upl_needed_size;
2752 mach_msg_type_number_t pages_in_pl;
2753 upl_control_flags_t upl_flags;
2754 kern_return_t kret;
2755 struct clios iostate;
2756 int error = 0;
2757 int cur_upl = 0;
2758 int num_upl = 0;
2759 int n;
2760 user_addr_t iov_base;
2761 u_int32_t devblocksize;
2762 u_int32_t mem_alignment_mask;
2763
2764 /*
2765 * When we enter this routine, we know
2766 * -- the io_req_size will not exceed iov_len
2767 * -- the target address is physically contiguous
2768 */
2769 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2770
2771 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2772 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2773
2774 iostate.io_completed = 0;
2775 iostate.io_issued = 0;
2776 iostate.io_error = 0;
2777 iostate.io_wanted = 0;
2778
2779 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2780
2781next_cwrite:
2782 io_size = *write_length;
2783
2784 iov_base = uio_curriovbase(uio);
2785
2786 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2787 upl_needed_size = upl_offset + io_size;
2788
2789 pages_in_pl = 0;
2790 upl_size = upl_needed_size;
2791 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2792 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
2793 | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
2794
2795 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2796 kret = vm_map_get_upl(map,
2797 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2798 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
2799
2800 if (kret != KERN_SUCCESS) {
2801 /*
2802 * failed to get pagelist
2803 */
2804 error = EINVAL;
2805 goto wait_for_cwrites;
2806 }
2807 num_upl++;
2808
2809 /*
2810 * Consider the possibility that upl_size wasn't satisfied.
2811 */
2812 if (upl_size < upl_needed_size) {
2813 /*
2814 * This is a failure in the physical memory case.
2815 */
2816 error = EINVAL;
2817 goto wait_for_cwrites;
2818 }
2819 pl = ubc_upl_pageinfo(upl[cur_upl]);
2820
2821 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
2822
2823 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2824 u_int32_t head_size;
2825
2826 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2827
2828 if (head_size > io_size)
2829 head_size = io_size;
2830
2831 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2832
2833 if (error)
2834 goto wait_for_cwrites;
2835
2836 upl_offset += head_size;
2837 src_paddr += head_size;
2838 io_size -= head_size;
2839
2840 iov_base += head_size;
2841 }
2842 if ((u_int32_t)iov_base & mem_alignment_mask) {
2843 /*
2844 * request isn't aligned on a memory boundary
2845 * that the underlying DMA engine can handle...
2846 * return an error instead of going through
2847 * the slow copy path since the intent of this
2848 * path is direct I/O from device memory
2849 */
2850 error = EINVAL;
2851 goto wait_for_cwrites;
2852 }
2853
2854 tail_size = io_size & (devblocksize - 1);
2855 io_size -= tail_size;
2856
2857 while (io_size && error == 0) {
2858
2859 if (io_size > MAX_IO_CONTIG_SIZE)
2860 xsize = MAX_IO_CONTIG_SIZE;
2861 else
2862 xsize = io_size;
2863 /*
2864 * request asynchronously so that we can overlap
2865 * the preparation of the next I/O... we'll do
2866 * the commit after all the I/O has completed
2867 * since it's all issued against the same UPL
2868 * if there are already too many outstanding writes
2869 * wait until some have completed before issuing the next
2870 */
2871 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2872
2873 if (iostate.io_error) {
2874 /*
2875 * one of the earlier writes we issued ran into a hard error
2876 * don't issue any more writes...
2877 * go wait for all writes that are part of this stream
2878 * to complete before returning the error to the caller
2879 */
2880 goto wait_for_cwrites;
2881 }
2882 /*
2883 * issue an asynchronous write to cluster_io
2884 */
2885 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
2886 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
2887
2888 if (error == 0) {
2889 /*
2890 * The cluster_io write completed successfully,
2891 * update the uio structure
2892 */
2893 uio_update(uio, (user_size_t)xsize);
2894
2895 upl_offset += xsize;
2896 src_paddr += xsize;
2897 io_size -= xsize;
2898 }
2899 }
2900 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
2901
2902 error = cluster_io_type(uio, write_type, write_length, 0);
2903
2904 if (error == 0 && *write_type == IO_CONTIG) {
2905 cur_upl++;
2906 goto next_cwrite;
2907 }
2908 } else
2909 *write_type = IO_UNKNOWN;
2910
2911wait_for_cwrites:
2912 /*
2913 * make sure all async writes that are part of this stream
2914 * have completed before we proceed
2915 */
2916 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
2917
2918 if (iostate.io_error)
2919 error = iostate.io_error;
2920
2921 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2922
2923 if (error == 0 && tail_size)
2924 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
2925
2926 for (n = 0; n < num_upl; n++)
2927 /*
2928 * just release our hold on each physically contiguous
2929 * region without changing any state
2930 */
2931 ubc_upl_abort(upl[n], 0);
2932
2933 return (error);
2934}
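
/*
 * Editor's note: an illustrative model (not part of the original source)
 * of the head/tail carving in cluster_write_contig() above -- an I/O that
 * doesn't start or end on a device-block boundary is split into an
 * unaligned head (less than one device block), a block-aligned body, and
 * an unaligned tail; the head and tail go through cluster_align_phys_io().
 * Assumes devblocksize is a power of two, as the masks above require.
 */
#if 0	/* example only -- compiled out */
static void
split_on_devblocks(off_t offset, u_int32_t io_size, u_int32_t devblocksize,
		   u_int32_t *head, u_int32_t *body, u_int32_t *tail)
{
	*head = 0;

	if (offset & (devblocksize - 1)) {
		*head = devblocksize - (u_int32_t)(offset & (devblocksize - 1));
		if (*head > io_size)
			*head = io_size;
		io_size -= *head;
	}
	*tail = io_size & (devblocksize - 1);
	*body = io_size - *tail;
}
#endif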
2935
2936
2937/*
2938 * need to avoid a race between an msync of a range of pages dirtied via mmap
2939 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
2940 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
2941 *
2942 * we should never force-zero-fill pages that are already valid in the cache...
2943 * the entire page contains valid data (either from disk, zero-filled or dirtied
2944 * via an mmap) so we can only do damage by trying to zero-fill
2945 *
2946 */
2947static int
2948cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
2949{
2950 int zero_pg_index;
2951 boolean_t need_cluster_zero = TRUE;
2952
2953 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2954
2955 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2956 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2957
2958 if (upl_valid_page(pl, zero_pg_index)) {
2959 /*
2960 * never force zero valid pages - dirty or clean
2961 * we'll leave these in the UPL for cluster_write_copy to deal with
2962 */
2963 need_cluster_zero = FALSE;
2964 }
2965 }
2966 if (need_cluster_zero == TRUE)
2967 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2968
2969 return (bytes_to_zero);
2970}
2971
2972
2973static int
2974cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
2975 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2976{
2977 upl_page_info_t *pl;
2978 upl_t upl;
2979 vm_offset_t upl_offset = 0;
2980 vm_size_t upl_size;
2981 off_t upl_f_offset;
2982 int pages_in_upl;
2983 int start_offset;
2984 int xfer_resid;
2985 int io_size;
2986 int io_offset;
2987 int bytes_to_zero;
2988 int bytes_to_move;
2989 kern_return_t kret;
2990 int retval = 0;
2991 int io_resid;
2992 long long total_size;
2993 long long zero_cnt;
2994 off_t zero_off;
2995 long long zero_cnt1;
2996 off_t zero_off1;
2997 off_t write_off = 0;
2998 int write_cnt = 0;
2999 boolean_t first_pass = FALSE;
3000 struct cl_extent cl;
3001 struct cl_writebehind *wbp;
3002 int bflag;
3003 u_int max_cluster_pgcount;
3004 u_int max_io_size;
3005
3006 if (uio) {
3007 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3008 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3009
3010 io_resid = io_req_size;
3011 } else {
3012 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3013 0, 0, (int)oldEOF, (int)newEOF, 0);
3014
3015 io_resid = 0;
3016 }
3017 if (flags & IO_PASSIVE)
3018 bflag = CL_PASSIVE;
3019 else
3020 bflag = 0;
3021 if (flags & IO_NOCACHE)
3022 bflag |= CL_NOCACHE;
3023
3024 if (flags & IO_SKIP_ENCRYPTION)
3025 bflag |= CL_ENCRYPTED;
3026
3027 zero_cnt = 0;
3028 zero_cnt1 = 0;
3029 zero_off = 0;
3030 zero_off1 = 0;
3031
3032 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3033 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3034
3035 if (flags & IO_HEADZEROFILL) {
3036 /*
3037 * some filesystems (HFS is one) don't support unallocated holes within a file...
3038 * so we zero fill the intervening space between the old EOF and the offset
3039 * where the next chunk of real data begins.... ftruncate will also use this
3040 * routine to zero fill to the new EOF when growing a file... in this case, the
3041 * uio structure will not be provided
3042 */
3043 if (uio) {
3044 if (headOff < uio->uio_offset) {
3045 zero_cnt = uio->uio_offset - headOff;
3046 zero_off = headOff;
3047 }
3048 } else if (headOff < newEOF) {
3049 zero_cnt = newEOF - headOff;
3050 zero_off = headOff;
3051 }
3052 } else {
3053 if (uio && uio->uio_offset > oldEOF) {
3054 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3055
3056 if (zero_off >= oldEOF) {
3057 zero_cnt = uio->uio_offset - zero_off;
3058
3059 flags |= IO_HEADZEROFILL;
3060 }
3061 }
3062 }
3063 if (flags & IO_TAILZEROFILL) {
3064 if (uio) {
3065 zero_off1 = uio->uio_offset + io_req_size;
3066
3067 if (zero_off1 < tailOff)
3068 zero_cnt1 = tailOff - zero_off1;
3069 }
3070 } else {
3071 if (uio && newEOF > oldEOF) {
3072 zero_off1 = uio->uio_offset + io_req_size;
3073
3074 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3075 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3076
3077 flags |= IO_TAILZEROFILL;
3078 }
3079 }
3080 }
3081 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3082 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3083 retval, 0, 0, 0, 0);
3084 return (0);
3085 }
3086 if (uio) {
3087 write_off = uio->uio_offset;
3088 write_cnt = uio_resid(uio);
3089 /*
3090 * delay updating the sequential write info
3091 * in the control block until we've obtained
3092 * the lock for it
3093 */
3094 first_pass = TRUE;
3095 }
3096 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3097 /*
3098 * for this iteration of the loop, figure out where our starting point is
3099 */
3100 if (zero_cnt) {
3101 start_offset = (int)(zero_off & PAGE_MASK_64);
3102 upl_f_offset = zero_off - start_offset;
3103 } else if (io_resid) {
3104 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3105 upl_f_offset = uio->uio_offset - start_offset;
3106 } else {
3107 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3108 upl_f_offset = zero_off1 - start_offset;
3109 }
3110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3111 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3112
3113 if (total_size > max_io_size)
3114 total_size = max_io_size;
3115
3116 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3117
3118 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3119 /*
3120 * assumption... total_size <= io_resid
3121 * because IO_HEADZEROFILL and IO_TAILZEROFILL are not set
3122 */
3123 if ((start_offset + total_size) > max_io_size)
3124 total_size = max_io_size - start_offset;
3125 xfer_resid = total_size;
3126
3127 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3128
3129 if (retval)
3130 break;
3131
3132 io_resid -= (total_size - xfer_resid);
3133 total_size = xfer_resid;
3134 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3135 upl_f_offset = uio->uio_offset - start_offset;
3136
3137 if (total_size == 0) {
3138 if (start_offset) {
3139 /*
3140 * the write did not finish on a page boundary
3141 * which will leave upl_f_offset pointing to the
3142 * beginning of the last page written instead of
3143 * the page beyond it... bump it in this case
3144 * so that the cluster code records the last page
3145 * written as dirty
3146 */
3147 upl_f_offset += PAGE_SIZE_64;
3148 }
3149 upl_size = 0;
3150
3151 goto check_cluster;
3152 }
3153 }
3154 /*
3155 * compute the size of the upl needed to encompass
3156 * the requested write... limit each call to cluster_io
3157 * to the maximum UPL size... cluster_io will clip if
3158 * this exceeds the maximum io_size for the device,
3159 * make sure to account for
3160 * a starting offset that's not page aligned
3161 */
3162 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3163
3164 if (upl_size > max_io_size)
3165 upl_size = max_io_size;
3166
3167 pages_in_upl = upl_size / PAGE_SIZE;
3168 io_size = upl_size - start_offset;
3169
3170 if ((long long)io_size > total_size)
3171 io_size = total_size;
3172
3173 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3174
3175
3176 /*
3177 * Gather the pages from the buffer cache.
3178 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3179 * that we intend to modify these pages.
3180 */
3181 kret = ubc_create_upl(vp,
3182 upl_f_offset,
3183 upl_size,
3184 &upl,
3185 &pl,
3186 UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY));
3187 if (kret != KERN_SUCCESS)
3188 panic("cluster_write_copy: failed to get pagelist");
3189
3190 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3191 upl, (int)upl_f_offset, start_offset, 0, 0);
3192
3193 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3194 int read_size;
3195
3196 /*
3197 * we're starting in the middle of the first page of the upl
3198 * and the page isn't currently valid, so we're going to have
3199 * to read it in first... this is a synchronous operation
3200 */
3201 read_size = PAGE_SIZE;
3202
3203 if ((upl_f_offset + read_size) > oldEOF)
3204 read_size = oldEOF - upl_f_offset;
3205
3206 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3207 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3208 if (retval) {
3209 /*
3210 * we had an error during the read which causes us to abort
3211 * the current cluster_write request... before we do, we need
3212 * to release the rest of the pages in the upl without modifying
3213 * their state and mark the failed page in error
3214 */
3215 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
3216
3217 if (upl_size > PAGE_SIZE)
3218 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3219
3220 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3221 upl, 0, 0, retval, 0);
3222 break;
3223 }
3224 }
3225 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3226 /*
3227 * the last offset we're writing to in this upl does not end on a page
3228 * boundary... if it's not beyond the old EOF, then we'll also need to
3229 * pre-read this page in if it isn't already valid
3230 */
3231 upl_offset = upl_size - PAGE_SIZE;
3232
3233 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3234 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
3235 int read_size;
3236
3237 read_size = PAGE_SIZE;
3238
3239 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
3240 read_size = oldEOF - (upl_f_offset + upl_offset);
3241
3242 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3243 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3244 if (retval) {
3245 /*
3246 * we had an error during the read which causes us to abort
3247 * the current cluster_write request... before we do, we
3248 * need to release the rest of the pages in the upl without
3249 * modifying their state and mark the failed page in error
3250 */
3251 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY);
3252
3253 if (upl_size > PAGE_SIZE)
3254 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3255
3256 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3257 upl, 0, 0, retval, 0);
3258 break;
3259 }
3260 }
3261 }
3262 xfer_resid = io_size;
3263 io_offset = start_offset;
3264
3265 while (zero_cnt && xfer_resid) {
3266
3267 if (zero_cnt < (long long)xfer_resid)
3268 bytes_to_zero = zero_cnt;
3269 else
3270 bytes_to_zero = xfer_resid;
3271
3272 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3273
3274 xfer_resid -= bytes_to_zero;
3275 zero_cnt -= bytes_to_zero;
3276 zero_off += bytes_to_zero;
3277 io_offset += bytes_to_zero;
3278 }
3279 if (xfer_resid && io_resid) {
3280 u_int32_t io_requested;
3281
3282 bytes_to_move = min(io_resid, xfer_resid);
3283 io_requested = bytes_to_move;
3284
3285 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3286
3287 if (retval) {
3288 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3289
3290 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3291 upl, 0, 0, retval, 0);
3292 } else {
3293 io_resid -= bytes_to_move;
3294 xfer_resid -= bytes_to_move;
3295 io_offset += bytes_to_move;
3296 }
3297 }
3298 while (xfer_resid && zero_cnt1 && retval == 0) {
3299
3300 if (zero_cnt1 < (long long)xfer_resid)
3301 bytes_to_zero = zero_cnt1;
3302 else
3303 bytes_to_zero = xfer_resid;
3304
3305 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3306
3307 xfer_resid -= bytes_to_zero;
3308 zero_cnt1 -= bytes_to_zero;
3309 zero_off1 += bytes_to_zero;
3310 io_offset += bytes_to_zero;
3311 }
3312 if (retval == 0) {
3313 int cl_index;
3314 int ret_cluster_try_push;
3315
3316 io_size += start_offset;
3317
3318 if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3319 /*
3320 * if we're extending the file with this write
3321 * we'll zero fill the rest of the page so that
3322 * if the file gets extended again in such a way as to leave a
3323 * hole starting at this EOF, we'll have zeros in the correct spot
3324 */
3325 cluster_zero(upl, io_size, upl_size - io_size, NULL);
3326 }
3327 /*
3328 * release the upl now if we hold one since...
3329 * 1) pages in it may be present in the sparse cluster map
3330 * and may span 2 separate buckets there... if they do and
3331 * we happen to have to flush a bucket to make room and it intersects
3332 * this upl, a deadlock may result on page BUSY
3333 * 2) we're delaying the I/O... from this point forward we're just updating
3334 * the cluster state... no need to hold the pages, so commit them
3335 * 3) IO_SYNC is set...
3336 * because we had to ask for a UPL that provides currently non-present pages, the
3337 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3338 * upon committing it... this is not the behavior we want since it's possible for
3339 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3340 * we'll pick these pages back up later with the correct behavior specified.
3341 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3342 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3343 * we hold since the flushing context is holding the cluster lock.
3344 */
3345 ubc_upl_commit_range(upl, 0, upl_size,
3346 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3347check_cluster:
3348 /*
3349 * calculate the last logical block number
3350 * that this delayed I/O encompassed
3351 */
3352 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3353
3354 if (flags & IO_SYNC) {
3355 /*
3356 * if the IO_SYNC flag is set then we need to
3357 * bypass any clusters and immediately issue
3358 * the I/O
3359 */
3360 goto issue_io;
3361 }
3362 /*
3363 * take the lock to protect our accesses
3364 * of the writebehind and sparse cluster state
3365 */
3366 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3367
3368 if (wbp->cl_scmap) {
3369
3370 if ( !(flags & IO_NOCACHE)) {
3371 /*
3372 * we've fallen into the sparse
3373 * cluster method of delaying dirty pages
3374 */
3375 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
3376
3377 lck_mtx_unlock(&wbp->cl_lockw);
3378
3379 continue;
3380 }
3381 /*
3382 * must have done cached writes that fell into
3383 * the sparse cluster mechanism... we've switched
3384 * to uncached writes on the file, so go ahead
3385 * and push whatever's in the sparse map
3386 * and switch back to normal clustering
3387 */
3388 wbp->cl_number = 0;
3389
3390 sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
3391 /*
3392 * no clusters of either type present at this point
3393 * so just go directly to start_new_cluster since
3394 * we know we need to delay this I/O since we've
3395 * already released the pages back into the cache
3396 * to avoid the deadlock with sparse_cluster_push
3397 */
3398 goto start_new_cluster;
3399 }
3400 if (first_pass) {
3401 if (write_off == wbp->cl_last_write)
3402 wbp->cl_seq_written += write_cnt;
3403 else
3404 wbp->cl_seq_written = write_cnt;
3405
3406 wbp->cl_last_write = write_off + write_cnt;
3407
3408 first_pass = FALSE;
3409 }
3410 if (wbp->cl_number == 0)
3411 /*
3412 * no clusters currently present
3413 */
3414 goto start_new_cluster;
3415
3416 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3417 /*
3418 * check each cluster that we currently hold
3419 * try to merge some or all of this write into
3420 * one or more of the existing clusters... if
3421 * any portion of the write remains, start a
3422 * new cluster
3423 */
3424 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3425 /*
3426 * the current write starts at or after the current cluster
3427 */
3428 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3429 /*
3430 * we have a write that fits entirely
3431 * within the existing cluster limits
3432 */
3433 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
3434 /*
3435 * update our idea of where the cluster ends
3436 */
3437 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
3438 break;
3439 }
3440 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3441 /*
3442 * we have a write that starts in the middle of the current cluster
3443 * but extends beyond the cluster's limit... we know this because
3444 * of the previous checks
3445 * we'll extend the current cluster to the max
3446 * and update the b_addr for the current write to reflect that
3447 * the head of it was absorbed into this cluster...
3448 * note that we'll always have a leftover tail in this case since
3449 * full absorbtion would have occurred in the clause above
3450 * full absorption would have occurred in the clause above
3451 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3452
3453 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
3454 }
3455 /*
3456 * we come here for the case where the current write starts
3457 * beyond the limit of the existing cluster or we have a leftover
3458 * tail after a partial absorption
3459 *
3460 * in either case, we'll check the remaining clusters before
3461 * starting a new one
3462 */
3463 } else {
3464 /*
3465 * the current write starts in front of the cluster we're currently considering
3466 */
3467 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
3468 /*
3469 * we can just merge the new request into
3470 * this cluster and leave it in the cache
3471 * since the resulting cluster is still
3472 * less than the maximum allowable size
3473 */
3474 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
3475
3476 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
3477 /*
3478 * the current write completely
3479 * envelops the existing cluster and since
3480 * each write is limited to at most max_cluster_pgcount pages
3481 * we can just use the start and last blocknos of the write
3482 * to generate the cluster limits
3483 */
3484 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
3485 }
3486 break;
3487 }
3488
3489 /*
3490 * if we were to combine this write with the current cluster
3491 * we would exceed the cluster size limit.... so,
3492 * let's see if there's any overlap of the new I/O with
3493 * the cluster we're currently considering... in fact, we'll
3494 * stretch the cluster out to its full limit and see if we
3495 * get an intersection with the current write
3496 *
3497 */
3498 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3499 /*
3500 * the current write extends into the proposed cluster
3501 * clip the length of the current write after first combining its
3502 * tail with the newly shaped cluster
3503 */
3504 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3505
3506 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
3507 }
3508 /*
3509 * if we get here, there was no way to merge
3510 * any portion of this write with this cluster
3511 * or we could only merge part of it which
3512 * will leave a tail...
3513 * we'll check the remaining clusters before starting a new one
3514 */
3515 }
3516 }
3517 if (cl_index < wbp->cl_number)
3518 /*
3519 * we found an existing cluster(s) that we
3520 * could entirely merge this I/O into
3521 */
3522 goto delay_io;
3523
3524 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
3525 wbp->cl_number == MAX_CLUSTERS &&
3526 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3527 uint32_t n;
3528
3529 if (vp->v_mount->mnt_minsaturationbytecount) {
3530 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3531
3532 if (n > MAX_CLUSTERS)
3533 n = MAX_CLUSTERS;
3534 } else
3535 n = 0;
3536
3537 if (n == 0) {
3538 if (vp->v_mount->mnt_kern_flag & MNTK_SSD)
3539 n = WRITE_BEHIND_SSD;
3540 else
3541 n = WRITE_BEHIND;
3542 }
3543 while (n--)
3544 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL);
3545 }
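/*
 * illustrative only (values hypothetical): with
 * mnt_minsaturationbytecount set to 4MB and a MAX_CLUSTER_SIZE(vp)
 * of 1MB, the loop above pushes n == 4 clusters... with no
 * saturation count configured it falls back to WRITE_BEHIND
 * (or WRITE_BEHIND_SSD when MNTK_SSD is set)
 */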
3546 if (wbp->cl_number < MAX_CLUSTERS) {
3547 /*
3548 * we didn't find an existing cluster to
3549 * merge into, but there's room to start
3550 * a new one
3551 */
3552 goto start_new_cluster;
3553 }
3554 /*
3555 * no existing cluster to merge with and no
3556 * room to start a new one... we'll try
3557 * pushing one of the existing ones... if none of
3558 * them are able to be pushed, we'll switch
3559 * to the sparse cluster mechanism
3560 * cluster_try_push updates cl_number to the
3561 * number of remaining clusters... and
3562 * returns the number of currently unused clusters
3563 */
3564 ret_cluster_try_push = 0;
3565
3566 /*
3567 * if writes are not deferred, call cluster push immediately
3568 */
3569 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
3570
3571 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL);
3572 }
3573
3574 /*
3575 * execute the following regardless of whether writes are deferred
3576 */
3577 if (ret_cluster_try_push == 0) {
3578 /*
3579 * no more room in the normal cluster mechanism
3580 * so let's switch to the more expansive but expensive
3581 * sparse mechanism....
3582 */
3583 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
3584 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
3585
3586 lck_mtx_unlock(&wbp->cl_lockw);
3587
3588 continue;
3589 }
3590start_new_cluster:
3591 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
3592 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
3593
3594 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3595
3596 if (flags & IO_NOCACHE)
3597 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3598
3599 if (bflag & CL_PASSIVE)
3600 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3601
3602 wbp->cl_number++;
3603delay_io:
3604 lck_mtx_unlock(&wbp->cl_lockw);
3605
3606 continue;
3607issue_io:
3608 /*
3609 * we don't hold the lock at this point
3610 *
3611 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3612 * so that we correctly deal with a change in state of the hardware modify bit...
3613 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3614 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3615 * responsible for generating the correct sized I/O(s)
3616 */
3617 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
3618 }
3619 }
3620 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3621
3622 return (retval);
3623}
3624
3625
3626
3627int
3628cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3629{
3630 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3631}
3632
3633
3634int
3635cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3636{
3637 int retval = 0;
3638 int flags;
3639 user_ssize_t cur_resid;
3640 u_int32_t io_size;
3641 u_int32_t read_length = 0;
3642 int read_type = IO_COPY;
3643
3644 flags = xflags;
3645
3646 if (vp->v_flag & VNOCACHE_DATA)
3647 flags |= IO_NOCACHE;
3648 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled)
3649 flags |= IO_RAOFF;
3650
3651 if (flags & IO_SKIP_ENCRYPTION)
3652 flags |= IO_ENCRYPTED;
3653
3654 /*
3655 * do a read through the cache if one of the following is true....
3656 *    NOCACHE is not true
3657 *    the uio request doesn't target USERSPACE
3658 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3659 * Reading encrypted data from a CP filesystem should never result in the data touching
3660 * the UBC.
3661 *
3662 * otherwise, find out if we want the direct or contig variant for
3663 * the first vector in the uio request
3664 */
3665 if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) {
3666
3667 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3668 }
3669
3670 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3671
3672 switch (read_type) {
3673
3674 case IO_COPY:
3675 /*
3676 * make sure the uio_resid isn't too big...
3677 * internally, we want to handle all of the I/O in
3678 * chunk sizes that fit in a 32 bit int
3679 */
3680 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
3681 io_size = MAX_IO_REQUEST_SIZE;
3682 else
3683 io_size = (u_int32_t)cur_resid;
3684
3685 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3686 break;
3687
3688 case IO_DIRECT:
3689 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3690 break;
3691
3692 case IO_CONTIG:
3693 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3694 break;
3695
3696 case IO_UNKNOWN:
3697 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3698 break;
3699 }
3700 }
3701 return (retval);
3702}
3703
3704
3705
3706static void
3707cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
3708{
3709 int range;
3710 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
3711
3712 if ((range = last_pg - start_pg)) {
3713 if (take_reference)
3714 abort_flags |= UPL_ABORT_REFERENCE;
3715
3716 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3717 }
3718}
3719
3720
3721static int
3722cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3723{
3724 upl_page_info_t *pl;
3725 upl_t upl;
3726 vm_offset_t upl_offset;
3727 u_int32_t upl_size;
3728 off_t upl_f_offset;
3729 int start_offset;
3730 int start_pg;
3731 int last_pg;
3732 int uio_last = 0;
3733 int pages_in_upl;
3734 off_t max_size;
3735 off_t last_ioread_offset;
3736 off_t last_request_offset;
3737 kern_return_t kret;
3738 int error = 0;
3739 int retval = 0;
3740 u_int32_t size_of_prefetch;
3741 u_int32_t xsize;
3742 u_int32_t io_size;
3743 u_int32_t max_rd_size;
3744 u_int32_t max_io_size;
3745 u_int32_t max_prefetch;
3746 u_int rd_ahead_enabled = 1;
3747 u_int prefetch_enabled = 1;
3748 struct cl_readahead * rap;
3749 struct clios iostate;
3750 struct cl_extent extent;
3751 int bflag;
3752 int take_reference = 1;
3753 int policy = IOPOL_DEFAULT;
3754 boolean_t iolock_inited = FALSE;
3755
3756 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3757 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3758
3759 if (flags & IO_ENCRYPTED) {
3760 panic("encrypted blocks will hit UBC!");
3761 }
3762
3763 policy = throttle_get_io_policy(NULL);
3764
3765 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE))
3766 take_reference = 0;
3767
3768 if (flags & IO_PASSIVE)
3769 bflag = CL_PASSIVE;
3770 else
3771 bflag = 0;
3772
3773 if (flags & IO_NOCACHE)
3774 bflag |= CL_NOCACHE;
3775
3776 if (flags & IO_SKIP_ENCRYPTION)
3777 bflag |= CL_ENCRYPTED;
3778
3779 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
3780 max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD));
3781 max_rd_size = max_prefetch;
3782
3783 last_request_offset = uio->uio_offset + io_req_size;
3784
3785 if (last_request_offset > filesize)
3786 last_request_offset = filesize;
3787
3788 if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3789 rd_ahead_enabled = 0;
3790 rap = NULL;
3791 } else {
3792 if (cluster_is_throttled(vp)) {
3793 /*
3794 * we're in the throttle window, at the very least
3795 * we want to limit the size of the I/O we're about
3796 * to issue
3797 */
3798 rd_ahead_enabled = 0;
3799 prefetch_enabled = 0;
3800
3801 max_rd_size = THROTTLE_MAX_IOSIZE;
3802 }
3803 if ((rap = cluster_get_rap(vp)) == NULL)
3804 rd_ahead_enabled = 0;
3805 else {
3806 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
3807 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
3808 }
3809 }
3810 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
3811 /*
3812 * determine if we already have a read-ahead in the pipe courtesy of the
3813 * last read system call that was issued...
3814 * if so, pick up its extent to determine where we should start
3815 * with respect to any read-ahead that might be necessary to
3816 * garner all the data needed to complete this read system call
3817 */
3818 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
3819
3820 if (last_ioread_offset < uio->uio_offset)
3821 last_ioread_offset = (off_t)0;
3822 else if (last_ioread_offset > last_request_offset)
3823 last_ioread_offset = last_request_offset;
3824 } else
3825 last_ioread_offset = (off_t)0;
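/*
 * illustrative only (values hypothetical): if the previous read
 * left rap->cl_maxra at page 199, the pipeline is considered
 * primed through byte offset 200 * PAGE_SIZE, so prefetching for
 * this request resumes from there rather than from uio->uio_offset
 */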
3826
3827 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
3828
3829 max_size = filesize - uio->uio_offset;
3830
3831 if ((off_t)(io_req_size) < max_size)
3832 io_size = io_req_size;
3833 else
3834 io_size = max_size;
3835
3836 if (!(flags & IO_NOCACHE)) {
3837
3838 while (io_size) {
3839 u_int32_t io_resid;
3840 u_int32_t io_requested;
3841
3842 /*
3843 * if we keep finding the pages we need already in the cache, then
3844 * don't bother to call cluster_read_prefetch since it costs CPU cycles
3845 * to determine that we have all the pages we need... once we miss in
3846 * the cache and have issued an I/O, then we'll assume that we're likely
3847 * to continue to miss in the cache and it's to our advantage to try and prefetch
3848 */
3849 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
3850 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
3851 /*
3852 * we've already issued I/O for this request and
3853 * there's still work to do and
3854 * our prefetch stream is running dry, so issue a
3855 * pre-fetch I/O... the I/O latency will overlap
3856 * with the copying of the data
3857 */
3858 if (size_of_prefetch > max_rd_size)
3859 size_of_prefetch = max_rd_size;
3860
3861 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
3862
3863 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
3864
3865 if (last_ioread_offset > last_request_offset)
3866 last_ioread_offset = last_request_offset;
3867 }
3868 }
3869 /*
3870 * limit the size of the copy we're about to do so that
3871 * we can notice that our I/O pipe is running dry and
3872 * get the next I/O issued before it does go dry
3873 */
3874 if (last_ioread_offset && io_size > (max_io_size / 4))
3875 io_resid = (max_io_size / 4);
3876 else
3877 io_resid = io_size;
3878
3879 io_requested = io_resid;
3880
3881 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
3882
3883 xsize = io_requested - io_resid;
3884
3885 io_size -= xsize;
3886 io_req_size -= xsize;
3887
3888 if (retval || io_resid)
3889 /*
3890 * if we run into a real error or
3891 * a page that is not in the cache
3892 * we need to leave streaming mode
3893 */
3894 break;
3895
3896 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
3897 /*
3898 * we've already finished the I/O for this read request
3899 * let's see if we should do a read-ahead
3900 */
3901 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
3902 }
3903 }
3904 if (retval)
3905 break;
3906 if (io_size == 0) {
3907 if (rap != NULL) {
3908 if (extent.e_addr < rap->cl_lastr)
3909 rap->cl_maxra = 0;
3910 rap->cl_lastr = extent.e_addr;
3911 }
3912 break;
3913 }
3914 /*
3915 * recompute max_size since cluster_copy_ubc_data_internal
3916 * may have advanced uio->uio_offset
3917 */
3918 max_size = filesize - uio->uio_offset;
3919 }
3920
3921 iostate.io_completed = 0;
3922 iostate.io_issued = 0;
3923 iostate.io_error = 0;
3924 iostate.io_wanted = 0;
3925
3926 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
3927 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
3928 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
3929 /*
3930 * we're in the throttle window and at least 1 I/O
3931 * has already been issued by a throttleable thread
3932 * in this window, so return with EAGAIN to indicate
3933 * to the FS issuing the cluster_read call that it
3934 * should now throttle after dropping any locks
3935 */
3936 throttle_info_update_by_mount(vp->v_mount);
3937
3938 retval = EAGAIN;
3939 break;
3940 }
3941 }
3942 }
3943
3944 /*
3945 * compute the size of the upl needed to encompass
3946 * the requested read... limit each call to cluster_io
3947 * to the maximum UPL size... cluster_io will clip if
3948 * this exceeds the maximum io_size for the device...
3949 * make sure to account for a starting offset
3950 * that's not page aligned
3951 */
3952 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3953 upl_f_offset = uio->uio_offset - (off_t)start_offset;
3954
3955 if (io_size > max_rd_size)
3956 io_size = max_rd_size;
3957
3958 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3959
3960 if (flags & IO_NOCACHE) {
3961 if (upl_size > max_io_size)
3962 upl_size = max_io_size;
3963 } else {
3964 if (upl_size > max_io_size / 4) {
3965 upl_size = max_io_size / 4;
3966 upl_size &= ~PAGE_MASK;
3967
3968 if (upl_size == 0)
3969 upl_size = PAGE_SIZE;
3970 }
3971 }
3972 pages_in_upl = upl_size / PAGE_SIZE;
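/*
 * worked example (values hypothetical): with
 * uio->uio_offset == 0x11234 and io_size == 10000,
 * start_offset == 0x234, upl_f_offset == 0x11000 and
 * upl_size == (0x234 + 10000 + 4095) & ~4095 == 12288...
 * i.e. pages_in_upl == 3 on a 4K page system
 */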
3973
3974 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
3975 upl, (int)upl_f_offset, upl_size, start_offset, 0);
3976
3977 kret = ubc_create_upl(vp,
3978 upl_f_offset,
3979 upl_size,
3980 &upl,
3981 &pl,
3982 UPL_FILE_IO | UPL_SET_LITE);
3983 if (kret != KERN_SUCCESS)
3984 panic("cluster_read_copy: failed to get pagelist");
3985
3986 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
3987 upl, (int)upl_f_offset, upl_size, start_offset, 0);
3988
3989 /*
3990 * scan from the beginning of the upl looking for the first
3991 * non-valid page.... this will become the first page in
3992 * the request we're going to make to 'cluster_io'... if all
3993 * of the pages are valid, we won't call through to 'cluster_io'
3994 */
3995 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3996 if (!upl_valid_page(pl, start_pg))
3997 break;
3998 }
3999
4000 /*
4001 * scan from the starting invalid page looking for a valid
4002 * page before the end of the upl is reached, if we
4003 * find one, then it will be the last page of the request to
4004 * 'cluster_io'
4005 */
4006 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4007 if (upl_valid_page(pl, last_pg))
4008 break;
4009 }
4010
4011 if (start_pg < last_pg) {
4012 /*
4013 * we found a range of 'invalid' pages that must be filled...
4014 * if the last page in this range is the last page of the file,
4015 * we may have to clip the size of it to keep from reading past
4016 * the end of the last physical block associated with the file
4017 */
4018 if (iolock_inited == FALSE) {
4019 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4020
4021 iolock_inited = TRUE;
4022 }
4023 upl_offset = start_pg * PAGE_SIZE;
4024 io_size = (last_pg - start_pg) * PAGE_SIZE;
4025
4026 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
4027 io_size = filesize - (upl_f_offset + upl_offset);
4028
4029 /*
4030 * issue an asynchronous read to cluster_io
4031 */
4032
4033 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4034 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4035
4036 if (rap) {
4037 if (extent.e_addr < rap->cl_maxra) {
4038 /*
4039 * we've just issued a read for a block that should have been
4040 * in the cache courtesy of the read-ahead engine... something
4041 * has gone wrong with the pipeline, so reset the read-ahead
4042 * logic which will cause us to restart from scratch
4043 */
4044 rap->cl_maxra = 0;
4045 }
4046 }
4047 }
4048 if (error == 0) {
4049 /*
4050 * if the read completed successfully, or there was no I/O request
4051 * issued, then copy the data into user land via 'cluster_copy_upl_data'...
4052 * we'll first add on any 'valid'
4053 * pages that were present in the upl when we acquired it.
4054 */
4055 u_int val_size;
4056
4057 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4058 if (!upl_valid_page(pl, uio_last))
4059 break;
4060 }
4061 if (uio_last < pages_in_upl) {
4062 /*
4063 * there were some invalid pages beyond the valid pages
4064 * that we didn't issue an I/O for, just release them
4065 * unchanged now, so that any prefetch/readahead can
4066 * include them
4067 */
4068 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4069 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4070 }
4071
4072 /*
4073 * compute size to transfer this round, if io_req_size is
4074 * still non-zero after this attempt, we'll loop around and
4075 * set up for another I/O.
4076 */
4077 val_size = (uio_last * PAGE_SIZE) - start_offset;
4078
4079 if (val_size > max_size)
4080 val_size = max_size;
4081
4082 if (val_size > io_req_size)
4083 val_size = io_req_size;
4084
4085 if ((uio->uio_offset + val_size) > last_ioread_offset)
4086 last_ioread_offset = uio->uio_offset + val_size;
4087
4088 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4089
4090 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4091 /*
4092 * if there's still I/O left to do for this request, and...
4093 * we're not in hard throttle mode, and...
4094 * we're close to using up the previous prefetch, then issue a
4095 * new pre-fetch I/O... the I/O latency will overlap
4096 * with the copying of the data
4097 */
4098 if (size_of_prefetch > max_rd_size)
4099 size_of_prefetch = max_rd_size;
4100
4101 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4102
4103 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4104
4105 if (last_ioread_offset > last_request_offset)
4106 last_ioread_offset = last_request_offset;
4107 }
4108
4109 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4110 /*
4111 * this transfer will finish this request, so...
4112 * let's try to read ahead if we're in
4113 * a sequential access pattern and we haven't
4114 * explicitly disabled it
4115 */
4116 if (rd_ahead_enabled)
4117 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4118
4119 if (rap != NULL) {
4120 if (extent.e_addr < rap->cl_lastr)
4121 rap->cl_maxra = 0;
4122 rap->cl_lastr = extent.e_addr;
4123 }
4124 }
4125 if (iolock_inited == TRUE)
4126 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4127
4128 if (iostate.io_error)
4129 error = iostate.io_error;
4130 else {
4131 u_int32_t io_requested;
4132
4133 io_requested = val_size;
4134
4135 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4136
4137 io_req_size -= (val_size - io_requested);
4138 }
4139 } else {
4140 if (iolock_inited == TRUE)
4141 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4142 }
4143 if (start_pg < last_pg) {
4144 /*
4145 * compute the range of pages that we actually issued an I/O for
4146 * and either commit them as valid if the I/O succeeded
4147 * or abort them if the I/O failed or we're not supposed to
4148 * keep them in the cache
4149 */
4150 io_size = (last_pg - start_pg) * PAGE_SIZE;
4151
4152 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4153
4154 if (error || (flags & IO_NOCACHE))
4155 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4156 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4157 else {
4158 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4159
4160 if (take_reference)
4161 commit_flags |= UPL_COMMIT_INACTIVATE;
4162 else
4163 commit_flags |= UPL_COMMIT_SPECULATE;
4164
4165 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4166 }
4167 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4168 }
4169 if ((last_pg - start_pg) < pages_in_upl) {
4170 /*
4171 * the set of pages that we issued an I/O for did not encompass
4172 * the entire upl... so just release these without modifying
4173 * their state
4174 */
4175 if (error)
4176 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4177 else {
4178
4179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4180 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4181
4182 /*
4183 * handle any valid pages at the beginning of
4184 * the upl... release these appropriately
4185 */
4186 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4187
4188 /*
4189 * handle any valid pages immediately after the
4190 * pages we issued I/O for... release these appropriately
4191 */
4192 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4193
4194 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4195 }
4196 }
4197 if (retval == 0)
4198 retval = error;
4199
4200 if (io_req_size) {
4201 if (cluster_is_throttled(vp)) {
4202 /*
4203 * we're in the throttle window, at the very least
4204 * we want to limit the size of the I/O we're about
4205 * to issue
4206 */
4207 rd_ahead_enabled = 0;
4208 prefetch_enabled = 0;
4209 max_rd_size = THROTTLE_MAX_IOSIZE;
4210 } else {
4211 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4212 /*
4213 * coming out of throttled state
4214 */
4215 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4216 if (rap != NULL)
4217 rd_ahead_enabled = 1;
4218 prefetch_enabled = 1;
4219 }
4220 max_rd_size = max_prefetch;
4221 last_ioread_offset = 0;
4222 }
4223 }
4224 }
4225 }
4226 if (iolock_inited == TRUE) {
4227 /*
4228 * cluster_io returned an error after it
4229 * had already issued some I/O. we need
4230 * to wait for that I/O to complete before
4231 * we can destroy the iostate mutex...
4232 * 'retval' already contains the early error
4233 * so no need to pick it up from iostate.io_error
4234 */
4235 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4236
4237 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4238 }
4239 if (rap != NULL) {
4240 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4241 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4242
4243 lck_mtx_unlock(&rap->cl_lockr);
4244 } else {
4245 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4246 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4247 }
4248
4249 return (retval);
4250}
4251
4252/*
4253 * We don't want another read/write lock for every vnode in the system
4254 * so we keep a hash of them here. There should never be very many of
4255 * these around at any point in time.
4256 */
4257cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4258{
4259 struct cl_direct_read_locks *head
4260 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4261 % CL_DIRECT_READ_LOCK_BUCKETS];
4262
4263 struct cl_direct_read_lock *lck, *new_lck = NULL;
4264
4265 for (;;) {
4266 lck_spin_lock(&cl_direct_read_spin_lock);
4267
4268 LIST_FOREACH(lck, head, chain) {
4269 if (lck->vp == vp) {
4270 ++lck->ref_count;
4271 lck_spin_unlock(&cl_direct_read_spin_lock);
4272 if (new_lck) {
4273 // Someone beat us to it, ditch the allocation
4274 lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
4275 FREE(new_lck, M_TEMP);
4276 }
4277 lck_rw_lock(&lck->rw_lock, type);
4278 return lck;
4279 }
4280 }
4281
4282 if (new_lck) {
4283 // Use the lock we allocated
4284 LIST_INSERT_HEAD(head, new_lck, chain);
4285 lck_spin_unlock(&cl_direct_read_spin_lock);
4286 lck_rw_lock(&new_lck->rw_lock, type);
4287 return new_lck;
4288 }
4289
4290 lck_spin_unlock(&cl_direct_read_spin_lock);
4291
4292 // Allocate a new lock
4293 MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
4294 M_TEMP, M_WAITOK);
4295 lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
4296 new_lck->vp = vp;
4297 new_lck->ref_count = 1;
4298
4299 // Got to go round again
4300 }
4301}
4302
4303void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4304{
4305 lck_rw_done(&lck->rw_lock);
4306
4307 lck_spin_lock(&cl_direct_read_spin_lock);
4308 if (lck->ref_count == 1) {
4309 LIST_REMOVE(lck, chain);
4310 lck_spin_unlock(&cl_direct_read_spin_lock);
4311 lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
4312 FREE(lck, M_TEMP);
4313 } else {
4314 --lck->ref_count;
4315 lck_spin_unlock(&cl_direct_read_spin_lock);
4316 }
4317}
4318
4319static int
4320cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4321 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4322{
4323 upl_t upl;
4324 upl_page_info_t *pl;
4325 off_t max_io_size;
4326 vm_offset_t upl_offset, vector_upl_offset = 0;
4327 upl_size_t upl_size, vector_upl_size = 0;
4328 vm_size_t upl_needed_size;
4329 unsigned int pages_in_pl;
4330 upl_control_flags_t upl_flags;
4331 kern_return_t kret;
4332 unsigned int i;
4333 int force_data_sync;
4334 int retval = 0;
4335 int no_zero_fill = 0;
4336 int io_flag = 0;
4337 int misaligned = 0;
4338 struct clios iostate;
4339 user_addr_t iov_base;
4340 u_int32_t io_req_size;
4341 u_int32_t offset_in_file;
4342 u_int32_t offset_in_iovbase;
4343 u_int32_t io_size;
4344 u_int32_t io_min;
4345 u_int32_t xsize;
4346 u_int32_t devblocksize;
4347 u_int32_t mem_alignment_mask;
4348 u_int32_t max_upl_size;
4349 u_int32_t max_rd_size;
4350 u_int32_t max_rd_ahead;
4351 u_int32_t max_vector_size;
4352 boolean_t strict_uncached_IO = FALSE;
4353 boolean_t io_throttled = FALSE;
4354
4355 u_int32_t vector_upl_iosize = 0;
4356 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4357 off_t v_upl_uio_offset = 0;
4358 int vector_upl_index = 0;
4359 upl_t vector_upl = NULL;
4360 cl_direct_read_lock_t *lock = NULL;
4361
4362 user_addr_t orig_iov_base = 0;
4363 user_addr_t last_iov_base = 0;
4364 user_addr_t next_iov_base = 0;
4365
4366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4367 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4368
4369 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4370
4371 max_rd_size = max_upl_size;
4372 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4373
4374 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4375
4376 if (flags & IO_PASSIVE)
4377 io_flag |= CL_PASSIVE;
4378
4379 if (flags & IO_ENCRYPTED) {
4380 io_flag |= CL_RAW_ENCRYPTED;
4381 }
4382
4383 if (flags & IO_NOCACHE) {
4384 io_flag |= CL_NOCACHE;
4385 }
4386
4387 if (flags & IO_SKIP_ENCRYPTION)
4388 io_flag |= CL_ENCRYPTED;
4389
4390 iostate.io_completed = 0;
4391 iostate.io_issued = 0;
4392 iostate.io_error = 0;
4393 iostate.io_wanted = 0;
4394
4395 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4396
4397 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4398 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4399
4400 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4401 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4402
4403 if (devblocksize == 1) {
4404 /*
4405 * the AFP client advertises a devblocksize of 1;
4406 * however, its BLOCKMAP routine maps to physical
4407 * blocks that are PAGE_SIZE in size...
4408 * therefore we can't ask for I/Os that aren't page aligned
4409 * or aren't multiples of PAGE_SIZE in size
4410 * by setting devblocksize to PAGE_SIZE, we re-instate
4411 * the old behavior we had before the mem_alignment_mask
4412 * changes went in...
4413 */
4414 devblocksize = PAGE_SIZE;
4415 }
4416
4417 strict_uncached_IO = ubc_strict_uncached_IO(vp);
4418
4419 orig_iov_base = uio_curriovbase(uio);
4420 last_iov_base = orig_iov_base;
4421
4422next_dread:
4423 io_req_size = *read_length;
4424 iov_base = uio_curriovbase(uio);
4425
4426 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4427 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4428
4429 if (offset_in_file || offset_in_iovbase) {
4430 /*
4431 * one of the 2 important offsets is misaligned
4432 * so fire an I/O through the cache for this entire vector
4433 */
4434 misaligned = 1;
4435 }
4436 if (iov_base & (devblocksize - 1)) {
4437 /*
4438 * the offset in memory must be on a device block boundary
4439 * so that we can guarantee that we can generate an
4440 * I/O that ends on a page boundary in cluster_io
4441 */
4442 misaligned = 1;
4443 }
4444
4445 max_io_size = filesize - uio->uio_offset;
4446
4447 /*
4448 * The user must request IO in aligned chunks. If the
4449 * offset into the file is bad, or the userland pointer
4450 * is non-aligned, then we cannot service the encrypted IO request.
4451 */
4452 if (flags & IO_ENCRYPTED) {
4453 if (misaligned || (io_req_size & (devblocksize - 1)))
4454 retval = EINVAL;
4455
4456 max_io_size = roundup(max_io_size, devblocksize);
4457 }
4458
4459 if ((off_t)io_req_size > max_io_size)
4460 io_req_size = max_io_size;
4461
4462 /*
4463 * When we get to this point, we know...
4464 * -- the offset into the file is on a devblocksize boundary
4465 */
4466
4467 while (io_req_size && retval == 0) {
4468 u_int32_t io_start;
4469
4470 if (cluster_is_throttled(vp)) {
4471 /*
4472 * we're in the throttle window, at the very least
4473 * we want to limit the size of the I/O we're about
4474 * to issue
4475 */
4476 max_rd_size = THROTTLE_MAX_IOSIZE;
4477 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4478 max_vector_size = THROTTLE_MAX_IOSIZE;
4479 } else {
4480 max_rd_size = max_upl_size;
4481 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4482 max_vector_size = MAX_VECTOR_UPL_SIZE;
4483 }
4484 io_start = io_size = io_req_size;
4485
4486 /*
4487 * First look for pages already in the cache
4488 * and move them to user space. But only do this
4489 * check if we are not retrieving encrypted data directly
4490 * from the filesystem; those blocks should never
4491 * be in the UBC.
4492 *
4493 * cluster_copy_ubc_data returns the resid
4494 * in io_size
4495 */
4496 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
4497 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4498 }
4499 /*
4500 * calculate the number of bytes actually copied
4501 * starting size - residual
4502 */
4503 xsize = io_start - io_size;
4504
4505 io_req_size -= xsize;
4506
4507 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4508 /*
4509 * We found something in the cache or we have an iov_base that's not
4510 * page-aligned.
4511 *
4512 * Issue all I/O's that have been collected within this Vectored UPL.
4513 */
4514 if (vector_upl_index) {
4515 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4516 reset_vector_run_state();
4517 }
4518
4519 if (xsize)
4520 useVectorUPL = 0;
4521
4522 /*
4523 * After this point, if we are using the Vector UPL path and the base is
4524 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4525 */
4526 }
4527
4528 /*
4529 * check to see if we are finished with this request.
4530 *
4531 * If we satisfied this IO already, then io_req_size will be 0.
4532 * Otherwise, see if the IO was mis-aligned and needs to go through
4533 * the UBC to deal with the 'tail'.
4534 *
4535 */
4536 if (io_req_size == 0 || (misaligned)) {
4537 /*
4538 * see if there's another uio vector to
4539 * process that's of type IO_DIRECT
4540 *
4541 * break out of while loop to get there
4542 */
4543 break;
4544 }
4545 /*
4546 * assume the request ends on a device block boundary
4547 */
4548 io_min = devblocksize;
4549
4550 /*
4551 * we can handle I/O's in multiples of the device block size
4552 * however, if io_size isn't a multiple of devblocksize we
4553 * want to clip it back to the nearest page boundary since
4554 * we are going to have to go through cluster_read_copy to
4555 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4556 * multiple, we avoid asking the drive for the same physical
4557 * blocks twice.. once for the partial page at the end of the
4558 * request and a 2nd time for the page we read into the cache
4559 * (which overlaps the end of the direct read) in order to
4560 * get at the overhang bytes
4561 */
4562 if (io_size & (devblocksize - 1)) {
4563 assert(!(flags & IO_ENCRYPTED));
4564 /*
4565 * Clip the request to the previous page size boundary
4566 * since the request does NOT end on a device block boundary
4567 */
4568 io_size &= ~PAGE_MASK;
4569 io_min = PAGE_SIZE;
4570 }
4571 if (retval || io_size < io_min) {
4572 /*
4573 * either an error or we only have the tail left to
4574 * complete via the copy path...
4575 * we may have already spun some portion of this request
4576 * off as async requests... we need to wait for the I/O
4577 * to complete before returning
4578 */
4579 goto wait_for_dreads;
4580 }
4581
4582 /*
4583 * Don't re-check the UBC data if we are looking for uncached IO
4584 * or asking for encrypted blocks.
4585 */
4586 if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
4587
4588 if ((xsize = io_size) > max_rd_size)
4589 xsize = max_rd_size;
4590
4591 io_size = 0;
4592
4593 if (!lock) {
4594 /*
4595 * We hold a lock here between the time we check the
4596 * cache and the time we issue I/O. This saves us
4597 * from having to lock the pages in the cache. Not
4598 * all clients will care about this lock but some
4599 * clients may want to guarantee stability between
4600 * here and when the I/O is issued in which case they
4601 * will take the lock exclusively.
4602 */
4603 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4604 }
4605
4606 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4607
4608 if (io_size == 0) {
4609 /*
4610 * a page must have just come into the cache
4611 * since the first page in this range is no
4612 * longer absent, go back and re-evaluate
4613 */
4614 continue;
4615 }
4616 }
4617 if ( (flags & IO_RETURN_ON_THROTTLE) ) {
4618 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4619 if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
4620 /*
4621 * we're in the throttle window and at least 1 I/O
4622 * has already been issued by a throttleable thread
4623 * in this window, so return with EAGAIN to indicate
4624 * to the FS issuing the cluster_read call that it
4625 * should now throttle after dropping any locks
4626 */
4627 throttle_info_update_by_mount(vp->v_mount);
4628
4629 io_throttled = TRUE;
4630 goto wait_for_dreads;
4631 }
4632 }
4633 }
4634 if (io_size > max_rd_size)
4635 io_size = max_rd_size;
4636
4637 iov_base = uio_curriovbase(uio);
4638
4639 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4640 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
4641
4642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
4643 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
4644
4645 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
4646 no_zero_fill = 1;
4647 else
4648 no_zero_fill = 0;
4649
4650 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4651 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
4652 pages_in_pl = 0;
4653 upl_size = upl_needed_size;
4654 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
4655 | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
4656 if (no_zero_fill)
4657 upl_flags |= UPL_NOZEROFILL;
4658 if (force_data_sync)
4659 upl_flags |= UPL_FORCE_DATA_SYNC;
4660
4661 kret = vm_map_create_upl(map,
4662 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4663 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
4664
4665 if (kret != KERN_SUCCESS) {
4666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4667 (int)upl_offset, upl_size, io_size, kret, 0);
4668 /*
4669 * failed to get pagelist
4670 *
4671 * we may have already spun some portion of this request
4672 * off as async requests... we need to wait for the I/O
4673 * to complete before returning
4674 */
4675 goto wait_for_dreads;
4676 }
4677 pages_in_pl = upl_size / PAGE_SIZE;
4678 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
4679
4680 for (i = 0; i < pages_in_pl; i++) {
4681 if (!upl_page_present(pl, i))
4682 break;
4683 }
4684 if (i == pages_in_pl)
4685 break;
4686
4687 ubc_upl_abort(upl, 0);
4688 }
4689 if (force_data_sync >= 3) {
4690 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4691 (int)upl_offset, upl_size, io_size, kret, 0);
4692
4693 goto wait_for_dreads;
4694 }
4695 /*
4696 * Consider the possibility that upl_size wasn't satisfied.
4697 */
4698 if (upl_size < upl_needed_size) {
4699 if (upl_size && upl_offset == 0)
4700 io_size = upl_size;
4701 else
4702 io_size = 0;
4703 }
4704 if (io_size == 0) {
4705 ubc_upl_abort(upl, 0);
4706 goto wait_for_dreads;
4707 }
4708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4709 (int)upl_offset, upl_size, io_size, kret, 0);
4710
4711 if (useVectorUPL) {
4712 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
4713 if (end_off)
4714 issueVectorUPL = 1;
4715 /*
4716 * After this point, if we are using a vector UPL, then
4717 * either all the UPL elements end on a page boundary OR
4718 * this UPL is the last element because it does not end
4719 * on a page boundary.
4720 */
4721 }
4722
4723 /*
4724 * request asynchronously so that we can overlap
4725 * the preparation of the next I/O...
4726 * if there are already too many outstanding reads
4727 * wait until some have completed before issuing the next read
4728 */
4729 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
4730
4731 if (iostate.io_error) {
4732 /*
4733 * one of the earlier reads we issued ran into a hard error
4734 * don't issue any more reads, cleanup the UPL
4735 * that was just created but not used, then
4736 * go wait for any other reads to complete before
4737 * returning the error to the caller
4738 */
4739 ubc_upl_abort(upl, 0);
4740
4741 goto wait_for_dreads;
4742 }
4743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
4744 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
4745
4746 if (!useVectorUPL) {
4747 if (no_zero_fill)
4748 io_flag &= ~CL_PRESERVE;
4749 else
4750 io_flag |= CL_PRESERVE;
4751
4752 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4753
4754 } else {
4755
4756 if (!vector_upl_index) {
4757 vector_upl = vector_upl_create(upl_offset);
4758 v_upl_uio_offset = uio->uio_offset;
4759 vector_upl_offset = upl_offset;
4760 }
4761
4762 vector_upl_set_subupl(vector_upl, upl, upl_size);
4763 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4764 vector_upl_index++;
4765 vector_upl_size += upl_size;
4766 vector_upl_iosize += io_size;
4767
4768 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4769 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4770 reset_vector_run_state();
4771 }
4772 }
4773 last_iov_base = iov_base + io_size;
4774
4775 if (lock) {
4776 // We don't need to wait for the I/O to complete
4777 cluster_unlock_direct_read(lock);
4778 lock = NULL;
4779 }
4780
4781 /*
4782 * update the uio structure
4783 */
4784 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
4785 uio_update(uio, (user_size_t)max_io_size);
4786 } else {
4788 uio_update(uio, (user_size_t)io_size);
4789 }
4790
4791 io_req_size -= io_size;
4792
4793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
4794 upl, (int)uio->uio_offset, io_req_size, retval, 0);
4795
4796 } /* end while */
4797
4798 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
4799
4800 retval = cluster_io_type(uio, read_type, read_length, 0);
4801
4802 if (retval == 0 && *read_type == IO_DIRECT) {
4803
4804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4805 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4806
4807 goto next_dread;
4808 }
4809 }
4810
4811wait_for_dreads:
4812
4813 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
4814 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4815 reset_vector_run_state();
4816 }
4817
4818 // We don't need to wait for the I/O to complete
4819 if (lock)
4820 cluster_unlock_direct_read(lock);
4821
4822 /*
4823 * make sure all async reads that are part of this stream
4824 * have completed before we return
4825 */
4826 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
4827
4828 if (iostate.io_error)
4829 retval = iostate.io_error;
4830
4831 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4832
4833 if (io_throttled == TRUE && retval == 0)
4834 retval = EAGAIN;
4835
4836 for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
4837 /*
4838 * This is specifically done for pmap accounting purposes.
4839 * vm_pre_fault() will call vm_fault() to enter the page into
4840 * the pmap if there isn't _a_ physical page for that VA already.
4841 */
4842 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
4843 }
4844
4845 if (io_req_size && retval == 0) {
4846 /*
4847 * we couldn't handle the tail of this request in DIRECT mode
4848 * so fire it through the copy path
4849 */
4850 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
4851
4852 *read_type = IO_UNKNOWN;
4853 }
4854 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
4855 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
4856
4857 return (retval);
4858}
4859
4860
4861static int
4862cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4863 int (*callback)(buf_t, void *), void *callback_arg, int flags)
4864{
4865 upl_page_info_t *pl;
4866 upl_t upl[MAX_VECTS];
4867 vm_offset_t upl_offset;
4868 addr64_t dst_paddr = 0;
4869 user_addr_t iov_base;
4870 off_t max_size;
4871 upl_size_t upl_size;
4872 vm_size_t upl_needed_size;
4873 mach_msg_type_number_t pages_in_pl;
4874 upl_control_flags_t upl_flags;
4875 kern_return_t kret;
4876 struct clios iostate;
4877 int error = 0;
4878 int cur_upl = 0;
4879 int num_upl = 0;
4880 int n;
4881 u_int32_t xsize;
4882 u_int32_t io_size;
4883 u_int32_t devblocksize;
4884 u_int32_t mem_alignment_mask;
4885 u_int32_t tail_size = 0;
4886 int bflag;
4887
4888 if (flags & IO_PASSIVE)
4889 bflag = CL_PASSIVE;
4890 else
4891 bflag = 0;
4892
4893 if (flags & IO_NOCACHE)
4894 bflag |= CL_NOCACHE;
4895
4896 /*
4897 * When we enter this routine, we know
4898 * -- the read_length will not exceed the current iov_len
4899 * -- the target address is physically contiguous for read_length
4900 */
4901 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
4902
4903 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4904 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4905
4906 iostate.io_completed = 0;
4907 iostate.io_issued = 0;
4908 iostate.io_error = 0;
4909 iostate.io_wanted = 0;
4910
4911 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4912
4913next_cread:
4914 io_size = *read_length;
4915
4916 max_size = filesize - uio->uio_offset;
4917
4918 if (io_size > max_size)
4919 io_size = max_size;
4920
4921 iov_base = uio_curriovbase(uio);
4922
4923 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4924 upl_needed_size = upl_offset + io_size;
4925
4926 pages_in_pl = 0;
4927 upl_size = upl_needed_size;
4928 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE
4929 | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
4930
4931
4932 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
4933 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
4934
4935 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4936 kret = vm_map_get_upl(map,
4937 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4938 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0);
4939
4940 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
4941 (int)upl_offset, upl_size, io_size, kret, 0);
4942
4943 if (kret != KERN_SUCCESS) {
4944 /*
4945 * failed to get pagelist
4946 */
4947 error = EINVAL;
4948 goto wait_for_creads;
4949 }
4950 num_upl++;
4951
4952 if (upl_size < upl_needed_size) {
4953 /*
4954 * The upl_size wasn't satisfied.
4955 */
4956 error = EINVAL;
4957 goto wait_for_creads;
4958 }
4959 pl = ubc_upl_pageinfo(upl[cur_upl]);
4960
4961 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
4962
4963 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
4964 u_int32_t head_size;
4965
4966 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
4967
4968 if (head_size > io_size)
4969 head_size = io_size;
4970
4971 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
4972
4973 if (error)
4974 goto wait_for_creads;
4975
4976 upl_offset += head_size;
4977 dst_paddr += head_size;
4978 io_size -= head_size;
4979
4980 iov_base += head_size;
4981 }
4982 if ((u_int32_t)iov_base & mem_alignment_mask) {
4983 /*
4984 * the request isn't aligned on a memory boundary
4985 * that the underlying DMA engine can handle...
4986 * return an error instead of going through
4987 * the slow copy path since the intent of this
4988 * path is direct I/O to device memory
4989 */
4990 error = EINVAL;
4991 goto wait_for_creads;
4992 }
4993
4994 tail_size = io_size & (devblocksize - 1);
4995
4996 io_size -= tail_size;
4997
4998 while (io_size && error == 0) {
4999
5000 if (io_size > MAX_IO_CONTIG_SIZE)
5001 xsize = MAX_IO_CONTIG_SIZE;
5002 else
5003 xsize = io_size;
5004 /*
5005 * request asynchronously so that we can overlap
5006 * the preparation of the next I/O... we'll do
5007 * the commit after all the I/O has completed
5008 * since it's all issued against the same UPL...
5009 * if there are already too many outstanding reads
5010 * wait until some have completed before issuing the next
5011 */
5012 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
5013
5014 if (iostate.io_error) {
5015 /*
5016 * one of the earlier reads we issued ran into a hard error
5017 * don't issue any more reads...
5018 * go wait for any other reads to complete before
5019 * returning the error to the caller
5020 */
5021 goto wait_for_creads;
5022 }
5023 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5024 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5025 (buf_t)NULL, &iostate, callback, callback_arg);
5026 /*
5027 * The cluster_io read was issued successfully,
5028 * update the uio structure
5029 */
5030 if (error == 0) {
5031 uio_update(uio, (user_size_t)xsize);
5032
5033 dst_paddr += xsize;
5034 upl_offset += xsize;
5035 io_size -= xsize;
5036 }
5037 }
5038 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5039
5040 error = cluster_io_type(uio, read_type, read_length, 0);
5041
5042 if (error == 0 && *read_type == IO_CONTIG) {
5043 cur_upl++;
5044 goto next_cread;
5045 }
5046 } else
5047 *read_type = IO_UNKNOWN;
5048
5049wait_for_creads:
5050 /*
5051 * make sure all async reads that are part of this stream
5052 * have completed before we proceed
5053 */
5054 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5055
5056 if (iostate.io_error)
5057 error = iostate.io_error;
5058
5059 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5060
5061 if (error == 0 && tail_size)
5062 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5063
5064 for (n = 0; n < num_upl; n++)
5065 /*
5066 * just release our hold on each physically contiguous
5067 * region without changing any state
5068 */
5069 ubc_upl_abort(upl[n], 0);
5070
5071 return (error);
5072}
5073
5074
5075static int
5076cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5077{
5078 user_size_t iov_len;
5079 user_addr_t iov_base = 0;
5080 upl_t upl;
5081 upl_size_t upl_size;
5082 upl_control_flags_t upl_flags;
5083 int retval = 0;
5084
5085 /*
5086 * skip over any empty vectors
5087 */
5088 uio_update(uio, (user_size_t)0);
5089
5090 iov_len = uio_curriovlen(uio);
5091
5092 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5093
5094 if (iov_len) {
5095 iov_base = uio_curriovbase(uio);
5096 /*
5097 * make sure the size of the vector isn't too big...
5098 * internally, we want to handle all of the I/O in
5099 * chunk sizes that fit in a 32 bit int
5100 */
5101 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
5102 upl_size = MAX_IO_REQUEST_SIZE;
5103 else
5104 upl_size = (u_int32_t)iov_len;
5105
5106 upl_flags = UPL_QUERY_OBJECT_TYPE | UPL_MEMORY_TAG_MAKE(VM_KERN_MEMORY_FILE);
5107
5108 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5109 if ((vm_map_get_upl(map,
5110 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5111 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
5112 /*
5113 * the user app must have passed in an invalid address
5114 */
5115 retval = EFAULT;
5116 }
5117 if (upl_size == 0)
5118 retval = EFAULT;
5119
5120 *io_length = upl_size;
5121
5122 if (upl_flags & UPL_PHYS_CONTIG)
5123 *io_type = IO_CONTIG;
5124 else if (iov_len >= min_length)
5125 *io_type = IO_DIRECT;
5126 else
5127 *io_type = IO_COPY;
5128 } else {
5129 /*
5130 * nothing left to do for this uio
5131 */
5132 *io_length = 0;
5133 *io_type = IO_UNKNOWN;
5134 }
5135 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5136
5137 return (retval);
5138}
5139
5140
5141/*
5142 * generate advisory I/O's in the largest chunks possible...
5143 * the completed pages will be released into the VM cache
5144 */
5145int
5146advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5147{
5148 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5149}
5150
5151int
5152advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5153{
5154 upl_page_info_t *pl;
5155 upl_t upl;
5156 vm_offset_t upl_offset;
5157 int upl_size;
5158 off_t upl_f_offset;
5159 int start_offset;
5160 int start_pg;
5161 int last_pg;
5162 int pages_in_upl;
5163 off_t max_size;
5164 int io_size;
5165 kern_return_t kret;
5166 int retval = 0;
5167 int issued_io;
5168 int skip_range;
5169 uint32_t max_io_size;
5170
5171
5172 if ( !UBCINFOEXISTS(vp))
5173 return(EINVAL);
5174
5175 if (resid < 0)
5176 return(EINVAL);
5177
5178 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5179
5180 if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) {
5181 if (max_io_size > speculative_prefetch_max_iosize)
5182 max_io_size = speculative_prefetch_max_iosize;
5183 }
5184
5185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5186 (int)f_offset, resid, (int)filesize, 0, 0);
5187
5188 while (resid && f_offset < filesize && retval == 0) {
5189 /*
5190 * compute the size of the upl needed to encompass
5191 * the requested read... limit each call to cluster_io
5192 * to the maximum UPL size... cluster_io will clip if
5193 * this exceeds the maximum io_size for the device...
5194 * make sure to account for a starting offset
5195 * that's not page aligned
5196 */
5197 start_offset = (int)(f_offset & PAGE_MASK_64);
5198 upl_f_offset = f_offset - (off_t)start_offset;
5199 max_size = filesize - f_offset;
5200
5201 if (resid < max_size)
5202 io_size = resid;
5203 else
5204 io_size = max_size;
5205
5206 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5207 if ((uint32_t)upl_size > max_io_size)
5208 upl_size = max_io_size;
5209
5210 skip_range = 0;
5211 /*
5212 * return the number of contiguously present pages in the cache
5213 * starting at upl_f_offset within the file
5214 */
5215 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5216
5217 if (skip_range) {
5218 /*
5219 * skip over pages already present in the cache
5220 */
5221 io_size = skip_range - start_offset;
5222
5223 f_offset += io_size;
5224 resid -= io_size;
5225
5226 if (skip_range == upl_size)
5227 continue;
5228 /*
5229 * have to issue some real I/O
5230 * at this point, we know it's starting on a page boundary
5231 * because we've skipped over at least the first page in the request
5232 */
5233 start_offset = 0;
5234 upl_f_offset += skip_range;
5235 upl_size -= skip_range;
5236 }
5237 pages_in_upl = upl_size / PAGE_SIZE;
5238
5239 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5240 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5241
5242 kret = ubc_create_upl(vp,
5243 upl_f_offset,
5244 upl_size,
5245 &upl,
5246 &pl,
5247 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
5248 if (kret != KERN_SUCCESS)
5249 return(retval);
5250 issued_io = 0;
5251
5252 /*
5253 * before we start marching forward, we must make sure we end on
5254 * a present page, otherwise we will be working with a freed
5255 * upl
5256 */
5257 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5258 if (upl_page_present(pl, last_pg))
5259 break;
5260 }
5261 pages_in_upl = last_pg + 1;
5262
5263
5264 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5265 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5266
5267
5268 for (last_pg = 0; last_pg < pages_in_upl; ) {
5269 /*
5270 * scan from the beginning of the upl looking for the first
5271 * page that is present.... this will become the first page in
5272 * the request we're going to make to 'cluster_io'... if all
5273 * of the pages are absent, we won't call through to 'cluster_io'
5274 */
5275 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5276 if (upl_page_present(pl, start_pg))
5277 break;
5278 }
5279
5280 /*
5281 * scan from the starting present page looking for an absent
5282 * page before the end of the upl is reached, if we
5283 * find one, then it will terminate the range of pages being
5284 * presented to 'cluster_io'
5285 */
5286 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5287 if (!upl_page_present(pl, last_pg))
5288 break;
5289 }
5290
5291 if (last_pg > start_pg) {
5292 /*
5293 * we found a range of pages that must be filled...
5294 * if the last page in this range is the last page of the file,
5295 * we may have to clip the size of it to keep from reading past
5296 * the end of the last physical block associated with the file
5297 */
5298 upl_offset = start_pg * PAGE_SIZE;
5299 io_size = (last_pg - start_pg) * PAGE_SIZE;
5300
5301 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
5302 io_size = filesize - (upl_f_offset + upl_offset);
5303
5304 /*
5305 * issue an asynchronous read to cluster_io
5306 */
5307 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5308 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5309
5310 issued_io = 1;
5311 }
5312 }
5313 if (issued_io == 0)
5314 ubc_upl_abort(upl, 0);
5315
5316 io_size = upl_size - start_offset;
5317
5318 if (io_size > resid)
5319 io_size = resid;
5320 f_offset += io_size;
5321 resid -= io_size;
5322 }
5323
5324 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5325 (int)f_offset, resid, retval, 0, 0);
5326
5327 return(retval);
5328}
5329
5330
5331int
5332cluster_push(vnode_t vp, int flags)
5333{
5334 return cluster_push_ext(vp, flags, NULL, NULL);
5335}
5336
5337
5338int
5339cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5340{
5341 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5342}
5343
5344 /* errors are reported via err, but the return value is the number of clusters written */
5345int
5346cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5347{
5348 int retval;
5349 int my_sparse_wait = 0;
5350 struct cl_writebehind *wbp;
5351
5352 if (err)
5353 *err = 0;
5354
5355 if ( !UBCINFOEXISTS(vp)) {
5356 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5357 return (0);
5358 }
5359 /* return if deferred write is set */
5360 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5361 return (0);
5362 }
5363 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5364 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5365 return (0);
5366 }
5367 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5368 lck_mtx_unlock(&wbp->cl_lockw);
5369
5370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5371 return(0);
5372 }
5373 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5374 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5375
5376 /*
5377 * if we have an fsync in progress, we don't want to allow any additional
5378 * sync/fsync/close(s) to occur until it finishes.
5379	 * note that it's possible for writes to continue to occur to this file
5380	 * while we're waiting, and also once the fsync starts to clean, if we're
5381	 * in the sparse map case
5382 */
5383 while (wbp->cl_sparse_wait) {
5384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5385
5386 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5387
5388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5389 }
5390 if (flags & IO_SYNC) {
5391 my_sparse_wait = 1;
5392 wbp->cl_sparse_wait = 1;
5393
5394 /*
5395 * this is an fsync (or equivalent)... we must wait for any existing async
5396	 * cleaning operations to complete before we evaluate the current state
5397	 * and finish cleaning... this ensures that all writes issued before this
5398 * fsync actually get cleaned to the disk before this fsync returns
5399 */
5400 while (wbp->cl_sparse_pushes) {
5401 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5402
5403 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5404
5405 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5406 }
5407 }
5408 if (wbp->cl_scmap) {
5409 void *scmap;
5410
5411 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
5412
5413 scmap = wbp->cl_scmap;
5414 wbp->cl_scmap = NULL;
5415
5416 wbp->cl_sparse_pushes++;
5417
5418 lck_mtx_unlock(&wbp->cl_lockw);
5419
5420 retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
5421
5422 lck_mtx_lock(&wbp->cl_lockw);
5423
5424 wbp->cl_sparse_pushes--;
5425
5426 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
5427 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5428 } else {
5429 retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
5430 }
5431 if (err)
5432 *err = retval;
5433 retval = 1;
5434 } else {
5435 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err);
5436 }
5437 lck_mtx_unlock(&wbp->cl_lockw);
5438
5439 if (flags & IO_SYNC)
5440 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5441
5442 if (my_sparse_wait) {
5443 /*
5444 * I'm the owner of the serialization token
5445 * clear it and wakeup anyone that is waiting
5446 * for me to finish
5447 */
5448 lck_mtx_lock(&wbp->cl_lockw);
5449
5450 wbp->cl_sparse_wait = 0;
5451 wakeup((caddr_t)&wbp->cl_sparse_wait);
5452
5453 lck_mtx_unlock(&wbp->cl_lockw);
5454 }
5455 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5456 wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
5457
5458 return (retval);
5459}
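/*
 * Illustrative sketch (not compiled in): how a filesystem's fsync path
 * might drive the push entry points above.  'example_fsync_push' is
 * hypothetical; cluster_push_err() and IO_SYNC are the real interfaces.
 */
#if 0
static int
example_fsync_push(vnode_t vp)
{
	int error = 0;

	/*
	 * push all delayed-write clusters synchronously; I/O errors are
	 * reported through 'error' rather than through the return value
	 */
	(void) cluster_push_err(vp, IO_SYNC, NULL, NULL, &error);

	return (error);
}
#endif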
5460
5461
5462__private_extern__ void
5463cluster_release(struct ubc_info *ubc)
5464{
5465 struct cl_writebehind *wbp;
5466 struct cl_readahead *rap;
5467
5468 if ((wbp = ubc->cl_wbehind)) {
5469
5470 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5471
5472 if (wbp->cl_scmap)
5473 vfs_drt_control(&(wbp->cl_scmap), 0);
5474 } else {
5475 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5476 }
5477
5478 rap = ubc->cl_rahead;
5479
5480 if (wbp != NULL) {
5481 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5482 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
5483 }
5484 if ((rap = ubc->cl_rahead)) {
5485 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5486 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
5487 }
5488 ubc->cl_rahead = NULL;
5489 ubc->cl_wbehind = NULL;
5490
5491 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5492}
5493
5494
5495static int
5496cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5497{
5498 int cl_index;
5499 int cl_index1;
5500 int min_index;
5501 int cl_len;
5502 int cl_pushed = 0;
5503 struct cl_wextent l_clusters[MAX_CLUSTERS];
5504 u_int max_cluster_pgcount;
5505 int error = 0;
5506
5507 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
5508 /*
5509 * the write behind context exists and has
5510 * already been locked...
5511 */
5512 if (wbp->cl_number == 0)
5513 /*
5514 * no clusters to push
5515 * return number of empty slots
5516 */
5517 return (MAX_CLUSTERS);
5518
5519 /*
5520 * make a local 'sorted' copy of the clusters
5521 * and clear wbp->cl_number so that new clusters can
5522 * be developed
5523 */
5524 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5525 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5526 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
5527 continue;
5528 if (min_index == -1)
5529 min_index = cl_index1;
5530 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
5531 min_index = cl_index1;
5532 }
5533 if (min_index == -1)
5534 break;
5535
5536 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
5537 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
5538 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
5539
5540 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
5541 }
5542 wbp->cl_number = 0;
5543
5544 cl_len = cl_index;
5545
5546	/* skip switching to the sparse cluster mechanism if on a disk image */
5547 if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) &&
5548 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) {
5549 int i;
5550
5551 /*
5552 * determine if we appear to be writing the file sequentially
5553 * if not, by returning without having pushed any clusters
5554 * we will cause this vnode to be pushed into the sparse cluster mechanism
5555 * used for managing more random I/O patterns
5556 *
5557 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5558 * that's why we're in try_push with PUSH_DELAY...
5559 *
5560 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5561 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5562 * so we can just make a simple pass through, up to, but not including the last one...
5563 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5564 * are sequential
5565 *
5566 * we let the last one be partial as long as it was adjacent to the previous one...
5567 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5568 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5569 */
5570 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
5571 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
5572 goto dont_try;
5573 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
5574 goto dont_try;
5575 }
5576 }
5577 for (cl_index = 0; cl_index < cl_len; cl_index++) {
5578 int flags;
5579 struct cl_extent cl;
5580 int retval;
5581
5582 flags = io_flags & (IO_PASSIVE|IO_CLOSE);
5583
5584 /*
5585 * try to push each cluster in turn...
5586 */
5587 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
5588 flags |= IO_NOCACHE;
5589
5590 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
5591 flags |= IO_PASSIVE;
5592
5593 if (push_flag & PUSH_SYNC)
5594 flags |= IO_SYNC;
5595
5596 cl.b_addr = l_clusters[cl_index].b_addr;
5597 cl.e_addr = l_clusters[cl_index].e_addr;
5598
5599 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
5600
5601 if (error == 0 && retval)
5602 error = retval;
5603
5604 l_clusters[cl_index].b_addr = 0;
5605 l_clusters[cl_index].e_addr = 0;
5606
5607 cl_pushed++;
5608
5609 if ( !(push_flag & PUSH_ALL) )
5610 break;
5611 }
5612 if (err)
5613 *err = error;
5614
5615dont_try:
5616 if (cl_len > cl_pushed) {
5617 /*
5618 * we didn't push all of the clusters, so
5619 * lets try to merge them back in to the vnode
5620 */
5621 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5622 /*
5623 * we picked up some new clusters while we were trying to
5624 * push the old ones... this can happen because I've dropped
5625 * the vnode lock... the sum of the
5626 * leftovers plus the new cluster count exceeds our ability
5627 * to represent them, so switch to the sparse cluster mechanism
5628 *
5629 * collect the active public clusters...
5630 */
5631 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
5632
5633 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5634 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5635 continue;
5636 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5637 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5638 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5639
5640 cl_index1++;
5641 }
5642 /*
5643 * update the cluster count
5644 */
5645 wbp->cl_number = cl_index1;
5646
5647 /*
5648 * and collect the original clusters that were moved into the
5649 * local storage for sorting purposes
5650 */
5651 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
5652
5653 } else {
5654 /*
5655 * we've got room to merge the leftovers back in
5656 * just append them starting at the next 'hole'
5657 * represented by wbp->cl_number
5658 */
5659 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5660 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
5661 continue;
5662
5663 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5664 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5665 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5666
5667 cl_index1++;
5668 }
5669 /*
5670 * update the cluster count
5671 */
5672 wbp->cl_number = cl_index1;
5673 }
5674 }
5675 return (MAX_CLUSTERS - wbp->cl_number);
5676}
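/*
 * Worked example of the sequentiality test above (illustrative values:
 * max_cluster_pgcount == 32, MAX_CLUSTERS == 4): the sorted clusters
 * [0,32) [32,64) [64,96) [96,100) pass, since the first three are full
 * and each e_addr equals the next b_addr, so they are pushed here rather
 * than spilling into the sparse map.  [0,32) [40,72) [72,104) [104,110)
 * fails the adjacency check on the first pair and jumps to dont_try, so
 * nothing is pushed and the vnode falls over to the sparse-cluster path.
 */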
5677
5678
5679
5680static int
5681cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5682{
5683 upl_page_info_t *pl;
5684 upl_t upl;
5685 vm_offset_t upl_offset;
5686 int upl_size;
5687 off_t upl_f_offset;
5688 int pages_in_upl;
5689 int start_pg;
5690 int last_pg;
5691 int io_size;
5692 int io_flags;
5693 int upl_flags;
5694 int bflag;
5695 int size;
5696 int error = 0;
5697 int retval;
5698 kern_return_t kret;
5699
5700 if (flags & IO_PASSIVE)
5701 bflag = CL_PASSIVE;
5702 else
5703 bflag = 0;
5704
5705 if (flags & IO_SKIP_ENCRYPTION)
5706 bflag |= CL_ENCRYPTED;
5707
5708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
5709 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
5710
5711 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
5712 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
5713
5714 return (0);
5715 }
5716 upl_size = pages_in_upl * PAGE_SIZE;
5717 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5718
5719 if (upl_f_offset + upl_size >= EOF) {
5720
5721 if (upl_f_offset >= EOF) {
5722 /*
5723 * must have truncated the file and missed
5724 * clearing a dangling cluster (i.e. it's completely
5725	 * beyond the new EOF)
5726 */
5727 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
5728
5729 return(0);
5730 }
5731 size = EOF - upl_f_offset;
5732
5733 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5734 pages_in_upl = upl_size / PAGE_SIZE;
5735 } else
5736 size = upl_size;
5737
5738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
5739
5740 /*
5741 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
5742 *
5743 * - only pages that are currently dirty are returned... these are the ones we need to clean
5744 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
5745 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
5746 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
5747 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
5748 *
5749 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
5750 */
5751
5752 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
5753 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
5754 else
5755 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
5756
5757 kret = ubc_create_upl(vp,
5758 upl_f_offset,
5759 upl_size,
5760 &upl,
5761 &pl,
5762 upl_flags);
5763 if (kret != KERN_SUCCESS)
5764 panic("cluster_push: failed to get pagelist");
5765
5766 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
5767
5768 /*
5769 * since we only asked for the dirty pages back
5770 * it's possible that we may only get a few or even none, so...
5771 * before we start marching forward, we must make sure we know
5772 * where the last present page is in the UPL, otherwise we could
5773 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
5774 * employed by commit_range and abort_range.
5775 */
5776 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5777 if (upl_page_present(pl, last_pg))
5778 break;
5779 }
5780 pages_in_upl = last_pg + 1;
5781
5782 if (pages_in_upl == 0) {
5783 ubc_upl_abort(upl, 0);
5784
5785 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
5786 return(0);
5787 }
5788
5789 for (last_pg = 0; last_pg < pages_in_upl; ) {
5790 /*
5791 * find the next dirty page in the UPL
5792 * this will become the first page in the
5793 * next I/O to generate
5794 */
5795 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5796 if (upl_dirty_page(pl, start_pg))
5797 break;
5798 if (upl_page_present(pl, start_pg))
5799 /*
5800 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
5801 * just release these unchanged since we're not going
5802 * to steal them or change their state
5803 */
5804 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
5805 }
5806 if (start_pg >= pages_in_upl)
5807 /*
5808 * done... no more dirty pages to push
5809 */
5810 break;
5811 if (start_pg > last_pg)
5812 /*
5813 * skipped over some non-dirty pages
5814 */
5815 size -= ((start_pg - last_pg) * PAGE_SIZE);
5816
5817 /*
5818 * find a range of dirty pages to write
5819 */
5820 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5821 if (!upl_dirty_page(pl, last_pg))
5822 break;
5823 }
5824 upl_offset = start_pg * PAGE_SIZE;
5825
5826 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
5827
5828 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
5829
5830 if ( !(flags & IO_SYNC))
5831 io_flags |= CL_ASYNC;
5832
5833 if (flags & IO_CLOSE)
5834 io_flags |= CL_CLOSE;
5835
5836 if (flags & IO_NOCACHE)
5837 io_flags |= CL_NOCACHE;
5838
5839 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5840 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5841
5842 if (error == 0 && retval)
5843 error = retval;
5844
5845 size -= io_size;
5846 }
5847 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
5848
5849 return(error);
5850}
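/*
 * Illustrative sketch (not compiled in): gathering only the dirty pages
 * of a range into a UPL, as cluster_push_now() does above.  The function
 * and its arguments are hypothetical; ubc_create_upl() and the UPL_*
 * flags are the real interfaces.
 */
#if 0
static kern_return_t
example_gather_dirty(vnode_t vp, off_t f_offset, int upl_size)
{
	upl_t upl;
	upl_page_info_t *pl;
	kern_return_t kret;

	/*
	 * UPL_COPYOUT_FROM + UPL_RET_ONLY_DIRTY returns only the pages
	 * that currently need cleaning; clean pages show up as absent
	 */
	kret = ubc_create_upl(vp, f_offset, upl_size, &upl, &pl,
	    UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE);
	if (kret != KERN_SUCCESS)
		return (kret);

	/* ... scan 'pl' for dirty runs and hand them to cluster_io() ... */

	/*
	 * releasing with a plain abort leaves the software dirty bits set,
	 * so nothing is lost if this sketch cleans no pages
	 */
	ubc_upl_abort(upl, 0);
	return (KERN_SUCCESS);
}
#endif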
5851
5852
5853/*
5854 * sparse_cluster_switch is called with the write behind lock held
5855 */
5856static void
5857sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
5858{
5859 int cl_index;
5860
5861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
5862
5863 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5864 int flags;
5865 struct cl_extent cl;
5866
5867 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
5868
5869 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
5870 if (flags & UPL_POP_DIRTY) {
5871 cl.e_addr = cl.b_addr + 1;
5872
5873 sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
5874 }
5875 }
5876 }
5877 }
5878 wbp->cl_number = 0;
5879
5880 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
5881}
5882
5883
5884/*
5885 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
5886 * still associated with the write-behind context... however, if the scmap has been disassociated
5887 * from the write-behind context (the cluster_push case), the wb lock is not held
5888 */
5889static int
5890sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
5891{
5892 struct cl_extent cl;
5893 off_t offset;
5894 u_int length;
5895 int error = 0;
5896
5897 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
5898
5899 if (push_flag & PUSH_ALL)
5900 vfs_drt_control(scmap, 1);
5901
5902 for (;;) {
5903 int retval;
5904 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
5905 break;
5906
5907 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
5908 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
5909
5910 retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);
5911 if (error == 0 && retval)
5912 error = retval;
5913
5914 if ( !(push_flag & PUSH_ALL) )
5915 break;
5916 }
5917 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
5918
5919 return error;
5920}
5921
5922
5923/*
5924 * sparse_cluster_add is called with the write behind lock held
5925 */
5926static void
5927sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
5928{
5929 u_int new_dirty;
5930 u_int length;
5931 off_t offset;
5932
5933 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
5934
5935 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
5936 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
5937
5938 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
5939 /*
5940 * no room left in the map
5941 * only a partial update was done
5942 * push out some pages and try again
5943 */
5944 sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);
5945
5946 offset += (new_dirty * PAGE_SIZE_64);
5947 length -= (new_dirty * PAGE_SIZE);
5948 }
5949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
5950}
5951
5952
5953static int
5954cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5955{
5956 upl_page_info_t *pl;
5957 upl_t upl;
5958 addr64_t ubc_paddr;
5959 kern_return_t kret;
5960 int error = 0;
5961 int did_read = 0;
5962 int abort_flags;
5963 int upl_flags;
5964 int bflag;
5965
5966 if (flags & IO_PASSIVE)
5967 bflag = CL_PASSIVE;
5968 else
5969 bflag = 0;
5970
5971 if (flags & IO_NOCACHE)
5972 bflag |= CL_NOCACHE;
5973
5974 upl_flags = UPL_SET_LITE;
5975
5976 if ( !(flags & CL_READ) ) {
5977 /*
5978 * "write" operation: let the UPL subsystem know
5979 * that we intend to modify the buffer cache pages
5980 * we're gathering.
5981 */
5982 upl_flags |= UPL_WILL_MODIFY;
5983 } else {
5984 /*
5985 * indicate that there is no need to pull the
5986 * mapping for this page... we're only going
5987 * to read from it, not modify it.
5988 */
5989 upl_flags |= UPL_FILE_IO;
5990 }
5991 kret = ubc_create_upl(vp,
5992 uio->uio_offset & ~PAGE_MASK_64,
5993 PAGE_SIZE,
5994 &upl,
5995 &pl,
5996 upl_flags);
5997
5998 if (kret != KERN_SUCCESS)
5999 return(EINVAL);
6000
6001 if (!upl_valid_page(pl, 0)) {
6002 /*
6003 * issue a synchronous read to cluster_io
6004 */
6005 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6006 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6007 if (error) {
6008 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6009
6010 return(error);
6011 }
6012 did_read = 1;
6013 }
6014 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6015
6016/*
6017 * NOTE: there is no prototype for copypv() in BSD. It, along with the
6018 * definitions of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, is found in
6019 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
6020 * way to do so without exporting them to kexts as well.
6021 */
6022 if (flags & CL_READ)
6023// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
6024 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6025 else
6026// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
6027 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
6028
6029 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6030 /*
6031 * issue a synchronous write to cluster_io
6032 */
6033 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6034 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6035 }
6036 if (error == 0)
6037 uio_update(uio, (user_size_t)xsize);
6038
6039 if (did_read)
6040 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6041 else
6042 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6043
6044 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6045
6046 return (error);
6047}
6048
6049int
6050cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6051{
6052 int pg_offset;
6053 int pg_index;
6054 int csize;
6055 int segflg;
6056 int retval = 0;
6057 int xsize;
6058 upl_page_info_t *pl;
6059 int dirty_count;
6060
6061 xsize = *io_resid;
6062
6063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6064 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6065
6066 segflg = uio->uio_segflg;
6067
6068 switch(segflg) {
6069
6070 case UIO_USERSPACE32:
6071 case UIO_USERISPACE32:
6072 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6073 break;
6074
6075 case UIO_USERSPACE:
6076 case UIO_USERISPACE:
6077 uio->uio_segflg = UIO_PHYS_USERSPACE;
6078 break;
6079
6080 case UIO_USERSPACE64:
6081 case UIO_USERISPACE64:
6082 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6083 break;
6084
6085 case UIO_SYSSPACE:
6086 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6087 break;
6088
6089 }
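	/*
	 * note: with the uio temporarily retagged as UIO_PHYS_*, the
	 * uiomove64() calls below treat 'paddr' as a physical address;
	 * the caller's original segflg is restored before we return
	 */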
6090 pl = ubc_upl_pageinfo(upl);
6091
6092 pg_index = upl_offset / PAGE_SIZE;
6093 pg_offset = upl_offset & PAGE_MASK;
6094 csize = min(PAGE_SIZE - pg_offset, xsize);
6095
6096 dirty_count = 0;
6097 while (xsize && retval == 0) {
6098 addr64_t paddr;
6099
6100 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6101 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE))
6102 dirty_count++;
6103
6104 retval = uiomove64(paddr, csize, uio);
6105
6106 pg_index += 1;
6107 pg_offset = 0;
6108 xsize -= csize;
6109 csize = min(PAGE_SIZE, xsize);
6110 }
6111 *io_resid = xsize;
6112
6113 uio->uio_segflg = segflg;
6114
6115 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6116 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6117 (int)uio->uio_offset, xsize, retval, segflg, 0);
6118
6119 return (retval);
6120}
6121
6122
6123int
6124cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6125{
6126
6127 return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
6128}
6129
6130
6131static int
6132cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6133{
6134 int segflg;
6135 int io_size;
6136 int xsize;
6137 int start_offset;
6138 int retval = 0;
6139 memory_object_control_t control;
6140
6141 io_size = *io_resid;
6142
6143 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6144 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6145
6146 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6147
6148 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6150 (int)uio->uio_offset, io_size, retval, 3, 0);
6151
6152 return(0);
6153 }
6154 segflg = uio->uio_segflg;
6155
6156 switch(segflg) {
6157
6158 case UIO_USERSPACE32:
6159 case UIO_USERISPACE32:
6160 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6161 break;
6162
6163 case UIO_USERSPACE64:
6164 case UIO_USERISPACE64:
6165 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6166 break;
6167
6168 case UIO_USERSPACE:
6169 case UIO_USERISPACE:
6170 uio->uio_segflg = UIO_PHYS_USERSPACE;
6171 break;
6172
6173 case UIO_SYSSPACE:
6174 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6175 break;
6176 }
6177
6178 if ( (io_size = *io_resid) ) {
6179 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6180 xsize = uio_resid(uio);
6181
6182 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6183 start_offset, io_size, mark_dirty, take_reference);
6184 xsize -= uio_resid(uio);
6185 io_size -= xsize;
6186 }
6187 uio->uio_segflg = segflg;
6188 *io_resid = io_size;
6189
6190 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6191 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6192
6193 return(retval);
6194}
6195
6196
6197int
6198is_file_clean(vnode_t vp, off_t filesize)
6199{
6200 off_t f_offset;
6201 int flags;
6202 int total_dirty = 0;
6203
6204 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6205 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6206 if (flags & UPL_POP_DIRTY) {
6207 total_dirty++;
6208 }
6209 }
6210 }
6211 if (total_dirty)
6212 return(EINVAL);
6213
6214 return (0);
6215}
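/*
 * Illustrative sketch (not compiled in): is_file_clean() as a cheap probe
 * before deciding whether a flush can be skipped.  The helper name is
 * hypothetical; ubc_getsize() is the real UBC call.
 */
#if 0
static int
example_needs_push(vnode_t vp)
{
	/* a non-zero (EINVAL) return means at least one resident dirty page */
	return (is_file_clean(vp, ubc_getsize(vp)) != 0);
}
#endif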
6216
6217
6218
6219/*
6220 * Dirty region tracking/clustering mechanism.
6221 *
6222 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6223 * dirty regions within a larger space (file). It is primarily intended to
6224 * support clustering in large files with many dirty areas.
6225 *
6226 * The implementation assumes that the dirty regions are pages.
6227 *
6228 * To represent dirty pages within the file, we store bit vectors in a
6229 * variable-size circular hash.
6230 */
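/*
 * Worked example (4K pages, so DRT_BITVECTOR_PAGES == 256 and each hash
 * entry covers 1MiB of the file): byte offset 0x12345678 aligns down to
 * the entry address 0x12300000; within that entry it is page
 * 0x45678 / 0x1000 == 69, i.e. bit 69 % 32 == 5 of dhe_bitvector[69 / 32],
 * which is dhe_bitvector[2], once the macros below are applied.
 */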
6231
6232/*
6233 * Bitvector size. This determines the number of pages we group in a
6234 * single hashtable entry. Each hashtable entry is aligned to this
6235 * size within the file.
6236 */
6237#define DRT_BITVECTOR_PAGES ((1024 * 1024) / PAGE_SIZE)
6238
6239/*
6240 * File offset handling.
6241 *
6242 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
6243 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6244 */
6245#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
6246#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
6247
6248/*
6249 * Hashtable address field handling.
6250 *
6251 * The low-order bits of the hashtable address are used to conserve
6252 * space.
6253 *
6254 * DRT_HASH_COUNT_MASK must be large enough to store the range
6255 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
6256 * to indicate that the bucket is actually unoccupied.
6257 */
6258#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
6259#define DRT_HASH_SET_ADDRESS(scm, i, a) \
6260 do { \
6261 (scm)->scm_hashtable[(i)].dhe_control = \
6262 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
6263 } while (0)
6264#define DRT_HASH_COUNT_MASK 0x1ff
6265#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
6266#define DRT_HASH_SET_COUNT(scm, i, c) \
6267 do { \
6268 (scm)->scm_hashtable[(i)].dhe_control = \
6269 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
6270 } while (0)
6271#define DRT_HASH_CLEAR(scm, i) \
6272 do { \
6273 (scm)->scm_hashtable[(i)].dhe_control = 0; \
6274 } while (0)
6275#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
6276#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
6277#define DRT_HASH_COPY(oscm, oi, scm, i) \
6278 do { \
6279 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
6280 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
6281	} while (0)
6282
6283
6284/*
6285 * Hash table moduli.
6286 *
6287 * Since the hashtable entry's size is dependent on the size of
6288 * the bitvector, and since the hashtable size is constrained to
6289 * both being prime and fitting within the desired allocation
6290 * size, these values need to be manually determined.
6291 *
6292 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
6293 *
6294 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
6295 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
6296 */
6297#define DRT_HASH_SMALL_MODULUS 23
6298#define DRT_HASH_LARGE_MODULUS 401
6299
6300/*
6301 * Physical memory required before the large hash modulus is permitted.
6302 *
6303 * On small memory systems, the large hash modulus can lead to physical
6304 * memory starvation, so we avoid using it there.
6305 */
6306#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
6307
6308#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
6309#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
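/*
 * Where the spare-byte figures above come from: each hash entry is
 * 8 (dhe_control) + (256 / 32) * 4 (bitvector) == 40 bytes, so
 * 23 * 40 == 920 (1024 - 920 == 104) and 401 * 40 == 16040
 * (16384 - 16040 == 344); the clustermap header consumes a little
 * of that slack.
 */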
6310
6311/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6312
6313/*
6314 * Hashtable bitvector handling.
6315 *
6316 * Bitvector fields are 32 bits long.
6317 */
6318
6319#define DRT_HASH_SET_BIT(scm, i, bit) \
6320 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
6321
6322#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
6323 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
6324
6325#define DRT_HASH_TEST_BIT(scm, i, bit) \
6326 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
6327
6328#define DRT_BITVECTOR_CLEAR(scm, i) \
6329 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6330
6331#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
6332 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
6333 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
6334 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6335
6336
6337
6338/*
6339 * Hashtable entry.
6340 */
6341struct vfs_drt_hashentry {
6342 u_int64_t dhe_control;
6343/*
6344* dhe_bitvector was originally declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
6345* DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE).
6346* Since PAGE_SIZE is only known at boot time, we instead:
6347* - define MAX_DRT_BITVECTOR_PAGES for the smallest supported page size (4K)
6348* - declare the dhe_bitvector array at the largest possible length
6349*/
6350#define MAX_DRT_BITVECTOR_PAGES ((1024 * 1024) / (4 * 1024))
6351 u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32];
6352};
6353
6354/*
6355 * Dirty Region Tracking structure.
6356 *
6357 * The hashtable is allocated entirely inside the DRT structure.
6358 *
6359 * The hash is a simple circular prime modulus arrangement, the structure
6360 * is resized from small to large if it overflows.
6361 */
6362
6363struct vfs_drt_clustermap {
6364 u_int32_t scm_magic; /* sanity/detection */
6365#define DRT_SCM_MAGIC 0x12020003
6366 u_int32_t scm_modulus; /* current ring size */
6367 u_int32_t scm_buckets; /* number of occupied buckets */
6368 u_int32_t scm_lastclean; /* last entry we cleaned */
6369 u_int32_t scm_iskips; /* number of slot skips */
6370
6371 struct vfs_drt_hashentry scm_hashtable[0];
6372};
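/*
 * scm_hashtable[0] is the pre-C99 flexible-array idiom: the scm_modulus
 * entries live in the same kmem allocation, directly after the header.
 */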
6373
6374
6375#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
6376#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
6377
6378/*
6379 * Debugging codes and arguments.
6380 */
6381#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
6382#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
6383#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
6384#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
6385#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
6386 * dirty */
6388							/* DBG_FUNC_END arg1: 0, setcount (success) */
6389							/* 1 (clean, no map) */
6390							/* 2 (map alloc fail) */
6391							/* 3, resid (partial) */
6391#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
6392#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
6393 * lastclean, iskips */
6394
6395
6396static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
6397static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
6398static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
6399 u_int64_t offset, int *indexp);
6400static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
6401 u_int64_t offset,
6402 int *indexp,
6403 int recursed);
6404static kern_return_t vfs_drt_do_mark_pages(
6405 void **cmapp,
6406 u_int64_t offset,
6407 u_int length,
6408 u_int *setcountp,
6409 int dirty);
6410static void vfs_drt_trace(
6411 struct vfs_drt_clustermap *cmap,
6412 int code,
6413 int arg1,
6414 int arg2,
6415 int arg3,
6416 int arg4);
6417
6418
6419/*
6420 * Allocate and initialise a sparse cluster map.
6421 *
6422 * Will allocate a new map, resize or compact an existing map.
6423 *
6424 * XXX we should probably have at least one intermediate map size,
6425 * as the 1:16 ratio seems a bit drastic.
6426 */
6427static kern_return_t
6428vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
6429{
6430 struct vfs_drt_clustermap *cmap, *ocmap;
6431 kern_return_t kret;
6432 u_int64_t offset;
6433 u_int32_t i;
6434 int nsize, active_buckets, index, copycount;
6435
6436 ocmap = NULL;
6437 if (cmapp != NULL)
6438 ocmap = *cmapp;
6439
6440 /*
6441 * Decide on the size of the new map.
6442 */
6443 if (ocmap == NULL) {
6444 nsize = DRT_HASH_SMALL_MODULUS;
6445 } else {
6446 /* count the number of active buckets in the old map */
6447 active_buckets = 0;
6448 for (i = 0; i < ocmap->scm_modulus; i++) {
6449 if (!DRT_HASH_VACANT(ocmap, i) &&
6450 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
6451 active_buckets++;
6452 }
6453 /*
6454 * If we're currently using the small allocation, check to
6455 * see whether we should grow to the large one.
6456 */
6457 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
6458 /*
6459 * If the ring is nearly full and we are allowed to
6460 * use the large modulus, upgrade.
6461 */
6462 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
6463 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
6464 nsize = DRT_HASH_LARGE_MODULUS;
6465 } else {
6466 nsize = DRT_HASH_SMALL_MODULUS;
6467 }
6468 } else {
6469 /* already using the large modulus */
6470 nsize = DRT_HASH_LARGE_MODULUS;
6471 /*
6472 * If the ring is completely full, there's
6473 * nothing useful for us to do. Behave as
6474 * though we had compacted into the new
6475 * array and return.
6476 */
6477 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
6478 return(KERN_SUCCESS);
6479 }
6480 }
6481
6482 /*
6483 * Allocate and initialise the new map.
6484 */
6485
6486 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
6487 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
6488 if (kret != KERN_SUCCESS)
6489 return(kret);
6490 cmap->scm_magic = DRT_SCM_MAGIC;
6491 cmap->scm_modulus = nsize;
6492 cmap->scm_buckets = 0;
6493 cmap->scm_lastclean = 0;
6494 cmap->scm_iskips = 0;
6495 for (i = 0; i < cmap->scm_modulus; i++) {
6496 DRT_HASH_CLEAR(cmap, i);
6497 DRT_HASH_VACATE(cmap, i);
6498 DRT_BITVECTOR_CLEAR(cmap, i);
6499 }
6500
6501 /*
6502 * If there's an old map, re-hash entries from it into the new map.
6503 */
6504 copycount = 0;
6505 if (ocmap != NULL) {
6506 for (i = 0; i < ocmap->scm_modulus; i++) {
6507 /* skip empty buckets */
6508 if (DRT_HASH_VACANT(ocmap, i) ||
6509 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
6510 continue;
6511 /* get new index */
6512 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
6513 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
6514 if (kret != KERN_SUCCESS) {
6515 /* XXX need to bail out gracefully here */
6516 panic("vfs_drt: new cluster map mysteriously too small");
6517 index = 0;
6518 }
6519 /* copy */
6520 DRT_HASH_COPY(ocmap, i, cmap, index);
6521 copycount++;
6522 }
6523 }
6524
6525 /* log what we've done */
6526 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
6527
6528 /*
6529 * It's important to ensure that *cmapp always points to
6530 * a valid map, so we must overwrite it before freeing
6531 * the old map.
6532 */
6533 *cmapp = cmap;
6534 if (ocmap != NULL) {
6535 /* emit stats into trace buffer */
6536 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
6537 ocmap->scm_modulus,
6538 ocmap->scm_buckets,
6539 ocmap->scm_lastclean,
6540 ocmap->scm_iskips);
6541
6542 vfs_drt_free_map(ocmap);
6543 }
6544 return(KERN_SUCCESS);
6545}
6546
6547
6548/*
6549 * Free a sparse cluster map.
6550 */
6551static kern_return_t
6552vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
6553{
6554 kmem_free(kernel_map, (vm_offset_t)cmap,
6555 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
6556 return(KERN_SUCCESS);
6557}
6558
6559
6560/*
6561 * Find the hashtable slot currently occupied by an entry for the supplied offset.
6562 */
6563static kern_return_t
6564vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
6565{
6566 int index;
6567 u_int32_t i;
6568
6569 offset = DRT_ALIGN_ADDRESS(offset);
6570 index = DRT_HASH(cmap, offset);
6571
6572 /* traverse the hashtable */
6573 for (i = 0; i < cmap->scm_modulus; i++) {
6574
6575 /*
6576 * If the slot is vacant, we can stop.
6577 */
6578 if (DRT_HASH_VACANT(cmap, index))
6579 break;
6580
6581 /*
6582 * If the address matches our offset, we have success.
6583 */
6584 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
6585 *indexp = index;
6586 return(KERN_SUCCESS);
6587 }
6588
6589 /*
6590 * Move to the next slot, try again.
6591 */
6592 index = DRT_HASH_NEXT(cmap, index);
6593 }
6594 /*
6595 * It's not there.
6596 */
6597 return(KERN_FAILURE);
6598}
6599
6600/*
6601 * Find the hashtable slot for the supplied offset. If we haven't allocated
6602 * one yet, allocate one and populate the address field. Note that the new
6603 * entry starts with a zero page count and is thus still technically free, so
6604 * when we are called to clean pages, the slot will simply remain free.
6605 */
6606static kern_return_t
6607vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
6608{
6609 struct vfs_drt_clustermap *cmap;
6610 kern_return_t kret;
6611 u_int32_t index;
6612 u_int32_t i;
6613
6614 cmap = *cmapp;
6615
6616 /* look for an existing entry */
6617 kret = vfs_drt_search_index(cmap, offset, indexp);
6618 if (kret == KERN_SUCCESS)
6619 return(kret);
6620
6621 /* need to allocate an entry */
6622 offset = DRT_ALIGN_ADDRESS(offset);
6623 index = DRT_HASH(cmap, offset);
6624
6625 /* scan from the index forwards looking for a vacant slot */
6626 for (i = 0; i < cmap->scm_modulus; i++) {
6627 /* slot vacant? */
6628 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
6629 cmap->scm_buckets++;
6630 if (index < cmap->scm_lastclean)
6631 cmap->scm_lastclean = index;
6632 DRT_HASH_SET_ADDRESS(cmap, index, offset);
6633 DRT_HASH_SET_COUNT(cmap, index, 0);
6634 DRT_BITVECTOR_CLEAR(cmap, index);
6635 *indexp = index;
6636 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
6637 return(KERN_SUCCESS);
6638 }
6639 cmap->scm_iskips += i;
6640 index = DRT_HASH_NEXT(cmap, index);
6641 }
6642
6643 /*
6644 * We haven't found a vacant slot, so the map is full. If we're not
6645 * already recursed, try reallocating/compacting it.
6646 */
6647 if (recursed)
6648 return(KERN_FAILURE);
6649 kret = vfs_drt_alloc_map(cmapp);
6650 if (kret == KERN_SUCCESS) {
6651 /* now try to insert again */
6652 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
6653 }
6654 return(kret);
6655}
6656
6657/*
6658 * Implementation of set dirty/clean.
6659 *
6660 * In the 'clean' case, not finding a map is OK.
6661 */
6662static kern_return_t
6663vfs_drt_do_mark_pages(
6664 void **private,
6665 u_int64_t offset,
6666 u_int length,
6667 u_int *setcountp,
6668 int dirty)
6669{
6670 struct vfs_drt_clustermap *cmap, **cmapp;
6671 kern_return_t kret;
6672 int i, index, pgoff, pgcount, setcount, ecount;
6673
6674 cmapp = (struct vfs_drt_clustermap **)private;
6675 cmap = *cmapp;
6676
6677 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
6678
6679 if (setcountp != NULL)
6680 *setcountp = 0;
6681
6682 /* allocate a cluster map if we don't already have one */
6683 if (cmap == NULL) {
6684 /* no cluster map, nothing to clean */
6685 if (!dirty) {
6686 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
6687 return(KERN_SUCCESS);
6688 }
6689 kret = vfs_drt_alloc_map(cmapp);
6690 if (kret != KERN_SUCCESS) {
6691 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
6692 return(kret);
6693 }
6694 }
6695 setcount = 0;
6696
6697 /*
6698 * Iterate over the length of the region.
6699 */
6700 while (length > 0) {
6701 /*
6702 * Get the hashtable index for this offset.
6703 *
6704 * XXX this will add blank entries if we are clearing a range
6705 * that hasn't been dirtied.
6706 */
6707 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
6708 cmap = *cmapp; /* may have changed! */
6709 /* this may be a partial-success return */
6710 if (kret != KERN_SUCCESS) {
6711 if (setcountp != NULL)
6712 *setcountp = setcount;
6713 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
6714
6715 return(kret);
6716 }
6717
6718 /*
6719 * Work out how many pages we're modifying in this
6720 * hashtable entry.
6721 */
6722 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
6723 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
6724
6725 /*
6726 * Iterate over pages, dirty/clearing as we go.
6727 */
6728 ecount = DRT_HASH_GET_COUNT(cmap, index);
6729 for (i = 0; i < pgcount; i++) {
6730 if (dirty) {
6731 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
6732 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
6733 ecount++;
6734 setcount++;
6735 }
6736 } else {
6737 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
6738 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
6739 ecount--;
6740 setcount++;
6741 }
6742 }
6743 }
6744 DRT_HASH_SET_COUNT(cmap, index, ecount);
6745
6746 offset += pgcount * PAGE_SIZE;
6747 length -= pgcount * PAGE_SIZE;
6748 }
6749 if (setcountp != NULL)
6750 *setcountp = setcount;
6751
6752 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
6753
6754 return(KERN_SUCCESS);
6755}
6756
6757/*
6758 * Mark a set of pages as dirty/clean.
6759 *
6760 * This is a public interface.
6761 *
6762 * cmapp
6763 * Pointer to storage suitable for holding a pointer. Note that
6764 * this must either be NULL or a value set by this function.
6765 *
6769 * offset
6770 * Offset of the first page to be marked as dirty, in bytes. Must be
6771 * page-aligned.
6772 *
6773 * length
6774 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
6775 *
6776 * setcountp
6777 * Number of pages newly marked dirty by this call (optional).
6778 *
6779 * Returns KERN_SUCCESS if all the pages were successfully marked.
6780 */
6781static kern_return_t
6782vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
6783{
6784 /* XXX size unused, drop from interface */
6785 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
6786}
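/*
 * Illustrative sketch (not compiled in): the mark/drain cycle that the
 * sparse cluster code above drives.  The function is hypothetical; the
 * vfs_drt_* calls and their contracts are the real ones documented here.
 */
#if 0
static void
example_drt_cycle(void)
{
	void *scmap = NULL;		/* managed entirely by vfs_drt_* */
	off_t offset;
	u_int length, new_dirty;

	/* mark two pages dirty at offset 0 (allocates the map on first use) */
	(void) vfs_drt_mark_pages(&scmap, 0, 2 * PAGE_SIZE, &new_dirty);

	/* drain: each call returns one run of dirty pages and marks it clean */
	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* push [offset, offset + length) with cluster_push_now()... */
	}
	/* the final failing get_cluster frees the map and NULLs scmap */
}
#endif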
6787
6788#if 0
6789static kern_return_t
6790vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
6791{
6792 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
6793}
6794#endif
6795
6796/*
6797 * Get a cluster of dirty pages.
6798 *
6799 * This is a public interface.
6800 *
6801 * cmapp
6802 * Pointer to storage managed by drt_mark_pages. Note that this must
6803 * be NULL or a value set by drt_mark_pages.
6804 *
6805 * offsetp
6806 * Returns the byte offset into the file of the first page in the cluster.
6807 *
6808 * lengthp
6809 * Returns the length in bytes of the cluster of dirty pages.
6810 *
6811 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
6812 * are no dirty pages meeting the minimum size criteria. Private storage will
6813 * be released if there are no more dirty pages left in the map.
6814 *
6815 */
6816static kern_return_t
6817vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
6818{
6819 struct vfs_drt_clustermap *cmap;
6820 u_int64_t offset;
6821 u_int length;
6822 u_int32_t j;
6823 int index, i, fs, ls;
6824
6825 /* sanity */
6826 if ((cmapp == NULL) || (*cmapp == NULL))
6827 return(KERN_FAILURE);
6828 cmap = *cmapp;
6829
6830 /* walk the hashtable */
6831 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
6832 index = DRT_HASH(cmap, offset);
6833
6834 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
6835 continue;
6836
6837 /* scan the bitfield for a string of bits */
6838 fs = -1;
6839
6840 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
6841 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
6842 fs = i;
6843 break;
6844 }
6845 }
6846 if (fs == -1) {
6847 /* didn't find any bits set */
6848 panic("vfs_drt: entry summary count > 0 but no bits set in map");
6849 }
6850 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
6851 if (!DRT_HASH_TEST_BIT(cmap, index, i))
6852 break;
6853 }
6854
6855 /* compute offset and length, mark pages clean */
6856 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
6857 length = ls * PAGE_SIZE;
6858 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
6859 cmap->scm_lastclean = index;
6860
6861 /* return successful */
6862 *offsetp = (off_t)offset;
6863 *lengthp = length;
6864
6865 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
6866 return(KERN_SUCCESS);
6867 }
6868 /*
6869	 * We didn't find anything... the hashtable is empty:
6870	 * emit stats into the trace buffer and
6871	 * then free it
6872 */
6873 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
6874 cmap->scm_modulus,
6875 cmap->scm_buckets,
6876 cmap->scm_lastclean,
6877 cmap->scm_iskips);
6878
6879 vfs_drt_free_map(cmap);
6880 *cmapp = NULL;
6881
6882 return(KERN_FAILURE);
6883}
6884
6885
6886static kern_return_t
6887vfs_drt_control(void **cmapp, int op_type)
6888{
6889 struct vfs_drt_clustermap *cmap;
6890
6891 /* sanity */
6892 if ((cmapp == NULL) || (*cmapp == NULL))
6893 return(KERN_FAILURE);
6894 cmap = *cmapp;
6895
6896 switch (op_type) {
6897 case 0:
6898 /* emit stats into trace buffer */
6899 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
6900 cmap->scm_modulus,
6901 cmap->scm_buckets,
6902 cmap->scm_lastclean,
6903 cmap->scm_iskips);
6904
6905 vfs_drt_free_map(cmap);
6906 *cmapp = NULL;
6907 break;
6908
6909 case 1:
6910 cmap->scm_lastclean = 0;
6911 break;
6912 }
6913 return(KERN_SUCCESS);
6914}
6915
6916
6917
6918/*
6919 * Emit a summary of the state of the clustermap into the trace buffer
6920 * along with some caller-provided data.
6921 */
6922#if KDEBUG
6923static void
6924vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
6925{
6926 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
6927}
6928#else
6929static void
6930vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
6931 __unused int arg1, __unused int arg2, __unused int arg3,
6932 __unused int arg4)
6933{
6934}
6935#endif
6936
6937#if 0
6938/*
6939 * Perform basic sanity check on the hash entry summary count
6940 * vs. the actual bits set in the entry.
6941 */
6942static void
6943vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
6944{
6945 int index, i;
6946 int bits_on;
6947
6948 for (index = 0; index < cmap->scm_modulus; index++) {
6949 if (DRT_HASH_VACANT(cmap, index))
6950 continue;
6951
6952 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
6953 if (DRT_HASH_TEST_BIT(cmap, index, i))
6954 bits_on++;
6955 }
6956 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
6957 panic("bits_on = %d, index = %d\n", bits_on, index);
6958 }
6959}
6960#endif