git.saurik.com Git - apple/xnu.git/blame_incremental

0 / 6975 ( 0%)

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2014 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1993
	31	* The Regents of the University of California. All rights reserved.
	32	*
	33	* Redistribution and use in source and binary forms, with or without
	34	* modification, are permitted provided that the following conditions
	35	* are met:
	36	* 1. Redistributions of source code must retain the above copyright
	37	* notice, this list of conditions and the following disclaimer.
	38	* 2. Redistributions in binary form must reproduce the above copyright
	39	* notice, this list of conditions and the following disclaimer in the
	40	* documentation and/or other materials provided with the distribution.
	41	* 3. All advertising materials mentioning features or use of this software
	42	* must display the following acknowledgement:
	43	* This product includes software developed by the University of
	44	* California, Berkeley and its contributors.
	45	* 4. Neither the name of the University nor the names of its contributors
	46	* may be used to endorse or promote products derived from this software
	47	* without specific prior written permission.
	48	*
	49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	59	* SUCH DAMAGE.
	60	*
	61	* @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
	62	*/
	63
	64	#include <sys/param.h>
	65	#include <sys/proc_internal.h>
	66	#include <sys/buf_internal.h>
	67	#include <sys/mount_internal.h>
	68	#include <sys/vnode_internal.h>
	69	#include <sys/trace.h>
	70	#include <sys/malloc.h>
	71	#include <sys/time.h>
	72	#include <sys/kernel.h>
	73	#include <sys/resourcevar.h>
	74	#include <miscfs/specfs/specdev.h>
	75	#include <sys/uio_internal.h>
	76	#include <libkern/libkern.h>
	77	#include <machine/machine_routines.h>
	78
	79	#include <sys/ubc_internal.h>
	80	#include <vm/vnode_pager.h>
	81
	82	#include <mach/mach_types.h>
	83	#include <mach/memory_object_types.h>
	84	#include <mach/vm_map.h>
	85	#include <mach/upl.h>
	86	#include <kern/task.h>
	87	#include <kern/policy_internal.h>
	88
	89	#include <vm/vm_kern.h>
	90	#include <vm/vm_map.h>
	91	#include <vm/vm_pageout.h>
	92	#include <vm/vm_fault.h>
	93
	94	#include <sys/kdebug.h>
	95	#include <libkern/OSAtomic.h>
	96
	97	#include <sys/sdt.h>
	98
	99	#include <stdbool.h>
	100
	101	#include <vfs/vfs_disk_conditioner.h>
	102
	103	#if 0
	104	#undef KERNEL_DEBUG
	105	#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
	106	#endif
	107
	108
	109	#define CL_READ 0x01
	110	#define CL_WRITE 0x02
	111	#define CL_ASYNC 0x04
	112	#define CL_COMMIT 0x08
	113	#define CL_PAGEOUT 0x10
	114	#define CL_AGE 0x20
	115	#define CL_NOZERO 0x40
	116	#define CL_PAGEIN 0x80
	117	#define CL_DEV_MEMORY 0x100
	118	#define CL_PRESERVE 0x200
	119	#define CL_THROTTLE 0x400
	120	#define CL_KEEPCACHED 0x800
	121	#define CL_DIRECT_IO 0x1000
	122	#define CL_PASSIVE 0x2000
	123	#define CL_IOSTREAMING 0x4000
	124	#define CL_CLOSE 0x8000
	125	#define CL_ENCRYPTED 0x10000
	126	#define CL_RAW_ENCRYPTED 0x20000
	127	#define CL_NOCACHE 0x40000
	128
	129	#define MAX_VECTOR_UPL_ELEMENTS 8
	130	#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
	131
	132	#define CLUSTER_IO_WAITING ((buf_t)1)
	133
	134	extern upl_t vector_upl_create(vm_offset_t);
	135	extern boolean_t vector_upl_is_valid(upl_t);
	136	extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t);
	137	extern void vector_upl_set_pagelist(upl_t);
	138	extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
	139
	140	struct clios {
	141	lck_mtx_t io_mtxp;
	142	u_int io_completed; /* amount of io that has currently completed */
	143	u_int io_issued; /* amount of io that was successfully issued */
	144	int io_error; /* error code of first error encountered */
	145	int io_wanted; /* someone is sleeping waiting for a change in state */
	146	};
	147
	148	struct cl_direct_read_lock {
	149	LIST_ENTRY(cl_direct_read_lock) chain;
	150	int32_t ref_count;
	151	vnode_t vp;
	152	lck_rw_t rw_lock;
	153	};
	154
	155	#define CL_DIRECT_READ_LOCK_BUCKETS 61
	156
	157	static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
	158	cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
	159
	160	static lck_spin_t cl_direct_read_spin_lock;
	161
	162	static lck_grp_t *cl_mtx_grp;
	163	static lck_attr_t *cl_mtx_attr;
	164	static lck_grp_attr_t *cl_mtx_grp_attr;
	165	static lck_mtx_t *cl_transaction_mtxp;
	166
	167	#define IO_UNKNOWN 0
	168	#define IO_DIRECT 1
	169	#define IO_CONTIG 2
	170	#define IO_COPY 3
	171
	172	#define PUSH_DELAY 0x01
	173	#define PUSH_ALL 0x02
	174	#define PUSH_SYNC 0x04
	175
	176
	177	static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
	178	static void cluster_wait_IO(buf_t cbp_head, int async);
	179	static void cluster_complete_transaction(buf_t cbp_head, void callback_arg, int *retval, int flags, int needwait);
	180
	181	static int cluster_io_type(struct uio uio, int io_type, u_int32_t *io_length, u_int32_t min_length);
	182
	183	static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	184	int flags, buf_t real_bp, struct clios iostate, int ()(buf_t, void ), void callback_arg);
	185	static int cluster_iodone(buf_t bp, void *callback_arg);
	186	static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
	187	static int cluster_is_throttled(vnode_t vp);
	188
	189	static void cluster_iostate_wait(struct clios iostate, u_int target, const char wait_name);
	190
	191	static void cluster_syncup(vnode_t vp, off_t newEOF, int ()(buf_t, void ), void *callback_arg, int flags);
	192
	193	static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
	194	static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio uio, int io_resid, int mark_dirty, int take_reference);
	195
	196	static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
	197	int ()(buf_t, void ), void *callback_arg);
	198	static int cluster_read_direct(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	199	int flags, int ()(buf_t, void ), void *callback_arg);
	200	static int cluster_read_contig(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	201	int ()(buf_t, void ), void *callback_arg, int flags);
	202
	203	static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
	204	off_t headOff, off_t tailOff, int flags, int ()(buf_t, void ), void *callback_arg);
	205	static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
	206	int write_type, u_int32_t write_length, int flags, int ()(buf_t, void ), void *callback_arg);
	207	static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
	208	int write_type, u_int32_t write_length, int ()(buf_t, void ), void *callback_arg, int bflag);
	209
	210	static int cluster_align_phys_io(vnode_t vp, struct uio uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int ()(buf_t, void ), void callback_arg);
	211
	212	static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (callback)(buf_t, void ), void *callback_arg, int bflag);
	213	static void cluster_read_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead ra, int (callback)(buf_t, void ), void *callback_arg, int bflag);
	214
	215	static int cluster_push_now(vnode_t vp, struct cl_extent , off_t EOF, int flags, int ()(buf_t, void ), void callback_arg);
	216
	217	static int cluster_try_push(struct cl_writebehind , vnode_t vp, off_t EOF, int push_flag, int flags, int ()(buf_t, void ), void callback_arg, int *err);
	218
	219	static void sparse_cluster_switch(struct cl_writebehind , vnode_t vp, off_t EOF, int ()(buf_t, void ), void callback_arg);
	220	static int sparse_cluster_push(void *cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int ()(buf_t, void ), void callback_arg);
	221	static void sparse_cluster_add(void *cmapp, vnode_t vp, struct cl_extent , off_t EOF, int ()(buf_t, void ), void *callback_arg);
	222
	223	static kern_return_t vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, u_int setcountp);
	224	static kern_return_t vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp);
	225	static kern_return_t vfs_drt_control(void **cmapp, int op_type);
	226
	227
	228	/*
	229	* For throttled IO to check whether
	230	* a block is cached by the boot cache
	231	* and thus it can avoid delaying the IO.
	232	*
	233	* bootcache_contains_block is initially
	234	* NULL. The BootCache will set it while
	235	* the cache is active and clear it when
	236	* the cache is jettisoned.
	237	*
	238	* Returns 0 if the block is not
	239	* contained in the cache, 1 if it is
	240	* contained.
	241	*
	242	* The function pointer remains valid
	243	* after the cache has been evicted even
	244	* if bootcache_contains_block has been
	245	* cleared.
	246	*
	247	* See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
	248	*/
	249	int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
	250
	251
	252	/*
	253	* limit the internal I/O size so that we
	254	* can represent it in a 32 bit int
	255	*/
	256	#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
	257	#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
	258	#define MAX_VECTS 16
	259	/*
	260	* The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
	261	* allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
	262	* we have not historically allowed the write to bypass the UBC.
	263	*/
	264	#define MIN_DIRECT_WRITE_SIZE (16384)
	265
	266	#define WRITE_THROTTLE 6
	267	#define WRITE_THROTTLE_SSD 2
	268	#define WRITE_BEHIND 1
	269	#define WRITE_BEHIND_SSD 1
	270
	271	#if CONFIG_EMBEDDED
	272	#define PREFETCH 1
	273	#define PREFETCH_SSD 1
	274	uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a specluative read-ahead */
	275	uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead */
	276	#else
	277	#define PREFETCH 3
	278	#define PREFETCH_SSD 2
	279	uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */
	280	uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/
	281	#endif
	282
	283
	284	#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
	285	#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
	286	#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
	287
	288	int speculative_reads_disabled = 0;
	289
	290	/*
	291	* throttle the number of async writes that
	292	* can be outstanding on a single vnode
	293	* before we issue a synchronous write
	294	*/
	295	#define THROTTLE_MAXCNT 0
	296
	297	uint32_t throttle_max_iosize = (128 * 1024);
	298
	299	#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
	300
	301	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
	302
	303
	304	void
	305	cluster_init(void) {
	306	/*
	307	* allocate lock group attribute and group
	308	*/
	309	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	310	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
	311
	312	/*
	313	* allocate the lock attribute
	314	*/
	315	cl_mtx_attr = lck_attr_alloc_init();
	316
	317	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
	318
	319	if (cl_transaction_mtxp == NULL)
	320	panic("cluster_init: failed to allocate cl_transaction_mtxp");
	321
	322	lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);
	323
	324	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
	325	LIST_INIT(&cl_direct_read_locks[i]);
	326	}
	327
	328
	329	uint32_t
	330	cluster_max_io_size(mount_t mp, int type)
	331	{
	332	uint32_t max_io_size;
	333	uint32_t segcnt;
	334	uint32_t maxcnt;
	335
	336	switch(type) {
	337
	338	case CL_READ:
	339	segcnt = mp->mnt_segreadcnt;
	340	maxcnt = mp->mnt_maxreadcnt;
	341	break;
	342	case CL_WRITE:
	343	segcnt = mp->mnt_segwritecnt;
	344	maxcnt = mp->mnt_maxwritecnt;
	345	break;
	346	default:
	347	segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
	348	maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
	349	break;
	350	}
	351	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
	352	/*
	353	* don't allow a size beyond the max UPL size we can create
	354	*/
	355	segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	356	}
	357	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
	358
	359	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
	360	/*
	361	* don't allow a size smaller than the old fixed limit
	362	*/
	363	max_io_size = MAX_UPL_TRANSFER_BYTES;
	364	} else {
	365	/*
	366	* make sure the size specified is a multiple of PAGE_SIZE
	367	*/
	368	max_io_size &= ~PAGE_MASK;
	369	}
	370	return (max_io_size);
	371	}
	372
	373
	374
	375
	376	#define CLW_ALLOCATE 0x01
	377	#define CLW_RETURNLOCKED 0x02
	378	#define CLW_IONOCACHE 0x04
	379	#define CLW_IOPASSIVE 0x08
	380
	381	/*
	382	* if the read ahead context doesn't yet exist,
	383	* allocate and initialize it...
	384	* the vnode lock serializes multiple callers
	385	* during the actual assignment... first one
	386	* to grab the lock wins... the other callers
	387	* will release the now unnecessary storage
	388	*
	389	* once the context is present, try to grab (but don't block on)
	390	* the lock associated with it... if someone
	391	* else currently owns it, then the read
	392	* will run without read-ahead. this allows
	393	* multiple readers to run in parallel and
	394	* since there's only 1 read ahead context,
	395	* there's no real loss in only allowing 1
	396	* reader to have read-ahead enabled.
	397	*/
	398	static struct cl_readahead *
	399	cluster_get_rap(vnode_t vp)
	400	{
	401	struct ubc_info *ubc;
	402	struct cl_readahead *rap;
	403
	404	ubc = vp->v_ubcinfo;
	405
	406	if ((rap = ubc->cl_rahead) == NULL) {
	407	MALLOC_ZONE(rap, struct cl_readahead , sizeof rap, M_CLRDAHEAD, M_WAITOK);
	408
	409	bzero(rap, sizeof *rap);
	410	rap->cl_lastr = -1;
	411	lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
	412
	413	vnode_lock(vp);
	414
	415	if (ubc->cl_rahead == NULL)
	416	ubc->cl_rahead = rap;
	417	else {
	418	lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
	419	FREE_ZONE((void )rap, sizeof rap, M_CLRDAHEAD);
	420	rap = ubc->cl_rahead;
	421	}
	422	vnode_unlock(vp);
	423	}
	424	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
	425	return(rap);
	426
	427	return ((struct cl_readahead *)NULL);
	428	}
	429
	430
	431	/*
	432	* if the write behind context doesn't yet exist,
	433	* and CLW_ALLOCATE is specified, allocate and initialize it...
	434	* the vnode lock serializes multiple callers
	435	* during the actual assignment... first one
	436	* to grab the lock wins... the other callers
	437	* will release the now unnecessary storage
	438	*
	439	* if CLW_RETURNLOCKED is set, grab (blocking if necessary)
	440	* the lock associated with the write behind context before
	441	* returning
	442	*/
	443
	444	static struct cl_writebehind *
	445	cluster_get_wbp(vnode_t vp, int flags)
	446	{
	447	struct ubc_info *ubc;
	448	struct cl_writebehind *wbp;
	449
	450	ubc = vp->v_ubcinfo;
	451
	452	if ((wbp = ubc->cl_wbehind) == NULL) {
	453
	454	if ( !(flags & CLW_ALLOCATE))
	455	return ((struct cl_writebehind *)NULL);
	456
	457	MALLOC_ZONE(wbp, struct cl_writebehind , sizeof wbp, M_CLWRBEHIND, M_WAITOK);
	458
	459	bzero(wbp, sizeof *wbp);
	460	lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
	461
	462	vnode_lock(vp);
	463
	464	if (ubc->cl_wbehind == NULL)
	465	ubc->cl_wbehind = wbp;
	466	else {
	467	lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
	468	FREE_ZONE((void )wbp, sizeof wbp, M_CLWRBEHIND);
	469	wbp = ubc->cl_wbehind;
	470	}
	471	vnode_unlock(vp);
	472	}
	473	if (flags & CLW_RETURNLOCKED)
	474	lck_mtx_lock(&wbp->cl_lockw);
	475
	476	return (wbp);
	477	}
	478
	479
	480	static void
	481	cluster_syncup(vnode_t vp, off_t newEOF, int (callback)(buf_t, void ), void *callback_arg, int flags)
	482	{
	483	struct cl_writebehind *wbp;
	484
	485	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
	486
	487	if (wbp->cl_number) {
	488	lck_mtx_lock(&wbp->cl_lockw);
	489
	490	cluster_try_push(wbp, vp, newEOF, PUSH_ALL \| flags, 0, callback, callback_arg, NULL);
	491
	492	lck_mtx_unlock(&wbp->cl_lockw);
	493	}
	494	}
	495	}
	496
	497
	498	static int
	499	cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
	500	{
	501	daddr64_t blkno;
	502	size_t io_size;
	503	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
	504
	505	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
	506	if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ \| VNODE_BLOCKMAP_NO_TRACK, NULL))
	507	return(0);
	508
	509	if (io_size == 0)
	510	return (0);
	511
	512	if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
	513	return(1);
	514	}
	515	return(0);
	516	}
	517
	518
	519	static int
	520	cluster_is_throttled(vnode_t vp)
	521	{
	522	return (throttle_io_will_be_throttled(-1, vp->v_mount));
	523	}
	524
	525
	526	static void
	527	cluster_iostate_wait(struct clios iostate, u_int target, const char wait_name)
	528	{
	529
	530	lck_mtx_lock(&iostate->io_mtxp);
	531
	532	while ((iostate->io_issued - iostate->io_completed) > target) {
	533
	534	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) \| DBG_FUNC_START,
	535	iostate->io_issued, iostate->io_completed, target, 0, 0);
	536
	537	iostate->io_wanted = 1;
	538	msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
	539
	540	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) \| DBG_FUNC_END,
	541	iostate->io_issued, iostate->io_completed, target, 0, 0);
	542	}
	543	lck_mtx_unlock(&iostate->io_mtxp);
	544	}
	545
	546	static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
	547	upl_offset_t upl_offset, upl_size_t size)
	548	{
	549	if (!size)
	550	return;
	551
	552	upl_t associated_upl = upl_associated_upl(upl);
	553
	554	if (!associated_upl)
	555	return;
	556
	557	#if 0
	558	printf("1: %d %d\n", upl_offset, upl_offset + size);
	559	#endif
	560
	561	/*
	562	* The associated UPL is page aligned to file offsets whereas the
	563	* UPL it's attached to has different alignment requirements. The
	564	* upl_offset that we have refers to @upl. The code that follows
	565	* has to deal with the first and last pages in this transaction
	566	* which might straddle pages in the associated UPL. To keep
	567	* track of these pages, we use the mark bits: if the mark bit is
	568	* set, we know another transaction has completed its part of that
	569	* page and so we can unlock that page here.
	570	*
	571	* The following illustrates what we have to deal with:
	572	*
	573	* MEM u <------------ 1 PAGE ------------> e
	574	* +-------------+----------------------+-----------------
	575	* \| \|######################\|#################
	576	* +-------------+----------------------+-----------------
	577	* FILE \| <--- a ---> o <------------ 1 PAGE ------------>
	578	*
	579	* So here we show a write to offset @o. The data that is to be
	580	* written is in a buffer that is not page aligned; it has offset
	581	* @a in the page. The upl that carries the data starts in memory
	582	* at @u. The associated upl starts in the file at offset @o. A
	583	* transaction will always end on a page boundary (like @e above)
	584	* except for the very last transaction in the group. We cannot
	585	* unlock the page at @o in the associated upl until both the
	586	* transaction ending at @e and the following transaction (that
	587	* starts at @e) has completed.
	588	*/
	589
	590	/*
	591	* We record whether or not the two UPLs are aligned as the mark
	592	* bit in the first page of @upl.
	593	*/
	594	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	595	bool is_unaligned = upl_page_get_mark(pl, 0);
	596
	597	if (is_unaligned) {
	598	upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
	599
	600	upl_offset_t upl_end = upl_offset + size;
	601	assert(upl_end >= PAGE_SIZE);
	602
	603	upl_size_t assoc_upl_size = upl_get_size(associated_upl);
	604
	605	/*
	606	* In the very first transaction in the group, upl_offset will
	607	* not be page aligned, but after that it will be and in that
	608	* case we want the preceding page in the associated UPL hence
	609	* the minus one.
	610	*/
	611	assert(upl_offset);
	612	if (upl_offset)
	613	upl_offset = trunc_page_32(upl_offset - 1);
	614
	615	lck_mtx_lock_spin(&iostate->io_mtxp);
	616
	617	// Look at the first page...
	618	if (upl_offset
	619	&& !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
	620	/*
	621	* The first page isn't marked so let another transaction
	622	* completion handle it.
	623	*/
	624	upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
	625	upl_offset += PAGE_SIZE;
	626	}
	627
	628	// And now the last page...
	629
	630	/*
	631	* This needs to be > rather than >= because if it's equal, it
	632	* means there's another transaction that is sharing the last
	633	* page.
	634	*/
	635	if (upl_end > assoc_upl_size)
	636	upl_end = assoc_upl_size;
	637	else {
	638	upl_end = trunc_page_32(upl_end);
	639	const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
	640
	641	if (!upl_page_get_mark(assoc_pl, last_pg)) {
	642	/*
	643	* The last page isn't marked so mark the page and let another
	644	* transaction completion handle it.
	645	*/
	646	upl_page_set_mark(assoc_pl, last_pg, true);
	647	upl_end -= PAGE_SIZE;
	648	}
	649	}
	650
	651	lck_mtx_unlock(&iostate->io_mtxp);
	652
	653	#if 0
	654	printf("2: %d %d\n", upl_offset, upl_end);
	655	#endif
	656
	657	if (upl_end <= upl_offset)
	658	return;
	659
	660	size = upl_end - upl_offset;
	661	} else {
	662	assert(!(upl_offset & PAGE_MASK));
	663	assert(!(size & PAGE_MASK));
	664	}
	665
	666	boolean_t empty;
	667
	668	/*
	669	* We can unlock these pages now and as this is for a
	670	* direct/uncached write, we want to dump the pages too.
	671	*/
	672	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
	673	UPL_ABORT_DUMP_PAGES, &empty);
	674
	675	assert(!kr);
	676
	677	if (!kr && empty) {
	678	upl_set_associated_upl(upl, NULL);
	679	upl_deallocate(associated_upl);
	680	}
	681	}
	682
	683	static int
	684	cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
	685	{
	686	int upl_abort_code = 0;
	687	int page_in = 0;
	688	int page_out = 0;
	689
	690	if ((io_flags & (B_PHYS \| B_CACHE)) == (B_PHYS \| B_CACHE))
	691	/*
	692	* direct write of any flavor, or a direct read that wasn't aligned
	693	*/
	694	ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	695	else {
	696	if (io_flags & B_PAGEIO) {
	697	if (io_flags & B_READ)
	698	page_in = 1;
	699	else
	700	page_out = 1;
	701	}
	702	if (io_flags & B_CACHE)
	703	/*
	704	* leave pages in the cache unchanged on error
	705	*/
	706	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	707	else if (page_out && ((error != ENXIO) \|\| vnode_isswap(vp)))
	708	/*
	709	* transient error... leave pages unchanged
	710	*/
	711	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	712	else if (page_in)
	713	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR;
	714	else
	715	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	716
	717	ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	718	}
	719	return (upl_abort_code);
	720	}
	721
	722
	723	static int
	724	cluster_iodone(buf_t bp, void *callback_arg)
	725	{
	726	int b_flags;
	727	int error;
	728	int total_size;
	729	int total_resid;
	730	int upl_offset;
	731	int zero_offset;
	732	int pg_offset = 0;
	733	int commit_size = 0;
	734	int upl_flags = 0;
	735	int transaction_size = 0;
	736	upl_t upl;
	737	buf_t cbp;
	738	buf_t cbp_head;
	739	buf_t cbp_next;
	740	buf_t real_bp;
	741	vnode_t vp;
	742	struct clios *iostate;
	743	boolean_t transaction_complete = FALSE;
	744
	745	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
	746
	747	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_START,
	748	cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	749
	750	if (cbp_head->b_trans_next \|\| !(cbp_head->b_flags & B_EOT)) {
	751	lck_mtx_lock_spin(cl_transaction_mtxp);
	752
	753	bp->b_flags \|= B_TDONE;
	754
	755	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	756	/*
	757	* all I/O requests that are part of this transaction
	758	* have to complete before we can process it
	759	*/
	760	if ( !(cbp->b_flags & B_TDONE)) {
	761
	762	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	763	cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
	764
	765	lck_mtx_unlock(cl_transaction_mtxp);
	766
	767	return 0;
	768	}
	769
	770	if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
	771	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	772	cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
	773
	774	lck_mtx_unlock(cl_transaction_mtxp);
	775	wakeup(cbp);
	776
	777	return 0;
	778	}
	779
	780	if (cbp->b_flags & B_EOT)
	781	transaction_complete = TRUE;
	782	}
	783	lck_mtx_unlock(cl_transaction_mtxp);
	784
	785	if (transaction_complete == FALSE) {
	786	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	787	cbp_head, 0, 0, 0, 0);
	788	return 0;
	789	}
	790	}
	791	error = 0;
	792	total_size = 0;
	793	total_resid = 0;
	794
	795	cbp = cbp_head;
	796	vp = cbp->b_vp;
	797	upl_offset = cbp->b_uploffset;
	798	upl = cbp->b_upl;
	799	b_flags = cbp->b_flags;
	800	real_bp = cbp->b_real_bp;
	801	zero_offset= cbp->b_validend;
	802	iostate = (struct clios *)cbp->b_iostate;
	803
	804	if (real_bp)
	805	real_bp->b_dev = cbp->b_dev;
	806
	807	while (cbp) {
	808	if ((cbp->b_flags & B_ERROR) && error == 0)
	809	error = cbp->b_error;
	810
	811	total_resid += cbp->b_resid;
	812	total_size += cbp->b_bcount;
	813
	814	cbp_next = cbp->b_trans_next;
	815
	816	if (cbp_next == NULL)
	817	/*
	818	* compute the overall size of the transaction
	819	* in case we created one that has 'holes' in it
	820	* 'total_size' represents the amount of I/O we
	821	* did, not the span of the transaction w/r to the UPL
	822	*/
	823	transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
	824
	825	if (cbp != cbp_head)
	826	free_io_buf(cbp);
	827
	828	cbp = cbp_next;
	829	}
	830
	831	if (ISSET(b_flags, B_COMMIT_UPL)) {
	832	cluster_handle_associated_upl(iostate,
	833	cbp_head->b_upl,
	834	upl_offset,
	835	transaction_size);
	836	}
	837
	838	if (error == 0 && total_resid)
	839	error = EIO;
	840
	841	if (error == 0) {
	842	int (cliodone_func)(buf_t, void ) = (int ()(buf_t, void ))(cbp_head->b_cliodone);
	843
	844	if (cliodone_func != NULL) {
	845	cbp_head->b_bcount = transaction_size;
	846
	847	error = (*cliodone_func)(cbp_head, callback_arg);
	848	}
	849	}
	850	if (zero_offset)
	851	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	852
	853	free_io_buf(cbp_head);
	854
	855	if (iostate) {
	856	int need_wakeup = 0;
	857
	858	/*
	859	* someone has issued multiple I/Os asynchrounsly
	860	* and is waiting for them to complete (streaming)
	861	*/
	862	lck_mtx_lock_spin(&iostate->io_mtxp);
	863
	864	if (error && iostate->io_error == 0)
	865	iostate->io_error = error;
	866
	867	iostate->io_completed += total_size;
	868
	869	if (iostate->io_wanted) {
	870	/*
	871	* someone is waiting for the state of
	872	* this io stream to change
	873	*/
	874	iostate->io_wanted = 0;
	875	need_wakeup = 1;
	876	}
	877	lck_mtx_unlock(&iostate->io_mtxp);
	878
	879	if (need_wakeup)
	880	wakeup((caddr_t)&iostate->io_wanted);
	881	}
	882
	883	if (b_flags & B_COMMIT_UPL) {
	884	pg_offset = upl_offset & PAGE_MASK;
	885	commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	886
	887	if (error)
	888	upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
	889	else {
	890	upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
	891
	892	if ((b_flags & B_PHYS) && (b_flags & B_READ))
	893	upl_flags \|= UPL_COMMIT_SET_DIRTY;
	894
	895	if (b_flags & B_AGE)
	896	upl_flags \|= UPL_COMMIT_INACTIVATE;
	897
	898	ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
	899	}
	900	}
	901	if (real_bp) {
	902	if (error) {
	903	real_bp->b_flags \|= B_ERROR;
	904	real_bp->b_error = error;
	905	}
	906	real_bp->b_resid = total_resid;
	907
	908	buf_biodone(real_bp);
	909	}
	910	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	911	upl, upl_offset - pg_offset, commit_size, (error << 24) \| upl_flags, 0);
	912
	913	return (error);
	914	}
	915
	916
	917	uint32_t
	918	cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
	919	{
	920	if (cluster_is_throttled(vp)) {
	921	*limit = THROTTLE_MAX_IOSIZE;
	922	return 1;
	923	}
	924	return 0;
	925	}
	926
	927
	928	void
	929	cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
	930	{
	931
	932	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_START,
	933	upl_offset, size, bp, 0, 0);
	934
	935	if (bp == NULL \|\| bp->b_datap == 0) {
	936	upl_page_info_t *pl;
	937	addr64_t zero_addr;
	938
	939	pl = ubc_upl_pageinfo(upl);
	940
	941	if (upl_device_page(pl) == TRUE) {
	942	zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
	943
	944	bzero_phys_nc(zero_addr, size);
	945	} else {
	946	while (size) {
	947	int page_offset;
	948	int page_index;
	949	int zero_cnt;
	950
	951	page_index = upl_offset / PAGE_SIZE;
	952	page_offset = upl_offset & PAGE_MASK;
	953
	954	zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
	955	zero_cnt = min(PAGE_SIZE - page_offset, size);
	956
	957	bzero_phys(zero_addr, zero_cnt);
	958
	959	size -= zero_cnt;
	960	upl_offset += zero_cnt;
	961	}
	962	}
	963	} else
	964	bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
	965
	966	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_END,
	967	upl_offset, size, 0, 0, 0);
	968	}
	969
	970
	971	static void
	972	cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
	973	{
	974	cbp_head->b_validend = zero_offset;
	975	cbp_tail->b_flags \|= B_EOT;
	976	}
	977
	978	static void
	979	cluster_wait_IO(buf_t cbp_head, int async)
	980	{
	981	buf_t cbp;
	982
	983	if (async) {
	984	/*
	985	* Async callback completion will not normally generate a
	986	* wakeup upon I/O completion. To get woken up, we set
	987	* b_trans_next (which is safe for us to modify) on the last
	988	* buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
	989	* to wake us up when all buffers as part of this transaction
	990	* are completed. This is done under the umbrella of
	991	* cl_transaction_mtxp which is also taken in cluster_iodone.
	992	*/
	993	bool done = true;
	994	buf_t last = NULL;
	995
	996	lck_mtx_lock_spin(cl_transaction_mtxp);
	997
	998	for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
	999	if (!ISSET(cbp->b_flags, B_TDONE))
	1000	done = false;
	1001	}
	1002
	1003	if (!done) {
	1004	last->b_trans_next = CLUSTER_IO_WAITING;
	1005
	1006	DTRACE_IO1(wait__start, buf_t, last);
	1007	do {
	1008	msleep(last, cl_transaction_mtxp, PSPIN \| (PRIBIO+1), "cluster_wait_IO", NULL);
	1009
	1010	/*
	1011	* We should only have been woken up if all the
	1012	* buffers are completed, but just in case...
	1013	*/
	1014	done = true;
	1015	for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
	1016	if (!ISSET(cbp->b_flags, B_TDONE)) {
	1017	done = false;
	1018	break;
	1019	}
	1020	}
	1021	} while (!done);
	1022	DTRACE_IO1(wait__done, buf_t, last);
	1023
	1024	last->b_trans_next = NULL;
	1025	}
	1026
	1027	lck_mtx_unlock(cl_transaction_mtxp);
	1028	} else { // !async
	1029	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
	1030	buf_biowait(cbp);
	1031	}
	1032	}
	1033
	1034	static void
	1035	cluster_complete_transaction(buf_t cbp_head, void callback_arg, int *retval, int flags, int needwait)
	1036	{
	1037	buf_t cbp;
	1038	int error;
	1039	boolean_t isswapout = FALSE;
	1040
	1041	/*
	1042	* cluster_complete_transaction will
	1043	* only be called if we've issued a complete chain in synchronous mode
	1044	* or, we've already done a cluster_wait_IO on an incomplete chain
	1045	*/
	1046	if (needwait) {
	1047	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
	1048	buf_biowait(cbp);
	1049	}
	1050	/*
	1051	* we've already waited on all of the I/Os in this transaction,
	1052	* so mark all of the buf_t's in this transaction as B_TDONE
	1053	* so that cluster_iodone sees the transaction as completed
	1054	*/
	1055	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
	1056	cbp->b_flags \|= B_TDONE;
	1057	cbp = *cbp_head;
	1058
	1059	if ((flags & (CL_ASYNC \| CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
	1060	isswapout = TRUE;
	1061
	1062	error = cluster_iodone(cbp, callback_arg);
	1063
	1064	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
	1065	if (((flags & (CL_PAGEOUT \| CL_KEEPCACHED)) != CL_PAGEOUT) \|\| (error != ENXIO))
	1066	*retval = error;
	1067	else if (isswapout == TRUE)
	1068	*retval = error;
	1069	}
	1070	*cbp_head = (buf_t)NULL;
	1071	}
	1072
	1073
	1074	static int
	1075	cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	1076	int flags, buf_t real_bp, struct clios iostate, int (callback)(buf_t, void ), void callback_arg)
	1077	{
	1078	buf_t cbp;
	1079	u_int size;
	1080	u_int io_size;
	1081	int io_flags;
	1082	int bmap_flags;
	1083	int error = 0;
	1084	int retval = 0;
	1085	buf_t cbp_head = NULL;
	1086	buf_t cbp_tail = NULL;
	1087	int trans_count = 0;
	1088	int max_trans_count;
	1089	u_int pg_count;
	1090	int pg_offset;
	1091	u_int max_iosize;
	1092	u_int max_vectors;
	1093	int priv;
	1094	int zero_offset = 0;
	1095	int async_throttle = 0;
	1096	mount_t mp;
	1097	vm_offset_t upl_end_offset;
	1098	boolean_t need_EOT = FALSE;
	1099
	1100	/*
	1101	* we currently don't support buffers larger than a page
	1102	*/
	1103	if (real_bp && non_rounded_size > PAGE_SIZE)
	1104	panic("%s(): Called with real buffer of size %d bytes which "
	1105	"is greater than the maximum allowed size of "
	1106	"%d bytes (the system PAGE_SIZE).\n",
	1107	__FUNCTION__, non_rounded_size, PAGE_SIZE);
	1108
	1109	mp = vp->v_mount;
	1110
	1111	/*
	1112	* we don't want to do any funny rounding of the size for IO requests
	1113	* coming through the DIRECT or CONTIGUOUS paths... those pages don't
	1114	* belong to us... we can't extend (nor do we need to) the I/O to fill
	1115	* out a page
	1116	*/
	1117	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY \| CL_DIRECT_IO))) {
	1118	/*
	1119	* round the requested size up so that this I/O ends on a
	1120	* page boundary in case this is a 'write'... if the filesystem
	1121	* has blocks allocated to back the page beyond the EOF, we want to
	1122	* make sure to write out the zero's that are sitting beyond the EOF
	1123	* so that in case the filesystem doesn't explicitly zero this area
	1124	* if a hole is created via a lseek/write beyond the current EOF,
	1125	* it will return zeros when it's read back from the disk. If the
	1126	* physical allocation doesn't extend for the whole page, we'll
	1127	* only write/read from the disk up to the end of this allocation
	1128	* via the extent info returned from the VNOP_BLOCKMAP call.
	1129	*/
	1130	pg_offset = upl_offset & PAGE_MASK;
	1131
	1132	size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	1133	} else {
	1134	/*
	1135	* anyone advertising a blocksize of 1 byte probably
	1136	* can't deal with us rounding up the request size
	1137	* AFP is one such filesystem/device
	1138	*/
	1139	size = non_rounded_size;
	1140	}
	1141	upl_end_offset = upl_offset + size;
	1142
	1143	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
	1144
	1145	/*
	1146	* Set the maximum transaction size to the maximum desired number of
	1147	* buffers.
	1148	*/
	1149	max_trans_count = 8;
	1150	if (flags & CL_DEV_MEMORY)
	1151	max_trans_count = 16;
	1152
	1153	if (flags & CL_READ) {
	1154	io_flags = B_READ;
	1155	bmap_flags = VNODE_READ;
	1156
	1157	max_iosize = mp->mnt_maxreadcnt;
	1158	max_vectors = mp->mnt_segreadcnt;
	1159	} else {
	1160	io_flags = B_WRITE;
	1161	bmap_flags = VNODE_WRITE;
	1162
	1163	max_iosize = mp->mnt_maxwritecnt;
	1164	max_vectors = mp->mnt_segwritecnt;
	1165	}
	1166	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
	1167
	1168	/*
	1169	* make sure the maximum iosize is a
	1170	* multiple of the page size
	1171	*/
	1172	max_iosize &= ~PAGE_MASK;
	1173
	1174	/*
	1175	* Ensure the maximum iosize is sensible.
	1176	*/
	1177	if (!max_iosize)
	1178	max_iosize = PAGE_SIZE;
	1179
	1180	if (flags & CL_THROTTLE) {
	1181	if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
	1182	if (max_iosize > THROTTLE_MAX_IOSIZE)
	1183	max_iosize = THROTTLE_MAX_IOSIZE;
	1184	async_throttle = THROTTLE_MAXCNT;
	1185	} else {
	1186	if ( (flags & CL_DEV_MEMORY) )
	1187	async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
	1188	else {
	1189	u_int max_cluster;
	1190	u_int max_cluster_size;
	1191	u_int scale;
	1192
	1193	if (vp->v_mount->mnt_minsaturationbytecount) {
	1194	max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
	1195
	1196	scale = 1;
	1197	} else {
	1198	max_cluster_size = MAX_CLUSTER_SIZE(vp);
	1199
	1200	if (disk_conditioner_mount_is_ssd(vp->v_mount))
	1201	scale = WRITE_THROTTLE_SSD;
	1202	else
	1203	scale = WRITE_THROTTLE;
	1204	}
	1205	if (max_iosize > max_cluster_size)
	1206	max_cluster = max_cluster_size;
	1207	else
	1208	max_cluster = max_iosize;
	1209
	1210	if (size < max_cluster)
	1211	max_cluster = size;
	1212
	1213	if (flags & CL_CLOSE)
	1214	scale += MAX_CLUSTERS;
	1215
	1216	async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
	1217	}
	1218	}
	1219	}
	1220	if (flags & CL_AGE)
	1221	io_flags \|= B_AGE;
	1222	if (flags & (CL_PAGEIN \| CL_PAGEOUT))
	1223	io_flags \|= B_PAGEIO;
	1224	if (flags & (CL_IOSTREAMING))
	1225	io_flags \|= B_IOSTREAMING;
	1226	if (flags & CL_COMMIT)
	1227	io_flags \|= B_COMMIT_UPL;
	1228	if (flags & CL_DIRECT_IO)
	1229	io_flags \|= B_PHYS;
	1230	if (flags & (CL_PRESERVE \| CL_KEEPCACHED))
	1231	io_flags \|= B_CACHE;
	1232	if (flags & CL_PASSIVE)
	1233	io_flags \|= B_PASSIVE;
	1234	if (flags & CL_ENCRYPTED)
	1235	io_flags \|= B_ENCRYPTED_IO;
	1236
	1237	if (vp->v_flag & VSYSTEM)
	1238	io_flags \|= B_META;
	1239
	1240	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
	1241	/*
	1242	* then we are going to end up
	1243	* with a page that we can't complete (the file size wasn't a multiple
	1244	* of PAGE_SIZE and we're trying to read to the end of the file
	1245	* so we'll go ahead and zero out the portion of the page we can't
	1246	* read in from the file
	1247	*/
	1248	zero_offset = upl_offset + non_rounded_size;
	1249	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
	1250	assert(ISSET(flags, CL_COMMIT));
	1251
	1252	// For a direct/uncached write, we need to lock pages...
	1253
	1254	upl_t cached_upl;
	1255
	1256	/*
	1257	* Create a UPL to lock the pages in the cache whilst the
	1258	* write is in progress.
	1259	*/
	1260	ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
	1261	NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
	1262
	1263	/*
	1264	* Attach this UPL to the other UPL so that we can find it
	1265	* later.
	1266	*/
	1267	upl_set_associated_upl(upl, cached_upl);
	1268
	1269	if (upl_offset & PAGE_MASK) {
	1270	/*
	1271	* The two UPLs are not aligned, so mark the first page in
	1272	* @upl so that cluster_handle_associated_upl can handle
	1273	* it accordingly.
	1274	*/
	1275	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	1276	upl_page_set_mark(pl, 0, true);
	1277	}
	1278	}
	1279
	1280	while (size) {
	1281	daddr64_t blkno;
	1282	daddr64_t lblkno;
	1283	u_int io_size_wanted;
	1284	size_t io_size_tmp;
	1285
	1286	if (size > max_iosize)
	1287	io_size = max_iosize;
	1288	else
	1289	io_size = size;
	1290
	1291	io_size_wanted = io_size;
	1292	io_size_tmp = (size_t)io_size;
	1293
	1294	if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
	1295	break;
	1296
	1297	if (io_size_tmp > io_size_wanted)
	1298	io_size = io_size_wanted;
	1299	else
	1300	io_size = (u_int)io_size_tmp;
	1301
	1302	if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
	1303	real_bp->b_blkno = blkno;
	1304
	1305	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) \| DBG_FUNC_NONE,
	1306	(int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);
	1307
	1308	if (io_size == 0) {
	1309	/*
	1310	* vnop_blockmap didn't return an error... however, it did
	1311	* return an extent size of 0 which means we can't
	1312	* make forward progress on this I/O... a hole in the
	1313	* file would be returned as a blkno of -1 with a non-zero io_size
	1314	* a real extent is returned with a blkno != -1 and a non-zero io_size
	1315	*/
	1316	error = EINVAL;
	1317	break;
	1318	}
	1319	if ( !(flags & CL_READ) && blkno == -1) {
	1320	off_t e_offset;
	1321	int pageout_flags;
	1322
	1323	if (upl_get_internal_vectorupl(upl))
	1324	panic("Vector UPLs should not take this code-path\n");
	1325	/*
	1326	* we're writing into a 'hole'
	1327	*/
	1328	if (flags & CL_PAGEOUT) {
	1329	/*
	1330	* if we got here via cluster_pageout
	1331	* then just error the request and return
	1332	* the 'hole' should already have been covered
	1333	*/
	1334	error = EINVAL;
	1335	break;
	1336	}
	1337	/*
	1338	* we can get here if the cluster code happens to
	1339	* pick up a page that was dirtied via mmap vs
	1340	* a 'write' and the page targets a 'hole'...
	1341	* i.e. the writes to the cluster were sparse
	1342	* and the file was being written for the first time
	1343	*
	1344	* we can also get here if the filesystem supports
	1345	* 'holes' that are less than PAGE_SIZE.... because
	1346	* we can't know if the range in the page that covers
	1347	* the 'hole' has been dirtied via an mmap or not,
	1348	* we have to assume the worst and try to push the
	1349	* entire page to storage.
	1350	*
	1351	* Try paging out the page individually before
	1352	* giving up entirely and dumping it (the pageout
	1353	* path will insure that the zero extent accounting
	1354	* has been taken care of before we get back into cluster_io)
	1355	*
	1356	* go direct to vnode_pageout so that we don't have to
	1357	* unbusy the page from the UPL... we used to do this
	1358	* so that we could call ubc_msync, but that results
	1359	* in a potential deadlock if someone else races us to acquire
	1360	* that page and wins and in addition needs one of the pages
	1361	* we're continuing to hold in the UPL
	1362	*/
	1363	pageout_flags = UPL_MSYNC \| UPL_VNODE_PAGER \| UPL_NESTED_PAGEOUT;
	1364
	1365	if ( !(flags & CL_ASYNC))
	1366	pageout_flags \|= UPL_IOSYNC;
	1367	if ( !(flags & CL_COMMIT))
	1368	pageout_flags \|= UPL_NOCOMMIT;
	1369
	1370	if (cbp_head) {
	1371	buf_t prev_cbp;
	1372	int bytes_in_last_page;
	1373
	1374	/*
	1375	* first we have to wait for the the current outstanding I/Os
	1376	* to complete... EOT hasn't been set yet on this transaction
	1377	* so the pages won't be released
	1378	*/
	1379	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1380
	1381	bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
	1382	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
	1383	bytes_in_last_page += cbp->b_bcount;
	1384	bytes_in_last_page &= PAGE_MASK;
	1385
	1386	while (bytes_in_last_page) {
	1387	/*
	1388	* we've got a transcation that
	1389	* includes the page we're about to push out through vnode_pageout...
	1390	* find the bp's in the list which intersect this page and either
	1391	* remove them entirely from the transaction (there could be multiple bp's), or
	1392	* round it's iosize down to the page boundary (there can only be one)...
	1393	*
	1394	* find the last bp in the list and act on it
	1395	*/
	1396	for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
	1397	prev_cbp = cbp;
	1398
	1399	if (bytes_in_last_page >= cbp->b_bcount) {
	1400	/*
	1401	* this buf no longer has any I/O associated with it
	1402	*/
	1403	bytes_in_last_page -= cbp->b_bcount;
	1404	cbp->b_bcount = 0;
	1405
	1406	free_io_buf(cbp);
	1407
	1408	if (cbp == cbp_head) {
	1409	assert(bytes_in_last_page == 0);
	1410	/*
	1411	* the buf we just freed was the only buf in
	1412	* this transaction... so there's no I/O to do
	1413	*/
	1414	cbp_head = NULL;
	1415	cbp_tail = NULL;
	1416	} else {
	1417	/*
	1418	* remove the buf we just freed from
	1419	* the transaction list
	1420	*/
	1421	prev_cbp->b_trans_next = NULL;
	1422	cbp_tail = prev_cbp;
	1423	}
	1424	} else {
	1425	/*
	1426	* this is the last bp that has I/O
	1427	* intersecting the page of interest
	1428	* only some of the I/O is in the intersection
	1429	* so clip the size but keep it in the transaction list
	1430	*/
	1431	cbp->b_bcount -= bytes_in_last_page;
	1432	cbp_tail = cbp;
	1433	bytes_in_last_page = 0;
	1434	}
	1435	}
	1436	if (cbp_head) {
	1437	/*
	1438	* there was more to the current transaction
	1439	* than just the page we are pushing out via vnode_pageout...
	1440	* mark it as finished and complete it... we've already
	1441	* waited for the I/Os to complete above in the call to cluster_wait_IO
	1442	*/
	1443	cluster_EOT(cbp_head, cbp_tail, 0);
	1444
	1445	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
	1446
	1447	trans_count = 0;
	1448	}
	1449	}
	1450	if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
	1451	error = EINVAL;
	1452	}
	1453	e_offset = round_page_64(f_offset + 1);
	1454	io_size = e_offset - f_offset;
	1455
	1456	f_offset += io_size;
	1457	upl_offset += io_size;
	1458
	1459	if (size >= io_size)
	1460	size -= io_size;
	1461	else
	1462	size = 0;
	1463	/*
	1464	* keep track of how much of the original request
	1465	* that we've actually completed... non_rounded_size
	1466	* may go negative due to us rounding the request
	1467	* to a page size multiple (i.e. size > non_rounded_size)
	1468	*/
	1469	non_rounded_size -= io_size;
	1470
	1471	if (non_rounded_size <= 0) {
	1472	/*
	1473	* we've transferred all of the data in the original
	1474	* request, but we were unable to complete the tail
	1475	* of the last page because the file didn't have
	1476	* an allocation to back that portion... this is ok.
	1477	*/
	1478	size = 0;
	1479	}
	1480	if (error) {
	1481	if (size == 0)
	1482	flags &= ~CL_COMMIT;
	1483	break;
	1484	}
	1485	continue;
	1486	}
	1487	lblkno = (daddr64_t)(f_offset / 0x1000);
	1488	/*
	1489	* we have now figured out how much I/O we can do - this is in 'io_size'
	1490	* pg_offset is the starting point in the first page for the I/O
	1491	* pg_count is the number of full and partial pages that 'io_size' encompasses
	1492	*/
	1493	pg_offset = upl_offset & PAGE_MASK;
	1494
	1495	if (flags & CL_DEV_MEMORY) {
	1496	/*
	1497	* treat physical requests as one 'giant' page
	1498	*/
	1499	pg_count = 1;
	1500	} else
	1501	pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1502
	1503	if ((flags & CL_READ) && blkno == -1) {
	1504	vm_offset_t commit_offset;
	1505	int bytes_to_zero;
	1506	int complete_transaction_now = 0;
	1507
	1508	/*
	1509	* if we're reading and blkno == -1, then we've got a
	1510	* 'hole' in the file that we need to deal with by zeroing
	1511	* out the affected area in the upl
	1512	*/
	1513	if (io_size >= (u_int)non_rounded_size) {
	1514	/*
	1515	* if this upl contains the EOF and it is not a multiple of PAGE_SIZE
	1516	* than 'zero_offset' will be non-zero
	1517	* if the 'hole' returned by vnop_blockmap extends all the way to the eof
	1518	* (indicated by the io_size finishing off the I/O request for this UPL)
	1519	* than we're not going to issue an I/O for the
	1520	* last page in this upl... we need to zero both the hole and the tail
	1521	* of the page beyond the EOF, since the delayed zero-fill won't kick in
	1522	*/
	1523	bytes_to_zero = non_rounded_size;
	1524	if (!(flags & CL_NOZERO))
	1525	bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
	1526
	1527	zero_offset = 0;
	1528	} else
	1529	bytes_to_zero = io_size;
	1530
	1531	pg_count = 0;
	1532
	1533	cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
	1534
	1535	if (cbp_head) {
	1536	int pg_resid;
	1537
	1538	/*
	1539	* if there is a current I/O chain pending
	1540	* then the first page of the group we just zero'd
	1541	* will be handled by the I/O completion if the zero
	1542	* fill started in the middle of the page
	1543	*/
	1544	commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1545
	1546	pg_resid = commit_offset - upl_offset;
	1547
	1548	if (bytes_to_zero >= pg_resid) {
	1549	/*
	1550	* the last page of the current I/O
	1551	* has been completed...
	1552	* compute the number of fully zero'd
	1553	* pages that are beyond it
	1554	* plus the last page if its partial
	1555	* and we have no more I/O to issue...
	1556	* otherwise a partial page is left
	1557	* to begin the next I/O
	1558	*/
	1559	if ((int)io_size >= non_rounded_size)
	1560	pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1561	else
	1562	pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
	1563
	1564	complete_transaction_now = 1;
	1565	}
	1566	} else {
	1567	/*
	1568	* no pending I/O to deal with
	1569	* so, commit all of the fully zero'd pages
	1570	* plus the last page if its partial
	1571	* and we have no more I/O to issue...
	1572	* otherwise a partial page is left
	1573	* to begin the next I/O
	1574	*/
	1575	if ((int)io_size >= non_rounded_size)
	1576	pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1577	else
	1578	pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
	1579
	1580	commit_offset = upl_offset & ~PAGE_MASK;
	1581	}
	1582
	1583	// Associated UPL is currently only used in the direct write path
	1584	assert(!upl_associated_upl(upl));
	1585
	1586	if ( (flags & CL_COMMIT) && pg_count) {
	1587	ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
	1588	UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY);
	1589	}
	1590	upl_offset += io_size;
	1591	f_offset += io_size;
	1592	size -= io_size;
	1593
	1594	/*
	1595	* keep track of how much of the original request
	1596	* that we've actually completed... non_rounded_size
	1597	* may go negative due to us rounding the request
	1598	* to a page size multiple (i.e. size > non_rounded_size)
	1599	*/
	1600	non_rounded_size -= io_size;
	1601
	1602	if (non_rounded_size <= 0) {
	1603	/*
	1604	* we've transferred all of the data in the original
	1605	* request, but we were unable to complete the tail
	1606	* of the last page because the file didn't have
	1607	* an allocation to back that portion... this is ok.
	1608	*/
	1609	size = 0;
	1610	}
	1611	if (cbp_head && (complete_transaction_now \|\| size == 0)) {
	1612	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1613
	1614	cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
	1615
	1616	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
	1617
	1618	trans_count = 0;
	1619	}
	1620	continue;
	1621	}
	1622	if (pg_count > max_vectors) {
	1623	if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
	1624	io_size = PAGE_SIZE - pg_offset;
	1625	pg_count = 1;
	1626	} else {
	1627	io_size -= (pg_count - max_vectors) * PAGE_SIZE;
	1628	pg_count = max_vectors;
	1629	}
	1630	}
	1631	/*
	1632	* If the transaction is going to reach the maximum number of
	1633	* desired elements, truncate the i/o to the nearest page so
	1634	* that the actual i/o is initiated after this buffer is
	1635	* created and added to the i/o chain.
	1636	*
	1637	* I/O directed to physically contiguous memory
	1638	* doesn't have a requirement to make sure we 'fill' a page
	1639	*/
	1640	if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
	1641	((upl_offset + io_size) & PAGE_MASK)) {
	1642	vm_offset_t aligned_ofs;
	1643
	1644	aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
	1645	/*
	1646	* If the io_size does not actually finish off even a
	1647	* single page we have to keep adding buffers to the
	1648	* transaction despite having reached the desired limit.
	1649	*
	1650	* Eventually we get here with the page being finished
	1651	* off (and exceeded) and then we truncate the size of
	1652	* this i/o request so that it is page aligned so that
	1653	* we can finally issue the i/o on the transaction.
	1654	*/
	1655	if (aligned_ofs > upl_offset) {
	1656	io_size = aligned_ofs - upl_offset;
	1657	pg_count--;
	1658	}
	1659	}
	1660
	1661	if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
	1662	/*
	1663	* if we're not targeting a virtual device i.e. a disk image
	1664	* it's safe to dip into the reserve pool since real devices
	1665	* can complete this I/O request without requiring additional
	1666	* bufs from the alloc_io_buf pool
	1667	*/
	1668	priv = 1;
	1669	else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
	1670	/*
	1671	* Throttle the speculative IO
	1672	*/
	1673	priv = 0;
	1674	else
	1675	priv = 1;
	1676
	1677	cbp = alloc_io_buf(vp, priv);
	1678
	1679	if (flags & CL_PAGEOUT) {
	1680	u_int i;
	1681
	1682	/*
	1683	* since blocks are in offsets of 0x1000, scale
	1684	* iteration to (PAGE_SIZE * pg_count) of blks.
	1685	*/
	1686	for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
	1687	if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
	1688	panic("BUSY bp found in cluster_io");
	1689	}
	1690	}
	1691	if (flags & CL_ASYNC) {
	1692	if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
	1693	panic("buf_setcallback failed\n");
	1694	}
	1695	cbp->b_cliodone = (void *)callback;
	1696	cbp->b_flags \|= io_flags;
	1697	if (flags & CL_NOCACHE)
	1698	cbp->b_attr.ba_flags \|= BA_NOCACHE;
	1699
	1700	cbp->b_lblkno = lblkno;
	1701	cbp->b_blkno = blkno;
	1702	cbp->b_bcount = io_size;
	1703
	1704	if (buf_setupl(cbp, upl, upl_offset))
	1705	panic("buf_setupl failed\n");
	1706	#if CONFIG_IOSCHED
	1707	upl_set_blkno(upl, upl_offset, io_size, blkno);
	1708	#endif
	1709	cbp->b_trans_next = (buf_t)NULL;
	1710
	1711	if ((cbp->b_iostate = (void *)iostate))
	1712	/*
	1713	* caller wants to track the state of this
	1714	* io... bump the amount issued against this stream
	1715	*/
	1716	iostate->io_issued += io_size;
	1717
	1718	if (flags & CL_READ) {
	1719	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) \| DBG_FUNC_NONE,
	1720	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	1721	}
	1722	else {
	1723	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) \| DBG_FUNC_NONE,
	1724	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	1725	}
	1726
	1727	if (cbp_head) {
	1728	cbp_tail->b_trans_next = cbp;
	1729	cbp_tail = cbp;
	1730	} else {
	1731	cbp_head = cbp;
	1732	cbp_tail = cbp;
	1733
	1734	if ( (cbp_head->b_real_bp = real_bp) )
	1735	real_bp = (buf_t)NULL;
	1736	}
	1737	(buf_t )(&cbp->b_trans_head) = cbp_head;
	1738
	1739	trans_count++;
	1740
	1741	upl_offset += io_size;
	1742	f_offset += io_size;
	1743	size -= io_size;
	1744	/*
	1745	* keep track of how much of the original request
	1746	* that we've actually completed... non_rounded_size
	1747	* may go negative due to us rounding the request
	1748	* to a page size multiple (i.e. size > non_rounded_size)
	1749	*/
	1750	non_rounded_size -= io_size;
	1751
	1752	if (non_rounded_size <= 0) {
	1753	/*
	1754	* we've transferred all of the data in the original
	1755	* request, but we were unable to complete the tail
	1756	* of the last page because the file didn't have
	1757	* an allocation to back that portion... this is ok.
	1758	*/
	1759	size = 0;
	1760	}
	1761	if (size == 0) {
	1762	/*
	1763	* we have no more I/O to issue, so go
	1764	* finish the final transaction
	1765	*/
	1766	need_EOT = TRUE;
	1767	} else if ( ((flags & CL_DEV_MEMORY) \|\| (upl_offset & PAGE_MASK) == 0) &&
	1768	((flags & CL_ASYNC) \|\| trans_count > max_trans_count) ) {
	1769	/*
	1770	* I/O directed to physically contiguous memory...
	1771	* which doesn't have a requirement to make sure we 'fill' a page
	1772	* or...
	1773	* the current I/O we've prepared fully
	1774	* completes the last page in this request
	1775	* and ...
	1776	* it's either an ASYNC request or
	1777	* we've already accumulated more than 8 I/O's into
	1778	* this transaction so mark it as complete so that
	1779	* it can finish asynchronously or via the cluster_complete_transaction
	1780	* below if the request is synchronous
	1781	*/
	1782	need_EOT = TRUE;
	1783	}
	1784	if (need_EOT == TRUE)
	1785	cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
	1786
	1787	if (flags & CL_THROTTLE)
	1788	(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
	1789
	1790	if ( !(io_flags & B_READ))
	1791	vnode_startwrite(vp);
	1792
	1793	if (flags & CL_RAW_ENCRYPTED) {
	1794	/*
	1795	* User requested raw encrypted bytes.
	1796	* Twiddle the bit in the ba_flags for the buffer
	1797	*/
	1798	cbp->b_attr.ba_flags \|= BA_RAW_ENCRYPTED_IO;
	1799	}
	1800
	1801	(void) VNOP_STRATEGY(cbp);
	1802
	1803	if (need_EOT == TRUE) {
	1804	if ( !(flags & CL_ASYNC))
	1805	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
	1806
	1807	need_EOT = FALSE;
	1808	trans_count = 0;
	1809	cbp_head = NULL;
	1810	}
	1811	}
	1812	if (error) {
	1813	int abort_size;
	1814
	1815	io_size = 0;
	1816
	1817	if (cbp_head) {
	1818	/*
	1819	* Wait until all of the outstanding I/O
	1820	* for this partial transaction has completed
	1821	*/
	1822	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1823
	1824	/*
	1825	* Rewind the upl offset to the beginning of the
	1826	* transaction.
	1827	*/
	1828	upl_offset = cbp_head->b_uploffset;
	1829	}
	1830
	1831	if (ISSET(flags, CL_COMMIT)) {
	1832	cluster_handle_associated_upl(iostate, upl, upl_offset,
	1833	upl_end_offset - upl_offset);
	1834	}
	1835
	1836	// Free all the IO buffers in this transaction
	1837	for (cbp = cbp_head; cbp;) {
	1838	buf_t cbp_next;
	1839
	1840	size += cbp->b_bcount;
	1841	io_size += cbp->b_bcount;
	1842
	1843	cbp_next = cbp->b_trans_next;
	1844	free_io_buf(cbp);
	1845	cbp = cbp_next;
	1846	}
	1847
	1848	if (iostate) {
	1849	int need_wakeup = 0;
	1850
	1851	/*
	1852	* update the error condition for this stream
	1853	* since we never really issued the io
	1854	* just go ahead and adjust it back
	1855	*/
	1856	lck_mtx_lock_spin(&iostate->io_mtxp);
	1857
	1858	if (iostate->io_error == 0)
	1859	iostate->io_error = error;
	1860	iostate->io_issued -= io_size;
	1861
	1862	if (iostate->io_wanted) {
	1863	/*
	1864	* someone is waiting for the state of
	1865	* this io stream to change
	1866	*/
	1867	iostate->io_wanted = 0;
	1868	need_wakeup = 1;
	1869	}
	1870	lck_mtx_unlock(&iostate->io_mtxp);
	1871
	1872	if (need_wakeup)
	1873	wakeup((caddr_t)&iostate->io_wanted);
	1874	}
	1875
	1876	if (flags & CL_COMMIT) {
	1877	int upl_flags;
	1878
	1879	pg_offset = upl_offset & PAGE_MASK;
	1880	abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
	1881
	1882	upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
	1883
	1884	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) \| DBG_FUNC_NONE,
	1885	upl, upl_offset - pg_offset, abort_size, (error << 24) \| upl_flags, 0);
	1886	}
	1887	if (retval == 0)
	1888	retval = error;
	1889	} else if (cbp_head)
	1890	panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
	1891
	1892	if (real_bp) {
	1893	/*
	1894	* can get here if we either encountered an error
	1895	* or we completely zero-filled the request and
	1896	* no I/O was issued
	1897	*/
	1898	if (error) {
	1899	real_bp->b_flags \|= B_ERROR;
	1900	real_bp->b_error = error;
	1901	}
	1902	buf_biodone(real_bp);
	1903	}
	1904	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
	1905
	1906	return (retval);
	1907	}
	1908
	/*
	 * Zero the caller's vector-UPL bookkeeping variables in one statement.
	 * The expansion refers to the caller's local names directly and already
	 * ends with a semicolon.
	 */
	#define reset_vector_run_state()					\
		issueVectorUPL = vector_upl_offset = vector_upl_index =		\
		    vector_upl_iosize = vector_upl_size = 0;
	1911
	1912	static int
	1913	vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
	1914	int io_flag, buf_t real_bp, struct clios iostate, int (callback)(buf_t, void ), void callback_arg)
	1915	{
	1916	vector_upl_set_pagelist(vector_upl);
	1917
	1918	if(io_flag & CL_READ) {
	1919	if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
	1920	io_flag &= ~CL_PRESERVE; /don't zero fill/
	1921	else
	1922	io_flag \|= CL_PRESERVE; /zero fill/
	1923	}
	1924	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
	1925
	1926	}
	1927
	1928	static int
	1929	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (callback)(buf_t, void ), void *callback_arg, int bflag)
	1930	{
	1931	int pages_in_prefetch;
	1932
	1933	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_START,
	1934	(int)f_offset, size, (int)filesize, 0, 0);
	1935
	1936	if (f_offset >= filesize) {
	1937	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	1938	(int)f_offset, 0, 0, 0, 0);
	1939	return(0);
	1940	}
	1941	if ((off_t)size > (filesize - f_offset))
	1942	size = filesize - f_offset;
	1943	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1944
	1945	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
	1946
	1947	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	1948	(int)f_offset + size, pages_in_prefetch, 0, 1, 0);
	1949
	1950	return (pages_in_prefetch);
	1951	}
	1952
	1953
	1954
	1955	static void
	1956	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
	1957	int bflag)
	1958	{
		/*
		 * Decide whether to issue a read-ahead after the read described by
		 * 'extent', and update the read-ahead state in 'rap'.
		 *
		 * extent->b_addr / e_addr, rap->cl_lastr and rap->cl_maxra are page
		 * numbers (they are multiplied by PAGE_SIZE_64 to form a file
		 * offset); rap->cl_ralen is a length in pages.
		 *
		 * The 4th argument of each DBG_FUNC_END trace identifies which exit
		 * was taken (0, 1, 6, 2, 3 or 4 in the order they appear below).
		 */
	1959	daddr64_t r_addr;
	1960	off_t f_offset;
	1961	int size_of_prefetch;
	1962	u_int max_prefetch;
	1963
	1964
	1965	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
	1966	(int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
	1967
		/* single-page read of the same page as last time: leave state alone */
	1968	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
	1969	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	1970	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
	1971	return;
	1972	}
		/*
		 * no previous read recorded, or this read does not start at the
		 * last page read or the page right after it: reset the window
		 */
	1973	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
	1974	rap->cl_ralen = 0;
	1975	rap->cl_maxra = 0;
	1976
	1977	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	1978	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
	1979
	1980	return;
	1981	}
	1982	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
	1983
	1984	if (max_prefetch > speculative_prefetch_max)
	1985	max_prefetch = speculative_prefetch_max;
	1986
		/* a prefetch budget of one page or less is not worth issuing */
	1987	if (max_prefetch <= PAGE_SIZE) {
	1988	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	1989	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
	1990	return;
	1991	}
		/*
		 * the read ended inside the range already prefetched and more than
		 * a quarter of the current window still lies ahead of it: wait
		 */
	1992	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
	1993	if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
	1994
	1995	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	1996	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
	1997	return;
	1998	}
	1999	}
		/* first page past both the read and the previous read-ahead */
	2000	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	2001	f_offset = (off_t)(r_addr * PAGE_SIZE_64);
	2002
	2003	size_of_prefetch = 0;
	2004
		/*
		 * probe that one page with UPL_ROP_PRESENT; a non-zero result
		 * (presumably: the page is already resident) cancels the prefetch
		 */
	2005	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
	2006
	2007	if (size_of_prefetch) {
	2008	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	2009	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
	2010	return;
	2011	}
	2012	if (f_offset < filesize) {
	2013	daddr64_t read_size;
	2014
		/* window starts at 1 page and doubles, capped at max_prefetch */
	2015	rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
	2016
	2017	read_size = (extent->e_addr + 1) - extent->b_addr;
	2018
		/* never prefetch less than the read just done (same cap applies) */
	2019	if (read_size > rap->cl_ralen) {
	2020	if (read_size > max_prefetch / PAGE_SIZE)
	2021	rap->cl_ralen = max_prefetch / PAGE_SIZE;
	2022	else
	2023	rap->cl_ralen = read_size;
	2024	}
	2025	size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
	2026
		/* cluster_read_prefetch() returns pages; record the new high-water page */
	2027	if (size_of_prefetch)
	2028	rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	2029	}
	2030	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
	2031	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
	2032	}
	2033
	2034
	2035	int
	2036	cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2037	int size, off_t filesize, int flags)
	2038	{
		/*
		 * Form of cluster_pageout_ext() for callers with no completion
		 * callback: forwards every argument unchanged and passes NULL for
		 * both callback and callback_arg.  Returns whatever
		 * cluster_pageout_ext() returns.
		 */
	2039	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
	2040
	2041	}
	2042
	2043
	2044	int
	2045	cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2046	int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
	2047	{
		/*
		 * Validate a page-out request and pass it to cluster_io().
		 *
		 * UPL_* bits in 'flags' are translated to CL_* bits:
		 *   CL_PAGEOUT | CL_THROTTLE   always
		 *   CL_ASYNC                   unless UPL_IOSYNC
		 *   CL_COMMIT                  unless UPL_NOCOMMIT
		 *   CL_KEEPCACHED              if UPL_KEEPCACHED
		 *   CL_ENCRYPTED               if UPL_PAGING_ENCRYPTED
		 *
		 * Returns EINVAL for size <= 0 or a bad offset/size, EROFS on a
		 * read-only mount, otherwise the result of cluster_io().  On the
		 * EROFS and bad offset/size paths the UPL range is aborted first,
		 * but only when CL_COMMIT is in effect.
		 */
	2048	int io_size;
	2049	int rounded_size;
	2050	off_t max_size;
	2051	int local_flags;
	2052
	2053	local_flags = CL_PAGEOUT | CL_THROTTLE;
	2054
	2055	if ((flags & UPL_IOSYNC) == 0)
	2056	local_flags |= CL_ASYNC;
	2057	if ((flags & UPL_NOCOMMIT) == 0)
	2058	local_flags |= CL_COMMIT;
	2059	if ((flags & UPL_KEEPCACHED))
	2060	local_flags |= CL_KEEPCACHED;
	2061	if (flags & UPL_PAGING_ENCRYPTED)
	2062	local_flags |= CL_ENCRYPTED;
	2063
	2064
	2065	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
	2066	(int)f_offset, size, (int)filesize, local_flags, 0);
	2067
	2068	/*
	2069	* If they didn't specify any I/O, then we are done...
	2070	* we can't issue an abort because we don't know how
	2071	* big the upl really is
	2072	*/
	2073	if (size <= 0)
	2074	return (EINVAL);
	2075
	2076	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
	2077	if (local_flags & CL_COMMIT)
	2078	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	2079	return (EROFS);
	2080	}
	2081	/*
	2082	* can't page-out to a negative offset
	2083	* or if we're starting beyond the EOF
	2084	* or if the file offset isn't page aligned
	2085	* or the size requested isn't a multiple of PAGE_SIZE
	2086	*/
	2087	if (f_offset < 0 || f_offset >= filesize ||
	2088	(f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
	2089	if (local_flags & CL_COMMIT)
	2090	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	2091	return (EINVAL);
	2092	}
		/* write no further than EOF */
	2093	max_size = filesize - f_offset;
	2094
	2095	if (size < max_size)
	2096	io_size = size;
	2097	else
	2098	io_size = max_size;
	2099
	2100	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2101
		/* whole pages of the UPL that lie past EOF are released, not written */
	2102	if (size > rounded_size) {
	2103	if (local_flags & CL_COMMIT)
	2104	ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
	2105	UPL_ABORT_FREE_ON_EMPTY);
	2106	}
	2107	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
	2108	local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
	2109	}
	2110
	2111
	2112	int
	2113	cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2114	int size, off_t filesize, int flags)
	2115	{
		/*
		 * Form of cluster_pagein_ext() for callers with no completion
		 * callback: forwards every argument unchanged and passes NULL for
		 * both callback and callback_arg.  Returns whatever
		 * cluster_pagein_ext() returns.
		 */
	2116	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
	2117	}
	2118
	2119
	2120	int
	2121	cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2122	int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
	2123	{
		/*
		 * Validate a page-in request and pass it to cluster_io() with
		 * CL_READ | CL_PAGEIN.
		 *
		 * UPL_* bits in 'flags' are translated to CL_* bits:
		 *   CL_ASYNC        unless UPL_IOSYNC
		 *   CL_COMMIT       unless UPL_NOCOMMIT
		 *   CL_IOSTREAMING  if UPL_IOSTREAMING
		 *   CL_ENCRYPTED    if UPL_PAGING_ENCRYPTED
		 *
		 * Panics when upl is NULL or size is negative.  Returns EINVAL
		 * (after aborting the UPL range with UPL_ABORT_ERROR when CL_COMMIT
		 * is in effect) for a bad offset/size, otherwise the result of
		 * cluster_io().
		 */
	2124	u_int io_size;
	2125	int rounded_size;
	2126	off_t max_size;
	2127	int retval;
	2128	int local_flags = 0;
	2129
		/* note: the same panic text is used for a negative size */
	2130	if (upl == NULL || size < 0)
	2131	panic("cluster_pagein: NULL upl passed in");
	2132
	2133	if ((flags & UPL_IOSYNC) == 0)
	2134	local_flags |= CL_ASYNC;
	2135	if ((flags & UPL_NOCOMMIT) == 0)
	2136	local_flags |= CL_COMMIT;
	2137	if (flags & UPL_IOSTREAMING)
	2138	local_flags |= CL_IOSTREAMING;
	2139	if (flags & UPL_PAGING_ENCRYPTED)
	2140	local_flags |= CL_ENCRYPTED;
	2141
	2142
	2143	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
	2144	(int)f_offset, size, (int)filesize, local_flags, 0);
	2145
	2146	/*
	2147	* can't page-in from a negative offset
	2148	* or if we're starting beyond the EOF
	2149	* or if the file offset isn't page aligned
	2150	* or the size requested isn't a multiple of PAGE_SIZE
	2151	* or the offset into the upl isn't page aligned
	2152	*/
	2152	if (f_offset < 0 || f_offset >= filesize ||
	2153	(f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
	2154	if (local_flags & CL_COMMIT)
	2155	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
	2156	return (EINVAL);
	2157	}
		/* read no further than EOF */
	2158	max_size = filesize - f_offset;
	2159
	2160	if (size < max_size)
	2161	io_size = size;
	2162	else
	2163	io_size = max_size;
	2164
	2165	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2166
		/* whole pages of the UPL that lie past EOF are aborted up front */
	2167	if (size > rounded_size && (local_flags & CL_COMMIT))
	2168	ubc_upl_abort_range(upl, upl_offset + rounded_size,
	2169	size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
	2170
	2171	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
	2172	local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	2173
	2174	return (retval);
	2175	}
	2176
	2177
	2178	int
	2179	cluster_bp(buf_t bp)
	2180	{
		/*
		 * Form of cluster_bp_ext() for callers with no completion callback:
		 * passes NULL for both callback and callback_arg and returns
		 * whatever cluster_bp_ext() returns.
		 */
	2181	return cluster_bp_ext(bp, NULL, NULL);
	2182	}
	2183
	2184
	2185	int
	2186	cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
	2187	{
		/*
		 * Issue the I/O described by buffer 'bp' through cluster_io().
		 *
		 * The request is always CL_ASYNC; CL_READ is added when B_READ is
		 * set in bp->b_flags and CL_PASSIVE when B_PASSIVE is set.  The
		 * file offset comes from ubc_blktooff(bp->b_vp, bp->b_lblkno); the
		 * transfer uses bp->b_upl starting at UPL offset 0 for
		 * bp->b_bcount bytes, with bp itself passed as the 'real_bp'
		 * argument.  Returns the result of cluster_io().
		 */
	2188	off_t f_offset;
	2189	int flags;
	2190
	2191	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
	2192	bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	2193
	2194	if (bp->b_flags & B_READ)
	2195	flags = CL_ASYNC | CL_READ;
	2196	else
	2197	flags = CL_ASYNC;
	2198	if (bp->b_flags & B_PASSIVE)
	2199	flags |= CL_PASSIVE;
	2200
	2201	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
	2202
	2203	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
	2204	}
	2205
	2206
	2207
	2208	int
	2209	cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
	2210	{
		/*
		 * Form of cluster_write_ext() for callers with no completion
		 * callback: forwards every argument unchanged and passes NULL for
		 * both callback and callback_arg.  Returns whatever
		 * cluster_write_ext() returns.
		 */
	2211	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
	2212	}
	2213
	2214
	2215	int
	2216	cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
	2217	int xflags, int (*callback)(buf_t, void *), void *callback_arg)
	2218	{
		/*
		 * Top-level write entry point: picks a write strategy and loops
		 * until the uio is drained, uio_offset reaches newEOF, or an error
		 * occurs.
		 *
		 * uio == NULL means "zero-fill only": a single cluster_write_copy()
		 * call is made and its result returned.
		 *
		 * Otherwise write_type starts as IO_COPY.  cluster_io_type() is
		 * consulted for a different type only when IO_NOCACHE is set,
		 * IO_NODIRECT is clear and the uio targets user space.  IO_DIRECT
		 * is downgraded to IO_COPY when head or tail zero-fill was asked
		 * for.  Each loop pass dispatches on write_type; the contig and
		 * direct helpers may change write_type/write_length for the next
		 * pass.
		 *
		 * Returns 0 or the first error reported by a helper.
		 */
	2219	user_ssize_t cur_resid;
	2220	int retval = 0;
	2221	int flags;
	2222	int zflags;
	2223	int bflag;
	2224	int write_type = IO_COPY;
	2225	u_int32_t write_length;
	2226
	2227	flags = xflags;
	2228
	2229	if (flags & IO_PASSIVE)
	2230	bflag = CL_PASSIVE;
	2231	else
	2232	bflag = 0;
	2233
		/* a vnode marked VNOCACHE_DATA forces no-cache behaviour on the request */
	2234	if (vp->v_flag & VNOCACHE_DATA){
	2235	flags |= IO_NOCACHE;
	2236	bflag |= CL_NOCACHE;
	2237	}
	2238	if (uio == NULL) {
	2239	/*
	2240	* no user data...
	2241	* this call is being made to zero-fill some range in the file
	2242	*/
	2243	retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
	2244
	2245	return(retval);
	2246	}
	2247	/*
	2248	* do a write through the cache if one of the following is true....
	2249	* NOCACHE is not true or NODIRECT is true
	2250	* the uio request doesn't target USERSPACE
	2251	* otherwise, find out if we want the direct or contig variant for
	2252	* the first vector in the uio request
	2253	*/
	2254	if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
	2255	retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	2256
	2257	if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT)
	2258	/*
	2259	* must go through the cached variant in this case
	2260	*/
	2261	write_type = IO_COPY;
	2262
	2263	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
	2264
	2265	switch (write_type) {
	2266
	2267	case IO_COPY:
	2268	/*
	2269	* make sure the uio_resid isn't too big...
	2270	* internally, we want to handle all of the I/O in
	2271	* chunk sizes that fit in a 32 bit int
	2272	*/
	2273	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
	2274	/*
	2275	* we're going to have to call cluster_write_copy
	2276	* more than once...
	2277	*
	2278	* only want the last call to cluster_write_copy to
	2279	* have the IO_TAILZEROFILL flag set and only the
	2280	* first call should have IO_HEADZEROFILL
	2281	*/
	2282	zflags = flags & ~IO_TAILZEROFILL;
	2283	flags &= ~IO_HEADZEROFILL;
	2284
	2285	write_length = MAX_IO_REQUEST_SIZE;
	2286	} else {
	2287	/*
	2288	* last call to cluster_write_copy
	2289	*/
	2290	zflags = flags;
	2291
	2292	write_length = (u_int32_t)cur_resid;
	2293	}
	2294	retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
	2295	break;
	2296
	2297	case IO_CONTIG:
		/* zero-fill is done by separate cluster_write_copy() calls around the contig write */
	2298	zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
	2299
	2300	if (flags & IO_HEADZEROFILL) {
	2301	/*
	2302	* only do this once per request
	2303	*/
	2304	flags &= ~IO_HEADZEROFILL;
	2305
	2306	retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
	2307	headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
	2308	if (retval)
	2309	break;
	2310	}
	2311	retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
	2312
	2313	if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
	2314	/*
	2315	* we're done with the data from the user specified buffer(s)
	2316	* and we've been requested to zero fill at the tail
	2317	* treat this as an IO_HEADZEROFILL which doesn't require a uio
	2318	* by rearranging the args and passing in IO_HEADZEROFILL
	2319	*/
	2320	retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
	2321	(off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
	2322	}
	2323	break;
	2324
	2325	case IO_DIRECT:
	2326	/*
	2327	* cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
	2328	*/
	2329	retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
	2330	break;
	2331
	2332	case IO_UNKNOWN:
		/* a helper could not continue in its own mode: re-classify the next vector */
	2333	retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	2334	break;
	2335	}
	2336	/*
	2337	* in case we end up calling cluster_write_copy (from cluster_write_direct)
	2338	* multiple times to service a multi-vector request that is not aligned properly
	2339	* we need to update the oldEOF so that we
	2340	* don't zero-fill the head of a page if we've successfully written
	2341	* data to that area... 'cluster_write_copy' will zero-fill the head of a
	2342	* page that is beyond the oldEOF if the write is unaligned... we only
	2343	* want that to happen for the very first page of the cluster_write,
	2344	* NOT the first page of each vector making up a multi-vector write.
	2345	*/
	2346	if (uio->uio_offset > oldEOF)
	2347	oldEOF = uio->uio_offset;
	2348	}
	2349	return (retval);
	2350	}
	2351
	2352
	2353	static int
	2354	cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
	2355	int flags, int (*callback)(buf_t, void *), void *callback_arg)
	2356	{
		/*
		 * Write the current uio vector(s) straight from the caller's pages.
		 *
		 * For each page-multiple chunk a UPL is created over the caller's
		 * buffer with vm_map_get_upl() and handed to cluster_io() with
		 * CL_DIRECT_IO | CL_ASYNC.  When the uio has more than one iovec
		 * (useVectorUPL) the per-chunk UPLs are collected into a vector
		 * UPL and issued together through vector_cluster_io().
		 *
		 * write_type / write_length are in-out: *write_length is the size
		 * of the current vector on entry; after a vector is fully consumed
		 * cluster_io_type() refreshes both, and the routine loops back to
		 * next_dwrite while the new type is still IO_DIRECT.
		 *
		 * Every exit goes through wait_for_dwrites, which drains all
		 * outstanding async writes and destroys the iostate mutex.  Any
		 * bytes of the vector that were not issued directly (misaligned
		 * offsets, sub-page tail, UPL setup failure) are then written with
		 * cluster_write_copy() and *write_type is set to IO_UNKNOWN.
		 *
		 * Returns 0, an error from one of the helpers, the first async I/O
		 * error recorded in iostate, or EAGAIN when the write was cut
		 * short because of throttling (IO_RETURN_ON_THROTTLE).
		 */
	2357	upl_t upl;
	2358	upl_page_info_t *pl;
	2359	vm_offset_t upl_offset;
	2360	vm_offset_t vector_upl_offset = 0;
	2361	u_int32_t io_req_size;
	2362	u_int32_t offset_in_file;
	2363	u_int32_t offset_in_iovbase;
	2364	u_int32_t io_size;
	2365	int io_flag = 0;
	2366	upl_size_t upl_size, vector_upl_size = 0;
	2367	vm_size_t upl_needed_size;
	2368	mach_msg_type_number_t pages_in_pl;
	2369	upl_control_flags_t upl_flags;
	2370	kern_return_t kret;
	2371	mach_msg_type_number_t i;
	2372	int force_data_sync;
	2373	int retval = 0;
	2374	int first_IO = 1;
	2375	struct clios iostate;
	2376	user_addr_t iov_base;
	2377	u_int32_t mem_alignment_mask;
	2378	u_int32_t devblocksize;
	2379	u_int32_t max_io_size;
	2380	u_int32_t max_upl_size;
	2381	u_int32_t max_vector_size;
	2382	u_int32_t bytes_outstanding_limit;
	2383	boolean_t io_throttled = FALSE;
	2384
	2385	u_int32_t vector_upl_iosize = 0;
	2386	int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
	2387	off_t v_upl_uio_offset = 0;
	2388	int vector_upl_index=0;
	2389	upl_t vector_upl = NULL;
	2390
	2391
	2392	/*
	2393	* When we enter this routine, we know
	2394	* -- the resid will not exceed iov_len
	2395	*/
	2396	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
	2397	(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
	2398
	2399	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
	2400
	2401	io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
	2402
	2403	if (flags & IO_PASSIVE)
	2404	io_flag |= CL_PASSIVE;
	2405
	2406	if (flags & IO_NOCACHE)
	2407	io_flag |= CL_NOCACHE;
	2408
	2409	if (flags & IO_SKIP_ENCRYPTION)
	2410	io_flag |= CL_ENCRYPTED;
	2411
		/* iostate tracks the async writes issued below so they can be waited on */
	2412	iostate.io_completed = 0;
	2413	iostate.io_issued = 0;
	2414	iostate.io_error = 0;
	2415	iostate.io_wanted = 0;
	2416
	2417	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	2418
	2419	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	2420	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	2421
	2422	if (devblocksize == 1) {
	2423	/*
	2424	* the AFP client advertises a devblocksize of 1
	2425	* however, its BLOCKMAP routine maps to physical
	2426	* blocks that are PAGE_SIZE in size...
	2427	* therefore we can't ask for I/Os that aren't page aligned
	2428	* or aren't multiples of PAGE_SIZE in size
	2429	* by setting devblocksize to PAGE_SIZE, we re-instate
	2430	* the old behavior we had before the mem_alignment_mask
	2431	* changes went in...
	2432	*/
	2433	devblocksize = PAGE_SIZE;
	2434	}
	2435
	2436	next_dwrite:
	2437	io_req_size = *write_length;
	2438	iov_base = uio_curriovbase(uio);
	2439
	2440	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	2441	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	2442
	2443	if (offset_in_file || offset_in_iovbase) {
	2444	/*
	2445	* one of the 2 important offsets is misaligned
	2446	* so fire an I/O through the cache for this entire vector
	2447	*/
	2448	goto wait_for_dwrites;
	2449	}
	2450	if (iov_base & (devblocksize - 1)) {
	2451	/*
	2452	* the offset in memory must be on a device block boundary
	2453	* so that we can guarantee that we can generate an
	2454	* I/O that ends on a page boundary in cluster_io
	2455	*/
	2456	goto wait_for_dwrites;
	2457	}
	2458
	2459	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
		/* only whole pages are written directly; a sub-page remainder falls to the copy path */
	2460	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
	2461	int throttle_type;
	2462
	2463	if ( (throttle_type = cluster_is_throttled(vp)) ) {
	2464	/*
	2465	* we're in the throttle window, at the very least
	2466	* we want to limit the size of the I/O we're about
	2467	* to issue
	2468	*/
	2469	if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
	2470	/*
	2471	* we're in the throttle window and at least 1 I/O
	2472	* has already been issued by a throttleable thread
	2473	* in this window, so return with EAGAIN to indicate
	2474	* to the FS issuing the cluster_write call that it
	2475	* should now throttle after dropping any locks
	2476	*/
	2477	throttle_info_update_by_mount(vp->v_mount);
	2478
	2479	io_throttled = TRUE;
	2480	goto wait_for_dwrites;
	2481	}
	2482	max_vector_size = THROTTLE_MAX_IOSIZE;
	2483	max_io_size = THROTTLE_MAX_IOSIZE;
	2484	} else {
	2485	max_vector_size = MAX_VECTOR_UPL_SIZE;
	2486	max_io_size = max_upl_size;
	2487	}
	2488
		/* cluster_syncup() is called once, just before the first direct I/O */
	2489	if (first_IO) {
	2490	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
	2491	first_IO = 0;
	2492	}
	2493	io_size = io_req_size & ~PAGE_MASK;
	2494	iov_base = uio_curriovbase(uio);
	2495
	2496	if (io_size > max_io_size)
	2497	io_size = max_io_size;
	2498
	2499	if(useVectorUPL && (iov_base & PAGE_MASK)) {
	2500	/*
	2501	* We have an iov_base that's not page-aligned.
	2502	* Issue all I/O's that have been collected within
	2503	* this Vectored UPL.
	2504	*/
	2505	if(vector_upl_index) {
	2506	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2507	reset_vector_run_state();
	2508	}
	2509
	2510	/*
	2511	* After this point, if we are using the Vector UPL path and the base is
	2512	* not page-aligned then the UPL with that base will be the first in the vector UPL.
	2513	*/
	2514	}
	2515
	2516	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	2517	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
	2518
	2519	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
	2520	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	2521
	2522	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		/*
		 * up to three attempts to get a UPL in which every page is valid;
		 * the attempt number is passed to vm_map_get_upl() as its last argument
		 */
	2523	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	2524	pages_in_pl = 0;
	2525	upl_size = upl_needed_size;
	2526	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
	2527	UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
	2528
	2529	kret = vm_map_get_upl(map,
	2530	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	2531	&upl_size,
	2532	&upl,
	2533	NULL,
	2534	&pages_in_pl,
	2535	&upl_flags,
	2536	VM_KERN_MEMORY_FILE,
	2537	force_data_sync);
	2538
	2539	if (kret != KERN_SUCCESS) {
	2540	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
	2541	0, 0, 0, kret, 0);
	2542	/*
	2543	* failed to get pagelist
	2544	*
	2545	* we may have already spun some portion of this request
	2546	* off as async requests... we need to wait for the I/O
	2547	* to complete before returning
	2548	*/
	2549	goto wait_for_dwrites;
	2550	}
	2551	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	2552	pages_in_pl = upl_size / PAGE_SIZE;
	2553
	2554	for (i = 0; i < pages_in_pl; i++) {
	2555	if (!upl_valid_page(pl, i))
	2556	break;
	2557	}
	2558	if (i == pages_in_pl)
	2559	break;
	2560
	2561	/*
	2562	* didn't get all the pages back that we
	2563	* needed... release this upl and try again
	2564	*/
	2565	ubc_upl_abort(upl, 0);
	2566	}
	2567	if (force_data_sync >= 3) {
	2568	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
	2569	i, pages_in_pl, upl_size, kret, 0);
	2570	/*
	2571	* for some reason, we couldn't acquire a hold on all
	2572	* the pages needed in the user's address space
	2573	*
	2574	* we may have already spun some portion of this request
	2575	* off as async requests... we need to wait for the I/O
	2576	* to complete before returning
	2577	*/
	2578	goto wait_for_dwrites;
	2579	}
	2580
	2581	/*
	2582	* Consider the possibility that upl_size wasn't satisfied.
	2583	*/
	2584	if (upl_size < upl_needed_size) {
	2585	if (upl_size && upl_offset == 0)
	2586	io_size = upl_size;
	2587	else
	2588	io_size = 0;
	2589	}
	2590	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
	2591	(int)upl_offset, upl_size, (int)iov_base, io_size, 0);
	2592
	2593	if (io_size == 0) {
	2594	ubc_upl_abort(upl, 0);
	2595	/*
	2596	* we may have already spun some portion of this request
	2597	* off as async requests... we need to wait for the I/O
	2598	* to complete before returning
	2599	*/
	2600	goto wait_for_dwrites;
	2601	}
	2602
	2603	if(useVectorUPL) {
	2604	vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
	2605	if(end_off)
	2606	issueVectorUPL = 1;
	2607	/*
	2608	* After this point, if we are using a vector UPL, then
	2609	* either all the UPL elements end on a page boundary OR
	2610	* this UPL is the last element because it does not end
	2611	* on a page boundary.
	2612	*/
	2613	}
	2614
	2615	/*
	2616	* we want push out these writes asynchronously so that we can overlap
	2617	* the preparation of the next I/O
	2618	* if there are already too many outstanding writes
	2619	* wait until some complete before issuing the next
	2620	*/
	2621	if (vp->v_mount->mnt_minsaturationbytecount)
	2622	bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
	2623	else
	2624	bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
	2625
	2626	cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
	2627
	2628	if (iostate.io_error) {
	2629	/*
	2630	* one of the earlier writes we issued ran into a hard error
	2631	* don't issue any more writes, cleanup the UPL
	2632	* that was just created but not used, then
	2633	* go wait for all writes that are part of this stream
	2634	* to complete before returning the error to the caller
	2635	*/
	2636	ubc_upl_abort(upl, 0);
	2637
	2638	goto wait_for_dwrites;
	2639	}
	2640
	2641	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
	2642	(int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
	2643
	2644	if(!useVectorUPL)
	2645	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
	2646	io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2647
	2648	else {
		/* first element of a new run: create the vector UPL and remember where the run starts */
	2649	if(!vector_upl_index) {
	2650	vector_upl = vector_upl_create(upl_offset);
	2651	v_upl_uio_offset = uio->uio_offset;
	2652	vector_upl_offset = upl_offset;
	2653	}
	2654
	2655	vector_upl_set_subupl(vector_upl,upl,upl_size);
	2656	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
	2657	vector_upl_index++;
	2658	vector_upl_iosize += io_size;
	2659	vector_upl_size += upl_size;
	2660
		/* flush the run when it must end here, is full, or has reached the size limit */
	2661	if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
	2662	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2663	reset_vector_run_state();
	2664	}
	2665	}
	2666
	2667	/*
	2668	* update the uio structure to
	2669	* reflect the I/O that we just issued
	2670	*/
	2671	uio_update(uio, (user_size_t)io_size);
	2672
	2673	/*
	2674	* in case we end up calling through to cluster_write_copy to finish
	2675	* the tail of this request, we need to update the oldEOF so that we
	2676	* don't zero-fill the head of a page if we've successfully written
	2677	* data to that area... 'cluster_write_copy' will zero-fill the head of a
	2678	* page that is beyond the oldEOF if the write is unaligned... we only
	2679	* want that to happen for the very first page of the cluster_write,
	2680	* NOT the first page of each vector making up a multi-vector write.
	2681	*/
	2682	if (uio->uio_offset > oldEOF)
	2683	oldEOF = uio->uio_offset;
	2684
	2685	io_req_size -= io_size;
	2686
	2687	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
	2688	(int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
	2689
	2690	} /* end while */
	2691
		/* this vector is fully consumed: see whether the next one can also go direct */
	2692	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
	2693
	2694	retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
	2695
	2696	if (retval == 0 && *write_type == IO_DIRECT) {
	2697
	2698	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
	2699	(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
	2700
	2701	goto next_dwrite;
	2702	}
	2703	}
	2704
	2705	wait_for_dwrites:
	2706
		/* a partially filled vector run is issued here, but only if nothing has failed so far */
	2707	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
	2708	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2709	reset_vector_run_state();
	2710	}
	2711	/*
	2712	* make sure all async writes issued as part of this stream
	2713	* have completed before we return
	2714	*/
	2715	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
	2716
	2717	if (iostate.io_error)
	2718	retval = iostate.io_error;
	2719
	2720	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	2721
	2722	if (io_throttled == TRUE && retval == 0)
	2723	retval = EAGAIN;
	2724
	2725	if (io_req_size && retval == 0) {
	2726	/*
	2727	* we couldn't handle the tail of this request in DIRECT mode
	2728	* so fire it through the copy path
	2729	*
	2730	* note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
	2731	* so we can just pass 0 in for the headOff and tailOff
	2732	*/
	2733	if (uio->uio_offset > oldEOF)
	2734	oldEOF = uio->uio_offset;
	2735
	2736	retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
	2737
	2738	*write_type = IO_UNKNOWN;
	2739	}
	2740	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
	2741	(int)uio->uio_offset, io_req_size, retval, 4, 0);
	2742
	2743	return (retval);
	2744	}
	2745
	2746
	2747	static int
	2748	cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
	2749	int (*callback)(buf_t, void *), void *callback_arg, int bflag)
	2750	{
		/*
		 * Write from a physically contiguous source buffer.
		 *
		 * For each vector (one pass through next_cwrite, at most MAX_VECTS
		 * passes) a UPL is created over the source with vm_map_get_upl().
		 * The transfer is then split in three:
		 *   head - bytes up to the next devblocksize boundary of the file
		 *          offset (or the whole request if it is smaller than one
		 *          block), written via cluster_align_phys_io();
		 *   body - whole device blocks, issued to cluster_io() with
		 *          CL_DEV_MEMORY | CL_ASYNC | bflag in chunks of at most
		 *          MAX_IO_CONTIG_SIZE;
		 *   tail - the remaining sub-block bytes, written via
		 *          cluster_align_phys_io() after all async writes finished.
		 *
		 * write_type / write_length are in-out: *write_length is the size
		 * of the current vector on entry; cluster_io_type() refreshes both
		 * after a vector completes with no tail, and *write_type is set to
		 * IO_UNKNOWN when the routine cannot continue in contig mode.
		 *
		 * Every UPL obtained is released with ubc_upl_abort(upl, 0) before
		 * returning.  Returns 0, EINVAL (UPL could not be obtained or was
		 * short, or the source address violates the mount's alignment
		 * mask), an error from a helper, or the async error in iostate.
		 */
	2751	upl_page_info_t *pl;
	2752	addr64_t src_paddr = 0;
	2753	upl_t upl[MAX_VECTS];
	2754	vm_offset_t upl_offset;
	2755	u_int32_t tail_size = 0;
	2756	u_int32_t io_size;
	2757	u_int32_t xsize;
	2758	upl_size_t upl_size;
	2759	vm_size_t upl_needed_size;
	2760	mach_msg_type_number_t pages_in_pl;
	2761	upl_control_flags_t upl_flags;
	2762	kern_return_t kret;
	2763	struct clios iostate;
	2764	int error = 0;
	2765	int cur_upl = 0;
	2766	int num_upl = 0;
	2767	int n;
	2768	user_addr_t iov_base;
	2769	u_int32_t devblocksize;
	2770	u_int32_t mem_alignment_mask;
	2771
	2772	/*
	2773	* When we enter this routine, we know
	2774	* -- the io_req_size will not exceed iov_len
	2775	* -- the target address is physically contiguous
	2776	*/
	2777	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
	2778
	2779	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	2780	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	2781
	2782	iostate.io_completed = 0;
	2783	iostate.io_issued = 0;
	2784	iostate.io_error = 0;
	2785	iostate.io_wanted = 0;
	2786
	2787	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	2788
	2789	next_cwrite:
	2790	io_size = *write_length;
	2791
	2792	iov_base = uio_curriovbase(uio);
	2793
	2794	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	2795	upl_needed_size = upl_offset + io_size;
	2796
	2797	pages_in_pl = 0;
	2798	upl_size = upl_needed_size;
	2799	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
	2800	UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
	2801
	2802	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	2803	kret = vm_map_get_upl(map,
	2804	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	2805	&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
	2806
	2807	if (kret != KERN_SUCCESS) {
	2808	/*
	2809	* failed to get pagelist
	2810	*/
	2811	error = EINVAL;
	2812	goto wait_for_cwrites;
	2813	}
		/* counted only once the UPL exists, so the release loop at the end covers exactly the UPLs we hold */
	2814	num_upl++;
	2815
	2816	/*
	2817	* Consider the possibility that upl_size wasn't satisfied.
	2818	*/
	2819	if (upl_size < upl_needed_size) {
	2820	/*
	2821	* This is a failure in the physical memory case.
	2822	*/
	2823	error = EINVAL;
	2824	goto wait_for_cwrites;
	2825	}
	2826	pl = ubc_upl_pageinfo(upl[cur_upl]);
	2827
		/* physical address of the first source byte: first page of the UPL plus the in-page offset */
	2828	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
	2829
		/* head: runs until the file offset is block aligned, or the request is used up */
	2830	while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
	2831	u_int32_t head_size;
	2832
	2833	head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
	2834
	2835	if (head_size > io_size)
	2836	head_size = io_size;
	2837
	2838	error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
	2839
	2840	if (error)
	2841	goto wait_for_cwrites;
	2842
	2843	upl_offset += head_size;
	2844	src_paddr += head_size;
	2845	io_size -= head_size;
	2846
	2847	iov_base += head_size;
	2848	}
	2849	if ((u_int32_t)iov_base & mem_alignment_mask) {
	2850	/*
	2851	* request doesn't set up on a memory boundary
	2852	* the underlying DMA engine can handle...
	2853	* return an error instead of going through
	2854	* the slow copy path since the intent of this
	2855	* path is direct I/O from device memory
	2856	*/
	2857	error = EINVAL;
	2858	goto wait_for_cwrites;
	2859	}
	2860
		/* tail: the sub-block remainder is set aside and written after the body completes */
	2861	tail_size = io_size & (devblocksize - 1);
	2862	io_size -= tail_size;
	2863
	2864	while (io_size && error == 0) {
	2865
	2866	if (io_size > MAX_IO_CONTIG_SIZE)
	2867	xsize = MAX_IO_CONTIG_SIZE;
	2868	else
	2869	xsize = io_size;
	2870	/*
	2871	* request asynchronously so that we can overlap
	2872	* the preparation of the next I/O... we'll do
	2873	* the commit after all the I/O has completed
	2874	* since its all issued against the same UPL
	2875	* if there are already too many outstanding writes
	2876	* wait until some have completed before issuing the next
	2877	*/
	2878	cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
	2879
	2880	if (iostate.io_error) {
	2881	/*
	2882	* one of the earlier writes we issued ran into a hard error
	2883	* don't issue any more writes...
	2884	* go wait for all writes that are part of this stream
	2885	* to complete before returning the error to the caller
	2886	*/
	2887	goto wait_for_cwrites;
	2888	}
	2889	/*
	2890	* issue an asynchronous write to cluster_io
	2891	*/
	2892	error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
	2893	xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
	2894
	2895	if (error == 0) {
	2896	/*
	2897	* The cluster_io write completed successfully,
	2898	* update the uio structure
	2899	*/
	2900	uio_update(uio, (user_size_t)xsize);
	2901
	2902	upl_offset += xsize;
	2903	src_paddr += xsize;
	2904	io_size -= xsize;
	2905	}
	2906	}
		/* continue with the next vector only if nothing failed, no tail is pending and a upl[] slot is free */
	2907	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
	2908
	2909	error = cluster_io_type(uio, write_type, write_length, 0);
	2910
	2911	if (error == 0 && *write_type == IO_CONTIG) {
	2912	cur_upl++;
	2913	goto next_cwrite;
	2914	}
	2915	} else
	2916	*write_type = IO_UNKNOWN;
	2917
	2918	wait_for_cwrites:
	2919	/*
	2920	* make sure all async writes that are part of this stream
	2921	* have completed before we proceed
	2922	*/
	2923	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
	2924
	2925	if (iostate.io_error)
	2926	error = iostate.io_error;
	2927
	2928	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	2929
	2930	if (error == 0 && tail_size)
	2931	error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
	2932
	2933	for (n = 0; n < num_upl; n++)
	2934	/*
	2935	* just release our hold on each physically contiguous
	2936	* region without changing any state
	2937	*/
	2938	ubc_upl_abort(upl[n], 0);
	2939
	2940	return (error);
	2941	}
	2942
	2943
	2944	/*
	2945	* need to avoid a race between an msync of a range of pages dirtied via mmap
	2946	* vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
	2947	* zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
	2948	*
	2949	* we should never force-zero-fill pages that are already valid in the cache...
	2950	* the entire page contains valid data (either from disk, zero-filled or dirtied
	2951	* via an mmap) so we can only do damage by trying to zero-fill
	2952	*
	2953	*/
	2954	static int
	2955	cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
	2956	{
	2957	int zero_pg_index;
	2958	boolean_t need_cluster_zero = TRUE;
	2959
	2960	if ((flags & (IO_NOZEROVALID \| IO_NOZERODIRTY))) {
	2961
	2962	bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
	2963	zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
	2964
	2965	if (upl_valid_page(pl, zero_pg_index)) {
	2966	/*
	2967	* never force zero valid pages - dirty or clean
	2968	* we'll leave these in the UPL for cluster_write_copy to deal with
	2969	*/
	2970	need_cluster_zero = FALSE;
	2971	}
	2972	}
	2973	if (need_cluster_zero == TRUE)
	2974	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2975
	2976	return (bytes_to_zero);
	2977	}
	2978
	2979
	2980	static int
	2981	cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
	2982	off_t tailOff, int flags, int (callback)(buf_t, void ), void *callback_arg)
	2983	{
	2984	upl_page_info_t *pl;
	2985	upl_t upl;
	2986	vm_offset_t upl_offset = 0;
	2987	vm_size_t upl_size;
	2988	off_t upl_f_offset;
	2989	int pages_in_upl;
	2990	int start_offset;
	2991	int xfer_resid;
	2992	int io_size;
	2993	int io_offset;
	2994	int bytes_to_zero;
	2995	int bytes_to_move;
	2996	kern_return_t kret;
	2997	int retval = 0;
	2998	int io_resid;
	2999	long long total_size;
	3000	long long zero_cnt;
	3001	off_t zero_off;
	3002	long long zero_cnt1;
	3003	off_t zero_off1;
	3004	off_t write_off = 0;
	3005	int write_cnt = 0;
	3006	boolean_t first_pass = FALSE;
	3007	struct cl_extent cl;
	3008	struct cl_writebehind *wbp;
	3009	int bflag;
	3010	u_int max_cluster_pgcount;
	3011	u_int max_io_size;
	3012
	3013	if (uio) {
	3014	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	3015	(int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
	3016
	3017	io_resid = io_req_size;
	3018	} else {
	3019	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	3020	0, 0, (int)oldEOF, (int)newEOF, 0);
	3021
	3022	io_resid = 0;
	3023	}
	3024	if (flags & IO_PASSIVE)
	3025	bflag = CL_PASSIVE;
	3026	else
	3027	bflag = 0;
	3028	if (flags & IO_NOCACHE)
	3029	bflag \|= CL_NOCACHE;
	3030
	3031	if (flags & IO_SKIP_ENCRYPTION)
	3032	bflag \|= CL_ENCRYPTED;
	3033
	3034	zero_cnt = 0;
	3035	zero_cnt1 = 0;
	3036	zero_off = 0;
	3037	zero_off1 = 0;
	3038
	3039	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	3040	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
	3041
	3042	if (flags & IO_HEADZEROFILL) {
	3043	/*
	3044	* some filesystems (HFS is one) don't support unallocated holes within a file...
	3045	* so we zero fill the intervening space between the old EOF and the offset
	3046	* where the next chunk of real data begins.... ftruncate will also use this
	3047	* routine to zero fill to the new EOF when growing a file... in this case, the
	3048	* uio structure will not be provided
	3049	*/
	3050	if (uio) {
	3051	if (headOff < uio->uio_offset) {
	3052	zero_cnt = uio->uio_offset - headOff;
	3053	zero_off = headOff;
	3054	}
	3055	} else if (headOff < newEOF) {
	3056	zero_cnt = newEOF - headOff;
	3057	zero_off = headOff;
	3058	}
	3059	} else {
	3060	if (uio && uio->uio_offset > oldEOF) {
	3061	zero_off = uio->uio_offset & ~PAGE_MASK_64;
	3062
	3063	if (zero_off >= oldEOF) {
	3064	zero_cnt = uio->uio_offset - zero_off;
	3065
	3066	flags \|= IO_HEADZEROFILL;
	3067	}
	3068	}
	3069	}
	3070	if (flags & IO_TAILZEROFILL) {
	3071	if (uio) {
	3072	zero_off1 = uio->uio_offset + io_req_size;
	3073
	3074	if (zero_off1 < tailOff)
	3075	zero_cnt1 = tailOff - zero_off1;
	3076	}
	3077	} else {
	3078	if (uio && newEOF > oldEOF) {
	3079	zero_off1 = uio->uio_offset + io_req_size;
	3080
	3081	if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
	3082	zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
	3083
	3084	flags \|= IO_TAILZEROFILL;
	3085	}
	3086	}
	3087	}
	3088	if (zero_cnt == 0 && uio == (struct uio *) 0) {
	3089	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END,
	3090	retval, 0, 0, 0, 0);
	3091	return (0);
	3092	}
	3093	if (uio) {
	3094	write_off = uio->uio_offset;
	3095	write_cnt = uio_resid(uio);
	3096	/*
	3097	* delay updating the sequential write info
	3098	* in the control block until we've obtained
	3099	* the lock for it
	3100	*/
	3101	first_pass = TRUE;
	3102	}
	3103	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
	3104	/*
	3105	* for this iteration of the loop, figure out where our starting point is
	3106	*/
	3107	if (zero_cnt) {
	3108	start_offset = (int)(zero_off & PAGE_MASK_64);
	3109	upl_f_offset = zero_off - start_offset;
	3110	} else if (io_resid) {
	3111	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3112	upl_f_offset = uio->uio_offset - start_offset;
	3113	} else {
	3114	start_offset = (int)(zero_off1 & PAGE_MASK_64);
	3115	upl_f_offset = zero_off1 - start_offset;
	3116	}
	3117	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) \| DBG_FUNC_NONE,
	3118	(int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
	3119
	3120	if (total_size > max_io_size)
	3121	total_size = max_io_size;
	3122
	3123	cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
	3124
	3125	if (uio && ((flags & (IO_SYNC \| IO_HEADZEROFILL \| IO_TAILZEROFILL)) == 0)) {
	3126	/*
	3127	* assumption... total_size <= io_resid
	3128	* because IO_HEADZEROFILL and IO_TAILZEROFILL not set
	3129	*/
	3130	if ((start_offset + total_size) > max_io_size)
	3131	total_size = max_io_size - start_offset;
	3132	xfer_resid = total_size;
	3133
	3134	retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
	3135
	3136	if (retval)
	3137	break;
	3138
	3139	io_resid -= (total_size - xfer_resid);
	3140	total_size = xfer_resid;
	3141	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3142	upl_f_offset = uio->uio_offset - start_offset;
	3143
	3144	if (total_size == 0) {
	3145	if (start_offset) {
	3146	/*
	3147	* the write did not finish on a page boundary
	3148	* which will leave upl_f_offset pointing to the
	3149	* beginning of the last page written instead of
	3150	* the page beyond it... bump it in this case
	3151	* so that the cluster code records the last page
	3152	* written as dirty
	3153	*/
	3154	upl_f_offset += PAGE_SIZE_64;
	3155	}
	3156	upl_size = 0;
	3157
	3158	goto check_cluster;
	3159	}
	3160	}
	3161	/*
	3162	* compute the size of the upl needed to encompass
	3163	* the requested write... limit each call to cluster_io
	3164	* to the maximum UPL size... cluster_io will clip if
	3165	* this exceeds the maximum io_size for the device,
	3166	* make sure to account for
	3167	* a starting offset that's not page aligned
	3168	*/
	3169	upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	3170
	3171	if (upl_size > max_io_size)
	3172	upl_size = max_io_size;
	3173
	3174	pages_in_upl = upl_size / PAGE_SIZE;
	3175	io_size = upl_size - start_offset;
	3176
	3177	if ((long long)io_size > total_size)
	3178	io_size = total_size;
	3179
	3180	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
	3181
	3182
	3183	/*
	3184	* Gather the pages from the buffer cache.
	3185	* The UPL_WILL_MODIFY flag lets the UPL subsystem know
	3186	* that we intend to modify these pages.
	3187	*/
	3188	kret = ubc_create_upl_kernel(vp,
	3189	upl_f_offset,
	3190	upl_size,
	3191	&upl,
	3192	&pl,
	3193	UPL_SET_LITE \| (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
	3194	VM_KERN_MEMORY_FILE);
	3195	if (kret != KERN_SUCCESS)
	3196	panic("cluster_write_copy: failed to get pagelist");
	3197
	3198	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END,
	3199	upl, (int)upl_f_offset, start_offset, 0, 0);
	3200
	3201	if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
	3202	int read_size;
	3203
	3204	/*
	3205	* we're starting in the middle of the first page of the upl
	3206	* and the page isn't currently valid, so we're going to have
	3207	* to read it in first... this is a synchronous operation
	3208	*/
	3209	read_size = PAGE_SIZE;
	3210
	3211	if ((upl_f_offset + read_size) > oldEOF)
	3212	read_size = oldEOF - upl_f_offset;
	3213
	3214	retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
	3215	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	3216	if (retval) {
	3217	/*
	3218	* we had an error during the read which causes us to abort
	3219	* the current cluster_write request... before we do, we need
	3220	* to release the rest of the pages in the upl without modifying
	3221	* there state and mark the failed page in error
	3222	*/
	3223	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES\|UPL_ABORT_FREE_ON_EMPTY);
	3224
	3225	if (upl_size > PAGE_SIZE)
	3226	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3227
	3228	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3229	upl, 0, 0, retval, 0);
	3230	break;
	3231	}
	3232	}
	3233	if ((start_offset == 0 \|\| upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
	3234	/*
	3235	* the last offset we're writing to in this upl does not end on a page
	3236	* boundary... if it's not beyond the old EOF, then we'll also need to
	3237	* pre-read this page in if it isn't already valid
	3238	*/
	3239	upl_offset = upl_size - PAGE_SIZE;
	3240
	3241	if ((upl_f_offset + start_offset + io_size) < oldEOF &&
	3242	!upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
	3243	int read_size;
	3244
	3245	read_size = PAGE_SIZE;
	3246
	3247	if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
	3248	read_size = oldEOF - (upl_f_offset + upl_offset);
	3249
	3250	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
	3251	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	3252	if (retval) {
	3253	/*
	3254	* we had an error during the read which causes us to abort
	3255	* the current cluster_write request... before we do, we
	3256	* need to release the rest of the pages in the upl without
	3257	* modifying there state and mark the failed page in error
	3258	*/
	3259	ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES\|UPL_ABORT_FREE_ON_EMPTY);
	3260
	3261	if (upl_size > PAGE_SIZE)
	3262	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3263
	3264	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3265	upl, 0, 0, retval, 0);
	3266	break;
	3267	}
	3268	}
	3269	}
	3270	xfer_resid = io_size;
	3271	io_offset = start_offset;
	3272
	3273	while (zero_cnt && xfer_resid) {
	3274
	3275	if (zero_cnt < (long long)xfer_resid)
	3276	bytes_to_zero = zero_cnt;
	3277	else
	3278	bytes_to_zero = xfer_resid;
	3279
	3280	bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
	3281
	3282	xfer_resid -= bytes_to_zero;
	3283	zero_cnt -= bytes_to_zero;
	3284	zero_off += bytes_to_zero;
	3285	io_offset += bytes_to_zero;
	3286	}
	3287	if (xfer_resid && io_resid) {
	3288	u_int32_t io_requested;
	3289
	3290	bytes_to_move = min(io_resid, xfer_resid);
	3291	io_requested = bytes_to_move;
	3292
	3293	retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
	3294
	3295	if (retval) {
	3296	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	3297
	3298	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3299	upl, 0, 0, retval, 0);
	3300	} else {
	3301	io_resid -= bytes_to_move;
	3302	xfer_resid -= bytes_to_move;
	3303	io_offset += bytes_to_move;
	3304	}
	3305	}
	3306	while (xfer_resid && zero_cnt1 && retval == 0) {
	3307
	3308	if (zero_cnt1 < (long long)xfer_resid)
	3309	bytes_to_zero = zero_cnt1;
	3310	else
	3311	bytes_to_zero = xfer_resid;
	3312
	3313	bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
	3314
	3315	xfer_resid -= bytes_to_zero;
	3316	zero_cnt1 -= bytes_to_zero;
	3317	zero_off1 += bytes_to_zero;
	3318	io_offset += bytes_to_zero;
	3319	}
	3320	if (retval == 0) {
	3321	int cl_index;
	3322	int ret_cluster_try_push;
	3323
	3324	io_size += start_offset;
	3325
	3326	if (newEOF >= oldEOF && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
	3327	/*
	3328	* if we're extending the file with this write
	3329	* we'll zero fill the rest of the page so that
	3330	* if the file gets extended again in such a way as to leave a
	3331	* hole starting at this EOF, we'll have zero's in the correct spot
	3332	*/
	3333	cluster_zero(upl, io_size, upl_size - io_size, NULL);
	3334	}
	3335	/*
	3336	* release the upl now if we hold one since...
	3337	* 1) pages in it may be present in the sparse cluster map
	3338	* and may span 2 separate buckets there... if they do and
	3339	* we happen to have to flush a bucket to make room and it intersects
	3340	* this upl, a deadlock may result on page BUSY
	3341	* 2) we're delaying the I/O... from this point forward we're just updating
	3342	* the cluster state... no need to hold the pages, so commit them
	3343	* 3) IO_SYNC is set...
	3344	* because we had to ask for a UPL that provides currenty non-present pages, the
	3345	* UPL has been automatically set to clear the dirty flags (both software and hardware)
	3346	* upon committing it... this is not the behavior we want since it's possible for
	3347	* pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
	3348	* we'll pick these pages back up later with the correct behavior specified.
	3349	* 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
	3350	* of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
	3351	* we hold since the flushing context is holding the cluster lock.
	3352	*/
	3353	ubc_upl_commit_range(upl, 0, upl_size,
	3354	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	3355	check_cluster:
	3356	/*
	3357	* calculate the last logical block number
	3358	* that this delayed I/O encompassed
	3359	*/
	3360	cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
	3361
	3362	if (flags & IO_SYNC) {
	3363	/*
	3364	* if the IO_SYNC flag is set than we need to
	3365	* bypass any clusters and immediately issue
	3366	* the I/O
	3367	*/
	3368	goto issue_io;
	3369	}
	3370	/*
	3371	* take the lock to protect our accesses
	3372	* of the writebehind and sparse cluster state
	3373	*/
	3374	wbp = cluster_get_wbp(vp, CLW_ALLOCATE \| CLW_RETURNLOCKED);
	3375
	3376	if (wbp->cl_scmap) {
	3377
	3378	if ( !(flags & IO_NOCACHE)) {
	3379	/*
	3380	* we've fallen into the sparse
	3381	* cluster method of delaying dirty pages
	3382	*/
	3383	sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
	3384
	3385	lck_mtx_unlock(&wbp->cl_lockw);
	3386
	3387	continue;
	3388	}
	3389	/*
	3390	* must have done cached writes that fell into
	3391	* the sparse cluster mechanism... we've switched
	3392	* to uncached writes on the file, so go ahead
	3393	* and push whatever's in the sparse map
	3394	* and switch back to normal clustering
	3395	*/
	3396	wbp->cl_number = 0;
	3397
	3398	sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
	3399	/*
	3400	* no clusters of either type present at this point
	3401	* so just go directly to start_new_cluster since
	3402	* we know we need to delay this I/O since we've
	3403	* already released the pages back into the cache
	3404	* to avoid the deadlock with sparse_cluster_push
	3405	*/
	3406	goto start_new_cluster;
	3407	}
	3408	if (first_pass) {
	3409	if (write_off == wbp->cl_last_write)
	3410	wbp->cl_seq_written += write_cnt;
	3411	else
	3412	wbp->cl_seq_written = write_cnt;
	3413
	3414	wbp->cl_last_write = write_off + write_cnt;
	3415
	3416	first_pass = FALSE;
	3417	}
	3418	if (wbp->cl_number == 0)
	3419	/*
	3420	* no clusters currently present
	3421	*/
	3422	goto start_new_cluster;
	3423
	3424	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	3425	/*
	3426	* check each cluster that we currently hold
	3427	* try to merge some or all of this write into
	3428	* one or more of the existing clusters... if
	3429	* any portion of the write remains, start a
	3430	* new cluster
	3431	*/
	3432	if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
	3433	/*
	3434	* the current write starts at or after the current cluster
	3435	*/
	3436	if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
	3437	/*
	3438	* we have a write that fits entirely
	3439	* within the existing cluster limits
	3440	*/
	3441	if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
	3442	/*
	3443	* update our idea of where the cluster ends
	3444	*/
	3445	wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
	3446	break;
	3447	}
	3448	if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
	3449	/*
	3450	* we have a write that starts in the middle of the current cluster
	3451	* but extends beyond the cluster's limit... we know this because
	3452	* of the previous checks
	3453	* we'll extend the current cluster to the max
	3454	* and update the b_addr for the current write to reflect that
	3455	* the head of it was absorbed into this cluster...
	3456	* note that we'll always have a leftover tail in this case since
	3457	* full absorbtion would have occurred in the clause above
	3458	*/
	3459	wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
	3460
	3461	cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
	3462	}
	3463	/*
	3464	* we come here for the case where the current write starts
	3465	* beyond the limit of the existing cluster or we have a leftover
	3466	* tail after a partial absorbtion
	3467	*
	3468	* in either case, we'll check the remaining clusters before
	3469	* starting a new one
	3470	*/
	3471	} else {
	3472	/*
	3473	* the current write starts in front of the cluster we're currently considering
	3474	*/
	3475	if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
	3476	/*
	3477	* we can just merge the new request into
	3478	* this cluster and leave it in the cache
	3479	* since the resulting cluster is still
	3480	* less than the maximum allowable size
	3481	*/
	3482	wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
	3483
	3484	if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
	3485	/*
	3486	* the current write completely
	3487	* envelops the existing cluster and since
	3488	* each write is limited to at most max_cluster_pgcount pages
	3489	* we can just use the start and last blocknos of the write
	3490	* to generate the cluster limits
	3491	*/
	3492	wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
	3493	}
	3494	break;
	3495	}
	3496
	3497	/*
	3498	* if we were to combine this write with the current cluster
	3499	* we would exceed the cluster size limit.... so,
	3500	* let's see if there's any overlap of the new I/O with
	3501	* the cluster we're currently considering... in fact, we'll
	3502	* stretch the cluster out to it's full limit and see if we
	3503	* get an intersection with the current write
	3504	*
	3505	*/
	3506	if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
	3507	/*
	3508	* the current write extends into the proposed cluster
	3509	* clip the length of the current write after first combining it's
	3510	* tail with the newly shaped cluster
	3511	*/
	3512	wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
	3513
	3514	cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
	3515	}
	3516	/*
	3517	* if we get here, there was no way to merge
	3518	* any portion of this write with this cluster
	3519	* or we could only merge part of it which
	3520	* will leave a tail...
	3521	* we'll check the remaining clusters before starting a new one
	3522	*/
	3523	}
	3524	}
	3525	if (cl_index < wbp->cl_number)
	3526	/*
	3527	* we found an existing cluster(s) that we
	3528	* could entirely merge this I/O into
	3529	*/
	3530	goto delay_io;
	3531
	3532	if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
	3533	wbp->cl_number == MAX_CLUSTERS &&
	3534	wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
	3535	uint32_t n;
	3536
	3537	if (vp->v_mount->mnt_minsaturationbytecount) {
	3538	n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
	3539
	3540	if (n > MAX_CLUSTERS)
	3541	n = MAX_CLUSTERS;
	3542	} else
	3543	n = 0;
	3544
	3545	if (n == 0) {
	3546	if (disk_conditioner_mount_is_ssd(vp->v_mount))
	3547	n = WRITE_BEHIND_SSD;
	3548	else
	3549	n = WRITE_BEHIND;
	3550	}
	3551	while (n--)
	3552	cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL);
	3553	}
	3554	if (wbp->cl_number < MAX_CLUSTERS) {
	3555	/*
	3556	* we didn't find an existing cluster to
	3557	* merge into, but there's room to start
	3558	* a new one
	3559	*/
	3560	goto start_new_cluster;
	3561	}
	3562	/*
	3563	* no exisitng cluster to merge with and no
	3564	* room to start a new one... we'll try
	3565	* pushing one of the existing ones... if none of
	3566	* them are able to be pushed, we'll switch
	3567	* to the sparse cluster mechanism
	3568	* cluster_try_push updates cl_number to the
	3569	* number of remaining clusters... and
	3570	* returns the number of currently unused clusters
	3571	*/
	3572	ret_cluster_try_push = 0;
	3573
	3574	/*
	3575	* if writes are not deferred, call cluster push immediately
	3576	*/
	3577	if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
	3578
	3579	ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL);
	3580	}
	3581
	3582	/*
	3583	* execute following regardless of writes being deferred or not
	3584	*/
	3585	if (ret_cluster_try_push == 0) {
	3586	/*
	3587	* no more room in the normal cluster mechanism
	3588	* so let's switch to the more expansive but expensive
	3589	* sparse mechanism....
	3590	*/
	3591	sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
	3592	sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
	3593
	3594	lck_mtx_unlock(&wbp->cl_lockw);
	3595
	3596	continue;
	3597	}
	3598	start_new_cluster:
	3599	wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
	3600	wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
	3601
	3602	wbp->cl_clusters[wbp->cl_number].io_flags = 0;
	3603
	3604	if (flags & IO_NOCACHE)
	3605	wbp->cl_clusters[wbp->cl_number].io_flags \|= CLW_IONOCACHE;
	3606
	3607	if (bflag & CL_PASSIVE)
	3608	wbp->cl_clusters[wbp->cl_number].io_flags \|= CLW_IOPASSIVE;
	3609
	3610	wbp->cl_number++;
	3611	delay_io:
	3612	lck_mtx_unlock(&wbp->cl_lockw);
	3613
	3614	continue;
	3615	issue_io:
	3616	/*
	3617	* we don't hold the lock at this point
	3618	*
	3619	* we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
	3620	* so that we correctly deal with a change in state of the hardware modify bit...
	3621	* we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
	3622	* cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
	3623	* responsible for generating the correct sized I/O(s)
	3624	*/
	3625	retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
	3626	}
	3627	}
	3628	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END, retval, 0, io_resid, 0, 0);
	3629
	3630	return (retval);
	3631	}
	3632
	3633
	3634
	3635	int
	3636	cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
	3637	{
	3638	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
	3639	}
	3640
	3641
	3642	int
	3643	cluster_read_ext(vnode_t vp, struct uio uio, off_t filesize, int xflags, int (callback)(buf_t, void ), void callback_arg)
	3644	{
	3645	int retval = 0;
	3646	int flags;
	3647	user_ssize_t cur_resid;
	3648	u_int32_t io_size;
	3649	u_int32_t read_length = 0;
	3650	int read_type = IO_COPY;
	3651
	3652	flags = xflags;
	3653
	3654	if (vp->v_flag & VNOCACHE_DATA)
	3655	flags \|= IO_NOCACHE;
	3656	if ((vp->v_flag & VRAOFF) \|\| speculative_reads_disabled)
	3657	flags \|= IO_RAOFF;
	3658
	3659	if (flags & IO_SKIP_ENCRYPTION)
	3660	flags \|= IO_ENCRYPTED;
	3661
	3662	/*
	3663	* do a read through the cache if one of the following is true....
	3664	* NOCACHE is not true
	3665	* the uio request doesn't target USERSPACE
	3666	* Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	3667	* Reading encrypted data from a CP filesystem should never result in the data touching
	3668	* the UBC.
	3669	*
	3670	* otherwise, find out if we want the direct or contig variant for
	3671	* the first vector in the uio request
	3672	*/
	3673	if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) \|\| (flags & IO_ENCRYPTED) ) {
	3674
	3675	retval = cluster_io_type(uio, &read_type, &read_length, 0);
	3676	}
	3677
	3678	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
	3679
	3680	switch (read_type) {
	3681
	3682	case IO_COPY:
	3683	/*
	3684	* make sure the uio_resid isn't too big...
	3685	* internally, we want to handle all of the I/O in
	3686	* chunk sizes that fit in a 32 bit int
	3687	*/
	3688	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
	3689	io_size = MAX_IO_REQUEST_SIZE;
	3690	else
	3691	io_size = (u_int32_t)cur_resid;
	3692
	3693	retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
	3694	break;
	3695
	3696	case IO_DIRECT:
	3697	retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
	3698	break;
	3699
	3700	case IO_CONTIG:
	3701	retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
	3702	break;
	3703
	3704	case IO_UNKNOWN:
	3705	retval = cluster_io_type(uio, &read_type, &read_length, 0);
	3706	break;
	3707	}
	3708	}
	3709	return (retval);
	3710	}
	3711
	3712
	3713
	3714	static void
	3715	cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
	3716	{
	3717	int range;
	3718	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	3719
	3720	if ((range = last_pg - start_pg)) {
	3721	if (take_reference)
	3722	abort_flags \|= UPL_ABORT_REFERENCE;
	3723
	3724	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
	3725	}
	3726	}
	3727
	3728
	3729	static int
	3730	cluster_read_copy(vnode_t vp, struct uio uio, u_int32_t io_req_size, off_t filesize, int flags, int (callback)(buf_t, void ), void callback_arg)
	3731	{
	3732	upl_page_info_t *pl;
	3733	upl_t upl;
	3734	vm_offset_t upl_offset;
	3735	u_int32_t upl_size;
	3736	off_t upl_f_offset;
	3737	int start_offset;
	3738	int start_pg;
	3739	int last_pg;
	3740	int uio_last = 0;
	3741	int pages_in_upl;
	3742	off_t max_size;
	3743	off_t last_ioread_offset;
	3744	off_t last_request_offset;
	3745	kern_return_t kret;
	3746	int error = 0;
	3747	int retval = 0;
	3748	u_int32_t size_of_prefetch;
	3749	u_int32_t xsize;
	3750	u_int32_t io_size;
	3751	u_int32_t max_rd_size;
	3752	u_int32_t max_io_size;
	3753	u_int32_t max_prefetch;
	3754	u_int rd_ahead_enabled = 1;
	3755	u_int prefetch_enabled = 1;
	3756	struct cl_readahead * rap;
	3757	struct clios iostate;
	3758	struct cl_extent extent;
	3759	int bflag;
	3760	int take_reference = 1;
	3761	int policy = IOPOL_DEFAULT;
	3762	boolean_t iolock_inited = FALSE;
	3763
	3764	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_START,
	3765	(int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
	3766
	3767	if (flags & IO_ENCRYPTED) {
	3768	panic ("encrypted blocks will hit UBC!");
	3769	}
	3770
	3771	policy = throttle_get_io_policy(NULL);
	3772
	3773	if (policy == THROTTLE_LEVEL_TIER3 \|\| policy == THROTTLE_LEVEL_TIER2 \|\| (flags & IO_NOCACHE))
	3774	take_reference = 0;
	3775
	3776	if (flags & IO_PASSIVE)
	3777	bflag = CL_PASSIVE;
	3778	else
	3779	bflag = 0;
	3780
	3781	if (flags & IO_NOCACHE)
	3782	bflag \|= CL_NOCACHE;
	3783
	3784	if (flags & IO_SKIP_ENCRYPTION)
	3785	bflag \|= CL_ENCRYPTED;
	3786
	3787	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	3788	max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
	3789	max_rd_size = max_prefetch;
	3790
	3791	last_request_offset = uio->uio_offset + io_req_size;
	3792
	3793	if (last_request_offset > filesize)
	3794	last_request_offset = filesize;
	3795
	3796	if ((flags & (IO_RAOFF\|IO_NOCACHE)) \|\| ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
	3797	rd_ahead_enabled = 0;
	3798	rap = NULL;
	3799	} else {
	3800	if (cluster_is_throttled(vp)) {
	3801	/*
	3802	* we're in the throttle window, at the very least
	3803	* we want to limit the size of the I/O we're about
	3804	* to issue
	3805	*/
	3806	rd_ahead_enabled = 0;
	3807	prefetch_enabled = 0;
	3808
	3809	max_rd_size = THROTTLE_MAX_IOSIZE;
	3810	}
	3811	if ((rap = cluster_get_rap(vp)) == NULL)
	3812	rd_ahead_enabled = 0;
	3813	else {
	3814	extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
	3815	extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
	3816	}
	3817	}
	3818	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr \|\| (rap->cl_lastr + 1) == extent.b_addr)) {
	3819	/*
	3820	* determine if we already have a read-ahead in the pipe courtesy of the
	3821	* last read systemcall that was issued...
	3822	* if so, pick up it's extent to determine where we should start
	3823	* with respect to any read-ahead that might be necessary to
	3824	* garner all the data needed to complete this read systemcall
	3825	*/
	3826	last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
	3827
	3828	if (last_ioread_offset < uio->uio_offset)
	3829	last_ioread_offset = (off_t)0;
	3830	else if (last_ioread_offset > last_request_offset)
	3831	last_ioread_offset = last_request_offset;
	3832	} else
	3833	last_ioread_offset = (off_t)0;
	3834
	3835	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
	3836
	3837	max_size = filesize - uio->uio_offset;
	3838
	3839	if ((off_t)(io_req_size) < max_size)
	3840	io_size = io_req_size;
	3841	else
	3842	io_size = max_size;
	3843
	3844	if (!(flags & IO_NOCACHE)) {
	3845
	3846	while (io_size) {
	3847	u_int32_t io_resid;
	3848	u_int32_t io_requested;
	3849
	3850	/*
	3851	* if we keep finding the pages we need already in the cache, then
	3852	* don't bother to call cluster_read_prefetch since it costs CPU cycles
	3853	* to determine that we have all the pages we need... once we miss in
	3854	* the cache and have issued an I/O, than we'll assume that we're likely
	3855	* to continue to miss in the cache and it's to our advantage to try and prefetch
	3856	*/
	3857	if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
	3858	if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
	3859	/*
	3860	* we've already issued I/O for this request and
	3861	* there's still work to do and
	3862	* our prefetch stream is running dry, so issue a
	3863	* pre-fetch I/O... the I/O latency will overlap
	3864	* with the copying of the data
	3865	*/
	3866	if (size_of_prefetch > max_rd_size)
	3867	size_of_prefetch = max_rd_size;
	3868
	3869	size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
	3870
	3871	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	3872
	3873	if (last_ioread_offset > last_request_offset)
	3874	last_ioread_offset = last_request_offset;
	3875	}
	3876	}
	3877	/*
	3878	* limit the size of the copy we're about to do so that
	3879	* we can notice that our I/O pipe is running dry and
	3880	* get the next I/O issued before it does go dry
	3881	*/
	3882	if (last_ioread_offset && io_size > (max_io_size / 4))
	3883	io_resid = (max_io_size / 4);
	3884	else
	3885	io_resid = io_size;
	3886
	3887	io_requested = io_resid;
	3888
	3889	retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
	3890
	3891	xsize = io_requested - io_resid;
	3892
	3893	io_size -= xsize;
	3894	io_req_size -= xsize;
	3895
	3896	if (retval \|\| io_resid)
	3897	/*
	3898	* if we run into a real error or
	3899	* a page that is not in the cache
	3900	* we need to leave streaming mode
	3901	*/
	3902	break;
	3903
	3904	if (rd_ahead_enabled && (io_size == 0 \|\| last_ioread_offset == last_request_offset)) {
	3905	/*
	3906	* we're already finished the I/O for this read request
	3907	* let's see if we should do a read-ahead
	3908	*/
	3909	cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
	3910	}
	3911	}
	3912	if (retval)
	3913	break;
	3914	if (io_size == 0) {
	3915	if (rap != NULL) {
	3916	if (extent.e_addr < rap->cl_lastr)
	3917	rap->cl_maxra = 0;
	3918	rap->cl_lastr = extent.e_addr;
	3919	}
	3920	break;
	3921	}
	3922	/*
	3923	* recompute max_size since cluster_copy_ubc_data_internal
	3924	* may have advanced uio->uio_offset
	3925	*/
	3926	max_size = filesize - uio->uio_offset;
	3927	}
	3928
	3929	iostate.io_completed = 0;
	3930	iostate.io_issued = 0;
	3931	iostate.io_error = 0;
	3932	iostate.io_wanted = 0;
	3933
	3934	if ( (flags & IO_RETURN_ON_THROTTLE) ) {
	3935	if (cluster_is_throttled(vp) == THROTTLE_NOW) {
	3936	if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
	3937	/*
	3938	* we're in the throttle window and at least 1 I/O
	3939	* has already been issued by a throttleable thread
	3940	* in this window, so return with EAGAIN to indicate
	3941	* to the FS issuing the cluster_read call that it
	3942	* should now throttle after dropping any locks
	3943	*/
	3944	throttle_info_update_by_mount(vp->v_mount);
	3945
	3946	retval = EAGAIN;
	3947	break;
	3948	}
	3949	}
	3950	}
	3951
	3952	/*
	3953	* compute the size of the upl needed to encompass
	3954	* the requested read... limit each call to cluster_io
	3955	* to the maximum UPL size... cluster_io will clip if
	3956	* this exceeds the maximum io_size for the device,
	3957	* make sure to account for
	3958	* a starting offset that's not page aligned
	3959	*/
	3960	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3961	upl_f_offset = uio->uio_offset - (off_t)start_offset;
	3962
	3963	if (io_size > max_rd_size)
	3964	io_size = max_rd_size;
	3965
	3966	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	3967
	3968	if (flags & IO_NOCACHE) {
	3969	if (upl_size > max_io_size)
	3970	upl_size = max_io_size;
	3971	} else {
	3972	if (upl_size > max_io_size / 4) {
	3973	upl_size = max_io_size / 4;
	3974	upl_size &= ~PAGE_MASK;
	3975
	3976	if (upl_size == 0)
	3977	upl_size = PAGE_SIZE;
	3978	}
	3979	}
	3980	pages_in_upl = upl_size / PAGE_SIZE;
	3981
	3982	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_START,
	3983	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	3984
	3985	kret = ubc_create_upl_kernel(vp,
	3986	upl_f_offset,
	3987	upl_size,
	3988	&upl,
	3989	&pl,
	3990	UPL_FILE_IO \| UPL_SET_LITE,
	3991	VM_KERN_MEMORY_FILE);
	3992	if (kret != KERN_SUCCESS)
	3993	panic("cluster_read_copy: failed to get pagelist");
	3994
	3995	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_END,
	3996	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	3997
	3998	/*
	3999	* scan from the beginning of the upl looking for the first
	4000	* non-valid page.... this will become the first page in
	4001	* the request we're going to make to 'cluster_io'... if all
	4002	* of the pages are valid, we won't call through to 'cluster_io'
	4003	*/
	4004	for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
	4005	if (!upl_valid_page(pl, start_pg))
	4006	break;
	4007	}
	4008
	4009	/*
	4010	* scan from the starting invalid page looking for a valid
	4011	* page before the end of the upl is reached, if we
	4012	* find one, then it will be the last page of the request to
	4013	* 'cluster_io'
	4014	*/
	4015	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	4016	if (upl_valid_page(pl, last_pg))
	4017	break;
	4018	}
	4019
	4020	if (start_pg < last_pg) {
	4021	/*
	4022	* we found a range of 'invalid' pages that must be filled
	4023	* if the last page in this range is the last page of the file
	4024	* we may have to clip the size of it to keep from reading past
	4025	* the end of the last physical block associated with the file
	4026	*/
	4027	if (iolock_inited == FALSE) {
	4028	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	4029
	4030	iolock_inited = TRUE;
	4031	}
	4032	upl_offset = start_pg * PAGE_SIZE;
	4033	io_size = (last_pg - start_pg) * PAGE_SIZE;
	4034
	4035	if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
	4036	io_size = filesize - (upl_f_offset + upl_offset);
	4037
	4038	/*
	4039	* issue an asynchronous read to cluster_io
	4040	*/
	4041
	4042	error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
	4043	io_size, CL_READ \| CL_ASYNC \| bflag, (buf_t)NULL, &iostate, callback, callback_arg);
	4044
	4045	if (rap) {
	4046	if (extent.e_addr < rap->cl_maxra) {
	4047	/*
	4048	* we've just issued a read for a block that should have been
	4049	* in the cache courtesy of the read-ahead engine... something
	4050	* has gone wrong with the pipeline, so reset the read-ahead
	4051	* logic which will cause us to restart from scratch
	4052	*/
	4053	rap->cl_maxra = 0;
	4054	}
	4055	}
	4056	}
	4057	if (error == 0) {
	4058	/*
	4059	* if the read completed successfully, or there was no I/O request
	4060	* issued, than copy the data into user land via 'cluster_upl_copy_data'
	4061	* we'll first add on any 'valid'
	4062	* pages that were present in the upl when we acquired it.
	4063	*/
	4064	u_int val_size;
	4065
	4066	for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
	4067	if (!upl_valid_page(pl, uio_last))
	4068	break;
	4069	}
	4070	if (uio_last < pages_in_upl) {
	4071	/*
	4072	* there were some invalid pages beyond the valid pages
	4073	* that we didn't issue an I/O for, just release them
	4074	* unchanged now, so that any prefetch/readahed can
	4075	* include them
	4076	*/
	4077	ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
	4078	(pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	4079	}
	4080
	4081	/*
	4082	* compute size to transfer this round, if io_req_size is
	4083	* still non-zero after this attempt, we'll loop around and
	4084	* set up for another I/O.
	4085	*/
	4086	val_size = (uio_last * PAGE_SIZE) - start_offset;
	4087
	4088	if (val_size > max_size)
	4089	val_size = max_size;
	4090
	4091	if (val_size > io_req_size)
	4092	val_size = io_req_size;
	4093
	4094	if ((uio->uio_offset + val_size) > last_ioread_offset)
	4095	last_ioread_offset = uio->uio_offset + val_size;
	4096
	4097	if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
	4098
	4099	if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
	4100	/*
	4101	* if there's still I/O left to do for this request, and...
	4102	* we're not in hard throttle mode, and...
	4103	* we're close to using up the previous prefetch, then issue a
	4104	* new pre-fetch I/O... the I/O latency will overlap
	4105	* with the copying of the data
	4106	*/
	4107	if (size_of_prefetch > max_rd_size)
	4108	size_of_prefetch = max_rd_size;
	4109
	4110	size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
	4111
	4112	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	4113
	4114	if (last_ioread_offset > last_request_offset)
	4115	last_ioread_offset = last_request_offset;
	4116	}
	4117
	4118	} else if ((uio->uio_offset + val_size) == last_request_offset) {
	4119	/*
	4120	* this transfer will finish this request, so...
	4121	* let's try to read ahead if we're in
	4122	* a sequential access pattern and we haven't
	4123	* explicitly disabled it
	4124	*/
	4125	if (rd_ahead_enabled)
	4126	cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
	4127
	4128	if (rap != NULL) {
	4129	if (extent.e_addr < rap->cl_lastr)
	4130	rap->cl_maxra = 0;
	4131	rap->cl_lastr = extent.e_addr;
	4132	}
	4133	}
	4134	if (iolock_inited == TRUE)
	4135	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4136
	4137	if (iostate.io_error)
	4138	error = iostate.io_error;
	4139	else {
	4140	u_int32_t io_requested;
	4141
	4142	io_requested = val_size;
	4143
	4144	retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
	4145
	4146	io_req_size -= (val_size - io_requested);
	4147	}
	4148	} else {
	4149	if (iolock_inited == TRUE)
	4150	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4151	}
	4152	if (start_pg < last_pg) {
	4153	/*
	4154	* compute the range of pages that we actually issued an I/O for
	4155	* and either commit them as valid if the I/O succeeded
	4156	* or abort them if the I/O failed or we're not supposed to
	4157	* keep them in the cache
	4158	*/
	4159	io_size = (last_pg - start_pg) * PAGE_SIZE;
	4160
	4161	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
	4162
	4163	if (error \|\| (flags & IO_NOCACHE))
	4164	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
	4165	UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	4166	else {
	4167	int commit_flags = UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY;
	4168
	4169	if (take_reference)
	4170	commit_flags \|= UPL_COMMIT_INACTIVATE;
	4171	else
	4172	commit_flags \|= UPL_COMMIT_SPECULATE;
	4173
	4174	ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
	4175	}
	4176	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
	4177	}
	4178	if ((last_pg - start_pg) < pages_in_upl) {
	4179	/*
	4180	* the set of pages that we issued an I/O for did not encompass
	4181	* the entire upl... so just release these without modifying
	4182	* their state
	4183	*/
	4184	if (error)
	4185	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	4186	else {
	4187
	4188	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START,
	4189	upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
	4190
	4191	/*
	4192	* handle any valid pages at the beginning of
	4193	* the upl... release these appropriately
	4194	*/
	4195	cluster_read_upl_release(upl, 0, start_pg, take_reference);
	4196
	4197	/*
	4198	* handle any valid pages immediately after the
	4199	* pages we issued I/O for... ... release these appropriately
	4200	*/
	4201	cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
	4202
	4203	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END, upl, -1, -1, 0, 0);
	4204	}
	4205	}
	4206	if (retval == 0)
	4207	retval = error;
	4208
	4209	if (io_req_size) {
	4210	if (cluster_is_throttled(vp)) {
	4211	/*
	4212	* we're in the throttle window, at the very least
	4213	* we want to limit the size of the I/O we're about
	4214	* to issue
	4215	*/
	4216	rd_ahead_enabled = 0;
	4217	prefetch_enabled = 0;
	4218	max_rd_size = THROTTLE_MAX_IOSIZE;
	4219	} else {
	4220	if (max_rd_size == THROTTLE_MAX_IOSIZE) {
	4221	/*
	4222	* coming out of throttled state
	4223	*/
	4224	if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
	4225	if (rap != NULL)
	4226	rd_ahead_enabled = 1;
	4227	prefetch_enabled = 1;
	4228	}
	4229	max_rd_size = max_prefetch;
	4230	last_ioread_offset = 0;
	4231	}
	4232	}
	4233	}
	4234	}
	4235	if (iolock_inited == TRUE) {
	4236	/*
	4237	* cluster_io returned an error after it
	4238	* had already issued some I/O. we need
	4239	* to wait for that I/O to complete before
	4240	* we can destroy the iostate mutex...
	4241	* 'retval' already contains the early error
	4242	* so no need to pick it up from iostate.io_error
	4243	*/
	4244	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4245
	4246	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	4247	}
	4248	if (rap != NULL) {
	4249	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	4250	(int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
	4251
	4252	lck_mtx_unlock(&rap->cl_lockr);
	4253	} else {
	4254	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	4255	(int)uio->uio_offset, io_req_size, 0, retval, 0);
	4256	}
	4257
	4258	return (retval);
	4259	}
	4260
	4261	/*
	4262	* We don't want another read/write lock for every vnode in the system
	4263	* so we keep a hash of them here. There should never be very many of
	4264	* these around at any point in time.
	4265	*/
	4266	cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
	4267	{
	4268	struct cl_direct_read_locks *head
	4269	= &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
	4270	% CL_DIRECT_READ_LOCK_BUCKETS];
	4271
	4272	struct cl_direct_read_lock lck, new_lck = NULL;
	4273
	4274	for (;;) {
	4275	lck_spin_lock(&cl_direct_read_spin_lock);
	4276
	4277	LIST_FOREACH(lck, head, chain) {
	4278	if (lck->vp == vp) {
	4279	++lck->ref_count;
	4280	lck_spin_unlock(&cl_direct_read_spin_lock);
	4281	if (new_lck) {
	4282	// Someone beat us to it, ditch the allocation
	4283	lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
	4284	FREE(new_lck, M_TEMP);
	4285	}
	4286	lck_rw_lock(&lck->rw_lock, type);
	4287	return lck;
	4288	}
	4289	}
	4290
	4291	if (new_lck) {
	4292	// Use the lock we allocated
	4293	LIST_INSERT_HEAD(head, new_lck, chain);
	4294	lck_spin_unlock(&cl_direct_read_spin_lock);
	4295	lck_rw_lock(&new_lck->rw_lock, type);
	4296	return new_lck;
	4297	}
	4298
	4299	lck_spin_unlock(&cl_direct_read_spin_lock);
	4300
	4301	// Allocate a new lock
	4302	MALLOC(new_lck, cl_direct_read_lock_t , sizeof(new_lck),
	4303	M_TEMP, M_WAITOK);
	4304	lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
	4305	new_lck->vp = vp;
	4306	new_lck->ref_count = 1;
	4307
	4308	// Got to go round again
	4309	}
	4310	}
	4311
	4312	void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
	4313	{
	4314	lck_rw_done(&lck->rw_lock);
	4315
	4316	lck_spin_lock(&cl_direct_read_spin_lock);
	4317	if (lck->ref_count == 1) {
	4318	LIST_REMOVE(lck, chain);
	4319	lck_spin_unlock(&cl_direct_read_spin_lock);
	4320	lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
	4321	FREE(lck, M_TEMP);
	4322	} else {
	4323	--lck->ref_count;
	4324	lck_spin_unlock(&cl_direct_read_spin_lock);
	4325	}
	4326	}
	4327
	4328	static int
	4329	cluster_read_direct(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	4330	int flags, int (callback)(buf_t, void ), void *callback_arg)
	4331	{
	4332	upl_t upl;
	4333	upl_page_info_t *pl;
	4334	off_t max_io_size;
	4335	vm_offset_t upl_offset, vector_upl_offset = 0;
	4336	upl_size_t upl_size, vector_upl_size = 0;
	4337	vm_size_t upl_needed_size;
	4338	unsigned int pages_in_pl;
	4339	upl_control_flags_t upl_flags;
	4340	kern_return_t kret;
	4341	unsigned int i;
	4342	int force_data_sync;
	4343	int retval = 0;
	4344	int no_zero_fill = 0;
	4345	int io_flag = 0;
	4346	int misaligned = 0;
	4347	struct clios iostate;
	4348	user_addr_t iov_base;
	4349	u_int32_t io_req_size;
	4350	u_int32_t offset_in_file;
	4351	u_int32_t offset_in_iovbase;
	4352	u_int32_t io_size;
	4353	u_int32_t io_min;
	4354	u_int32_t xsize;
	4355	u_int32_t devblocksize;
	4356	u_int32_t mem_alignment_mask;
	4357	u_int32_t max_upl_size;
	4358	u_int32_t max_rd_size;
	4359	u_int32_t max_rd_ahead;
	4360	u_int32_t max_vector_size;
	4361	boolean_t strict_uncached_IO = FALSE;
	4362	boolean_t io_throttled = FALSE;
	4363
	4364	u_int32_t vector_upl_iosize = 0;
	4365	int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
	4366	off_t v_upl_uio_offset = 0;
	4367	int vector_upl_index=0;
	4368	upl_t vector_upl = NULL;
	4369	cl_direct_read_lock_t *lock = NULL;
	4370
	4371	user_addr_t orig_iov_base = 0;
	4372	user_addr_t last_iov_base = 0;
	4373	user_addr_t next_iov_base = 0;
	4374
	4375	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_START,
	4376	(int)uio->uio_offset, (int)filesize, read_type, read_length, 0);
	4377
	4378	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
	4379
	4380	max_rd_size = max_upl_size;
	4381	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
	4382
	4383	io_flag = CL_COMMIT \| CL_READ \| CL_ASYNC \| CL_NOZERO \| CL_DIRECT_IO;
	4384
	4385	if (flags & IO_PASSIVE)
	4386	io_flag \|= CL_PASSIVE;
	4387
	4388	if (flags & IO_ENCRYPTED) {
	4389	io_flag \|= CL_RAW_ENCRYPTED;
	4390	}
	4391
	4392	if (flags & IO_NOCACHE) {
	4393	io_flag \|= CL_NOCACHE;
	4394	}
	4395
	4396	if (flags & IO_SKIP_ENCRYPTION)
	4397	io_flag \|= CL_ENCRYPTED;
	4398
	4399	iostate.io_completed = 0;
	4400	iostate.io_issued = 0;
	4401	iostate.io_error = 0;
	4402	iostate.io_wanted = 0;
	4403
	4404	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	4405
	4406	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	4407	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	4408
	4409	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_NONE,
	4410	(int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
	4411
	4412	if (devblocksize == 1) {
	4413	/*
	4414	* the AFP client advertises a devblocksize of 1
	4415	* however, its BLOCKMAP routine maps to physical
	4416	* blocks that are PAGE_SIZE in size...
	4417	* therefore we can't ask for I/Os that aren't page aligned
	4418	* or aren't multiples of PAGE_SIZE in size
	4419	* by setting devblocksize to PAGE_SIZE, we re-instate
	4420	* the old behavior we had before the mem_alignment_mask
	4421	* changes went in...
	4422	*/
	4423	devblocksize = PAGE_SIZE;
	4424	}
	4425
	4426	strict_uncached_IO = ubc_strict_uncached_IO(vp);
	4427
	4428	orig_iov_base = uio_curriovbase(uio);
	4429	last_iov_base = orig_iov_base;
	4430
	4431	next_dread:
	4432	io_req_size = *read_length;
	4433	iov_base = uio_curriovbase(uio);
	4434
	4435	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	4436	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	4437
	4438	if (offset_in_file \|\| offset_in_iovbase) {
	4439	/*
	4440	* one of the 2 important offsets is misaligned
	4441	* so fire an I/O through the cache for this entire vector
	4442	*/
	4443	misaligned = 1;
	4444	}
	4445	if (iov_base & (devblocksize - 1)) {
	4446	/*
	4447	* the offset in memory must be on a device block boundary
	4448	* so that we can guarantee that we can generate an
	4449	* I/O that ends on a page boundary in cluster_io
	4450	*/
	4451	misaligned = 1;
	4452	}
	4453
	4454	max_io_size = filesize - uio->uio_offset;
	4455
	4456	/*
	4457	* The user must request IO in aligned chunks. If the
	4458	* offset into the file is bad, or the userland pointer
	4459	* is non-aligned, then we cannot service the encrypted IO request.
	4460	*/
	4461	if (flags & IO_ENCRYPTED) {
	4462	if (misaligned \|\| (io_req_size & (devblocksize - 1)))
	4463	retval = EINVAL;
	4464
	4465	max_io_size = roundup(max_io_size, devblocksize);
	4466	}
	4467
	4468	if ((off_t)io_req_size > max_io_size)
	4469	io_req_size = max_io_size;
	4470
	4471	/*
	4472	* When we get to this point, we know...
	4473	* -- the offset into the file is on a devblocksize boundary
	4474	*/
	4475
	4476	while (io_req_size && retval == 0) {
	4477	u_int32_t io_start;
	4478
	4479	if (cluster_is_throttled(vp)) {
	4480	/*
	4481	* we're in the throttle window, at the very least
	4482	* we want to limit the size of the I/O we're about
	4483	* to issue
	4484	*/
	4485	max_rd_size = THROTTLE_MAX_IOSIZE;
	4486	max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
	4487	max_vector_size = THROTTLE_MAX_IOSIZE;
	4488	} else {
	4489	max_rd_size = max_upl_size;
	4490	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
	4491	max_vector_size = MAX_VECTOR_UPL_SIZE;
	4492	}
	4493	io_start = io_size = io_req_size;
	4494
	4495	/*
	4496	* First look for pages already in the cache
	4497	* and move them to user space. But only do this
	4498	* check if we are not retrieving encrypted data directly
	4499	* from the filesystem; those blocks should never
	4500	* be in the UBC.
	4501	*
	4502	* cluster_copy_ubc_data returns the resid
	4503	* in io_size
	4504	*/
	4505	if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
	4506	retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
	4507	}
	4508	/*
	4509	* calculate the number of bytes actually copied
	4510	* starting size - residual
	4511	*/
	4512	xsize = io_start - io_size;
	4513
	4514	io_req_size -= xsize;
	4515
	4516	if(useVectorUPL && (xsize \|\| (iov_base & PAGE_MASK))) {
	4517	/*
	4518	* We found something in the cache or we have an iov_base that's not
	4519	* page-aligned.
	4520	*
	4521	* Issue all I/O's that have been collected within this Vectored UPL.
	4522	*/
	4523	if(vector_upl_index) {
	4524	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4525	reset_vector_run_state();
	4526	}
	4527
	4528	if(xsize)
	4529	useVectorUPL = 0;
	4530
	4531	/*
	4532	* After this point, if we are using the Vector UPL path and the base is
	4533	* not page-aligned then the UPL with that base will be the first in the vector UPL.
	4534	*/
	4535	}
	4536
	4537	/*
	4538	* check to see if we are finished with this request.
	4539	*
	4540	* If we satisfied this IO already, then io_req_size will be 0.
	4541	* Otherwise, see if the IO was mis-aligned and needs to go through
	4542	* the UBC to deal with the 'tail'.
	4543	*
	4544	*/
	4545	if (io_req_size == 0 \|\| (misaligned)) {
	4546	/*
	4547	* see if there's another uio vector to
	4548	* process that's of type IO_DIRECT
	4549	*
	4550	* break out of while loop to get there
	4551	*/
	4552	break;
	4553	}
	4554	/*
	4555	* assume the request ends on a device block boundary
	4556	*/
	4557	io_min = devblocksize;
	4558
	4559	/*
	4560	* we can handle I/O's in multiples of the device block size
	4561	* however, if io_size isn't a multiple of devblocksize we
	4562	* want to clip it back to the nearest page boundary since
	4563	* we are going to have to go through cluster_read_copy to
	4564	* deal with the 'overhang'... by clipping it to a PAGE_SIZE
	4565	* multiple, we avoid asking the drive for the same physical
	4566	* blocks twice.. once for the partial page at the end of the
	4567	* request and a 2nd time for the page we read into the cache
	4568	* (which overlaps the end of the direct read) in order to
	4569	* get at the overhang bytes
	4570	*/
	4571	if (io_size & (devblocksize - 1)) {
	4572	assert(!(flags & IO_ENCRYPTED));
	4573	/*
	4574	* Clip the request to the previous page size boundary
	4575	* since request does NOT end on a device block boundary
	4576	*/
	4577	io_size &= ~PAGE_MASK;
	4578	io_min = PAGE_SIZE;
	4579	}
	4580	if (retval \|\| io_size < io_min) {
	4581	/*
	4582	* either an error or we only have the tail left to
	4583	* complete via the copy path...
	4584	* we may have already spun some portion of this request
	4585	* off as async requests... we need to wait for the I/O
	4586	* to complete before returning
	4587	*/
	4588	goto wait_for_dreads;
	4589	}
	4590
	4591	/*
	4592	* Don't re-check the UBC data if we are looking for uncached IO
	4593	* or asking for encrypted blocks.
	4594	*/
	4595	if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
	4596
	4597	if ((xsize = io_size) > max_rd_size)
	4598	xsize = max_rd_size;
	4599
	4600	io_size = 0;
	4601
	4602	if (!lock) {
	4603	/*
	4604	* We hold a lock here between the time we check the
	4605	* cache and the time we issue I/O. This saves us
	4606	* from having to lock the pages in the cache. Not
	4607	* all clients will care about this lock but some
	4608	* clients may want to guarantee stability between
	4609	* here and when the I/O is issued in which case they
	4610	* will take the lock exclusively.
	4611	*/
	4612	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
	4613	}
	4614
	4615	ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
	4616
	4617	if (io_size == 0) {
	4618	/*
	4619	* a page must have just come into the cache
	4620	* since the first page in this range is no
	4621	* longer absent, go back and re-evaluate
	4622	*/
	4623	continue;
	4624	}
	4625	}
	4626	if ( (flags & IO_RETURN_ON_THROTTLE) ) {
	4627	if (cluster_is_throttled(vp) == THROTTLE_NOW) {
	4628	if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
	4629	/*
	4630	* we're in the throttle window and at least 1 I/O
	4631	* has already been issued by a throttleable thread
	4632	* in this window, so return with EAGAIN to indicate
	4633	* to the FS issuing the cluster_read call that it
	4634	* should now throttle after dropping any locks
	4635	*/
	4636	throttle_info_update_by_mount(vp->v_mount);
	4637
	4638	io_throttled = TRUE;
	4639	goto wait_for_dreads;
	4640	}
	4641	}
	4642	}
	4643	if (io_size > max_rd_size)
	4644	io_size = max_rd_size;
	4645
	4646	iov_base = uio_curriovbase(uio);
	4647
	4648	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	4649	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
	4650
	4651	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_START,
	4652	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	4653
	4654	if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
	4655	no_zero_fill = 1;
	4656	else
	4657	no_zero_fill = 0;
	4658
	4659	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	4660	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	4661	pages_in_pl = 0;
	4662	upl_size = upl_needed_size;
	4663	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	4664	if (no_zero_fill)
	4665	upl_flags \|= UPL_NOZEROFILL;
	4666	if (force_data_sync)
	4667	upl_flags \|= UPL_FORCE_DATA_SYNC;
	4668
	4669	kret = vm_map_create_upl(map,
	4670	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	4671	&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
	4672
	4673	if (kret != KERN_SUCCESS) {
	4674	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4675	(int)upl_offset, upl_size, io_size, kret, 0);
	4676	/*
	4677	* failed to get pagelist
	4678	*
	4679	* we may have already spun some portion of this request
	4680	* off as async requests... we need to wait for the I/O
	4681	* to complete before returning
	4682	*/
	4683	goto wait_for_dreads;
	4684	}
	4685	pages_in_pl = upl_size / PAGE_SIZE;
	4686	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	4687
	4688	for (i = 0; i < pages_in_pl; i++) {
	4689	if (!upl_page_present(pl, i))
	4690	break;
	4691	}
	4692	if (i == pages_in_pl)
	4693	break;
	4694
	4695	ubc_upl_abort(upl, 0);
	4696	}
	4697	if (force_data_sync >= 3) {
	4698	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4699	(int)upl_offset, upl_size, io_size, kret, 0);
	4700
	4701	goto wait_for_dreads;
	4702	}
	4703	/*
	4704	* Consider the possibility that upl_size wasn't satisfied.
	4705	*/
	4706	if (upl_size < upl_needed_size) {
	4707	if (upl_size && upl_offset == 0)
	4708	io_size = upl_size;
	4709	else
	4710	io_size = 0;
	4711	}
	4712	if (io_size == 0) {
	4713	ubc_upl_abort(upl, 0);
	4714	goto wait_for_dreads;
	4715	}
	4716	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4717	(int)upl_offset, upl_size, io_size, kret, 0);
	4718
	4719	if(useVectorUPL) {
	4720	vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
	4721	if(end_off)
	4722	issueVectorUPL = 1;
	4723	/*
	4724	* After this point, if we are using a vector UPL, then
	4725	* either all the UPL elements end on a page boundary OR
	4726	* this UPL is the last element because it does not end
	4727	* on a page boundary.
	4728	*/
	4729	}
	4730
	4731	/*
	4732	* request asynchronously so that we can overlap
	4733	* the preparation of the next I/O
	4734	* if there are already too many outstanding reads
	4735	* wait until some have completed before issuing the next read
	4736	*/
	4737	cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
	4738
	4739	if (iostate.io_error) {
	4740	/*
	4741	* one of the earlier reads we issued ran into a hard error
	4742	* don't issue any more reads, cleanup the UPL
	4743	* that was just created but not used, then
	4744	* go wait for any other reads to complete before
	4745	* returning the error to the caller
	4746	*/
	4747	ubc_upl_abort(upl, 0);
	4748
	4749	goto wait_for_dreads;
	4750	}
	4751	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_START,
	4752	upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
	4753
	4754	if(!useVectorUPL) {
	4755	if (no_zero_fill)
	4756	io_flag &= ~CL_PRESERVE;
	4757	else
	4758	io_flag \|= CL_PRESERVE;
	4759
	4760	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4761
	4762	} else {
	4763
	4764	if(!vector_upl_index) {
	4765	vector_upl = vector_upl_create(upl_offset);
	4766	v_upl_uio_offset = uio->uio_offset;
	4767	vector_upl_offset = upl_offset;
	4768	}
	4769
	4770	vector_upl_set_subupl(vector_upl,upl, upl_size);
	4771	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
	4772	vector_upl_index++;
	4773	vector_upl_size += upl_size;
	4774	vector_upl_iosize += io_size;
	4775
	4776	if(issueVectorUPL \|\| vector_upl_index == MAX_VECTOR_UPL_ELEMENTS \|\| vector_upl_size >= max_vector_size) {
	4777	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4778	reset_vector_run_state();
	4779	}
	4780	}
	4781	last_iov_base = iov_base + io_size;
	4782
	4783	if (lock) {
	4784	// We don't need to wait for the I/O to complete
	4785	cluster_unlock_direct_read(lock);
	4786	lock = NULL;
	4787	}
	4788
	4789	/*
	4790	* update the uio structure
	4791	*/
	4792	if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
	4793	uio_update(uio, (user_size_t)max_io_size);
	4794	}
	4795	else {
	4796	uio_update(uio, (user_size_t)io_size);
	4797	}
	4798
	4799	io_req_size -= io_size;
	4800
	4801	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_END,
	4802	upl, (int)uio->uio_offset, io_req_size, retval, 0);
	4803
	4804	} /* end while */
	4805
	4806	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
	4807
	4808	retval = cluster_io_type(uio, read_type, read_length, 0);
	4809
	4810	if (retval == 0 && *read_type == IO_DIRECT) {
	4811
	4812	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_NONE,
	4813	(int)uio->uio_offset, (int)filesize, read_type, read_length, 0);
	4814
	4815	goto next_dread;
	4816	}
	4817	}
	4818
	4819	wait_for_dreads:
	4820
	4821	if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
	4822	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4823	reset_vector_run_state();
	4824	}
	4825
	4826	// We don't need to wait for the I/O to complete
	4827	if (lock)
	4828	cluster_unlock_direct_read(lock);
	4829
	4830	/*
	4831	* make sure all async reads that are part of this stream
	4832	* have completed before we return
	4833	*/
	4834	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
	4835
	4836	if (iostate.io_error)
	4837	retval = iostate.io_error;
	4838
	4839	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	4840
	4841	if (io_throttled == TRUE && retval == 0)
	4842	retval = EAGAIN;
	4843
	4844	for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
	4845	/*
	4846	* This is specifically done for pmap accounting purposes.
	4847	* vm_pre_fault() will call vm_fault() to enter the page into
	4848	* the pmap if there isn't _a_ physical page for that VA already.
	4849	*/
	4850	vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
	4851	}
	4852
	4853	if (io_req_size && retval == 0) {
	4854	/*
	4855	* we couldn't handle the tail of this request in DIRECT mode
	4856	* so fire it through the copy path
	4857	*/
	4858	retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
	4859
	4860	*read_type = IO_UNKNOWN;
	4861	}
	4862	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_END,
	4863	(int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
	4864
	4865	return (retval);
	4866	}
	4867
	4868
	4869	static int
	4870	cluster_read_contig(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	4871	int (callback)(buf_t, void ), void *callback_arg, int flags)
	4872	{
	4873	upl_page_info_t *pl;
	4874	upl_t upl[MAX_VECTS];
	4875	vm_offset_t upl_offset;
	4876	addr64_t dst_paddr = 0;
	4877	user_addr_t iov_base;
	4878	off_t max_size;
	4879	upl_size_t upl_size;
	4880	vm_size_t upl_needed_size;
	4881	mach_msg_type_number_t pages_in_pl;
	4882	upl_control_flags_t upl_flags;
	4883	kern_return_t kret;
	4884	struct clios iostate;
	4885	int error= 0;
	4886	int cur_upl = 0;
	4887	int num_upl = 0;
	4888	int n;
	4889	u_int32_t xsize;
	4890	u_int32_t io_size;
	4891	u_int32_t devblocksize;
	4892	u_int32_t mem_alignment_mask;
	4893	u_int32_t tail_size = 0;
	4894	int bflag;
	4895
	4896	if (flags & IO_PASSIVE)
	4897	bflag = CL_PASSIVE;
	4898	else
	4899	bflag = 0;
	4900
	4901	if (flags & IO_NOCACHE)
	4902	bflag \|= CL_NOCACHE;
	4903
	4904	/*
	4905	* When we enter this routine, we know
	4906	* -- the read_length will not exceed the current iov_len
	4907	* -- the target address is physically contiguous for read_length
	4908	*/
	4909	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
	4910
	4911	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	4912	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	4913
	4914	iostate.io_completed = 0;
	4915	iostate.io_issued = 0;
	4916	iostate.io_error = 0;
	4917	iostate.io_wanted = 0;
	4918
	4919	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	4920
	4921	next_cread:
	4922	io_size = *read_length;
	4923
	4924	max_size = filesize - uio->uio_offset;
	4925
	4926	if (io_size > max_size)
	4927	io_size = max_size;
	4928
	4929	iov_base = uio_curriovbase(uio);
	4930
	4931	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	4932	upl_needed_size = upl_offset + io_size;
	4933
	4934	pages_in_pl = 0;
	4935	upl_size = upl_needed_size;
	4936	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	4937
	4938
	4939	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) \| DBG_FUNC_START,
	4940	(int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
	4941
	4942	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	4943	kret = vm_map_get_upl(map,
	4944	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	4945	&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
	4946
	4947	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) \| DBG_FUNC_END,
	4948	(int)upl_offset, upl_size, io_size, kret, 0);
	4949
	4950	if (kret != KERN_SUCCESS) {
	4951	/*
	4952	* failed to get pagelist
	4953	*/
	4954	error = EINVAL;
	4955	goto wait_for_creads;
	4956	}
	4957	num_upl++;
	4958
	4959	if (upl_size < upl_needed_size) {
	4960	/*
	4961	* The upl_size wasn't satisfied.
	4962	*/
	4963	error = EINVAL;
	4964	goto wait_for_creads;
	4965	}
	4966	pl = ubc_upl_pageinfo(upl[cur_upl]);
	4967
	4968	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
	4969
	4970	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	4971	u_int32_t head_size;
	4972
	4973	head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
	4974
	4975	if (head_size > io_size)
	4976	head_size = io_size;
	4977
	4978	error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
	4979
	4980	if (error)
	4981	goto wait_for_creads;
	4982
	4983	upl_offset += head_size;
	4984	dst_paddr += head_size;
	4985	io_size -= head_size;
	4986
	4987	iov_base += head_size;
	4988	}
	4989	if ((u_int32_t)iov_base & mem_alignment_mask) {
	4990	/*
	4991	* request doesn't set up on a memory boundary
	4992	* the underlying DMA engine can handle...
	4993	* return an error instead of going through
	4994	* the slow copy path since the intent of this
	4995	* path is direct I/O to device memory
	4996	*/
	4997	error = EINVAL;
	4998	goto wait_for_creads;
	4999	}
	5000
	5001	tail_size = io_size & (devblocksize - 1);
	5002
	5003	io_size -= tail_size;
	5004
	5005	while (io_size && error == 0) {
	5006
	5007	if (io_size > MAX_IO_CONTIG_SIZE)
	5008	xsize = MAX_IO_CONTIG_SIZE;
	5009	else
	5010	xsize = io_size;
	5011	/*
	5012	* request asynchronously so that we can overlap
	5013	* the preparation of the next I/O... we'll do
	5014	* the commit after all the I/O has completed
	5015	* since its all issued against the same UPL
	5016	* if there are already too many outstanding reads
	5017	* wait until some have completed before issuing the next
	5018	*/
	5019	cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
	5020
	5021	if (iostate.io_error) {
	5022	/*
	5023	* one of the earlier reads we issued ran into a hard error
	5024	* don't issue any more reads...
	5025	* go wait for any other reads to complete before
	5026	* returning the error to the caller
	5027	*/
	5028	goto wait_for_creads;
	5029	}
	5030	error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
	5031	CL_READ \| CL_NOZERO \| CL_DEV_MEMORY \| CL_ASYNC \| bflag,
	5032	(buf_t)NULL, &iostate, callback, callback_arg);
	5033	/*
	5034	* The cluster_io read was issued successfully,
	5035	* update the uio structure
	5036	*/
	5037	if (error == 0) {
	5038	uio_update(uio, (user_size_t)xsize);
	5039
	5040	dst_paddr += xsize;
	5041	upl_offset += xsize;
	5042	io_size -= xsize;
	5043	}
	5044	}
	5045	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
	5046
	5047	error = cluster_io_type(uio, read_type, read_length, 0);
	5048
	5049	if (error == 0 && *read_type == IO_CONTIG) {
	5050	cur_upl++;
	5051	goto next_cread;
	5052	}
	5053	} else
	5054	*read_type = IO_UNKNOWN;
	5055
	5056	wait_for_creads:
	5057	/*
	5058	* make sure all async reads that are part of this stream
	5059	* have completed before we proceed
	5060	*/
	5061	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
	5062
	5063	if (iostate.io_error)
	5064	error = iostate.io_error;
	5065
	5066	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	5067
	5068	if (error == 0 && tail_size)
	5069	error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
	5070
	5071	for (n = 0; n < num_upl; n++)
	5072	/*
	5073	* just release our hold on each physically contiguous
	5074	* region without changing any state
	5075	*/
	5076	ubc_upl_abort(upl[n], 0);
	5077
	5078	return (error);
	5079	}
	5080
	5081
	5082	static int
	5083	cluster_io_type(struct uio uio, int io_type, u_int32_t *io_length, u_int32_t min_length)
	5084	{
	5085	user_size_t iov_len;
	5086	user_addr_t iov_base = 0;
	5087	upl_t upl;
	5088	upl_size_t upl_size;
	5089	upl_control_flags_t upl_flags;
	5090	int retval = 0;
	5091
	5092	/*
	5093	* skip over any emtpy vectors
	5094	*/
	5095	uio_update(uio, (user_size_t)0);
	5096
	5097	iov_len = uio_curriovlen(uio);
	5098
	5099	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) \| DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
	5100
	5101	if (iov_len) {
	5102	iov_base = uio_curriovbase(uio);
	5103	/*
	5104	* make sure the size of the vector isn't too big...
	5105	* internally, we want to handle all of the I/O in
	5106	* chunk sizes that fit in a 32 bit int
	5107	*/
	5108	if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
	5109	upl_size = MAX_IO_REQUEST_SIZE;
	5110	else
	5111	upl_size = (u_int32_t)iov_len;
	5112
	5113	upl_flags = UPL_QUERY_OBJECT_TYPE;
	5114
	5115	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	5116	if ((vm_map_get_upl(map,
	5117	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	5118	&upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
	5119	/*
	5120	* the user app must have passed in an invalid address
	5121	*/
	5122	retval = EFAULT;
	5123	}
	5124	if (upl_size == 0)
	5125	retval = EFAULT;
	5126
	5127	*io_length = upl_size;
	5128
	5129	if (upl_flags & UPL_PHYS_CONTIG)
	5130	*io_type = IO_CONTIG;
	5131	else if (iov_len >= min_length)
	5132	*io_type = IO_DIRECT;
	5133	else
	5134	*io_type = IO_COPY;
	5135	} else {
	5136	/*
	5137	* nothing left to do for this uio
	5138	*/
	5139	*io_length = 0;
	5140	*io_type = IO_UNKNOWN;
	5141	}
	5142	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) \| DBG_FUNC_END, iov_base, io_type, io_length, retval, 0);
	5143
	5144	return (retval);
	5145	}
	5146
	5147
	5148	/*
	5149	* generate advisory I/O's in the largest chunks possible
	5150	* the completed pages will be released into the VM cache
	5151	*/
	5152	int
	5153	advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
	5154	{
	5155	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
	5156	}
	5157
	5158	int
	5159	advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (callback)(buf_t, void ), void *callback_arg, int bflag)
	5160	{
	5161	upl_page_info_t *pl;
	5162	upl_t upl;
	5163	vm_offset_t upl_offset;
	5164	int upl_size;
	5165	off_t upl_f_offset;
	5166	int start_offset;
	5167	int start_pg;
	5168	int last_pg;
	5169	int pages_in_upl;
	5170	off_t max_size;
	5171	int io_size;
	5172	kern_return_t kret;
	5173	int retval = 0;
	5174	int issued_io;
	5175	int skip_range;
	5176	uint32_t max_io_size;
	5177
	5178
	5179	if ( !UBCINFOEXISTS(vp))
	5180	return(EINVAL);
	5181
	5182	if (resid < 0)
	5183	return(EINVAL);
	5184
	5185	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	5186
	5187	#if CONFIG_EMBEDDED
	5188	if (max_io_size > speculative_prefetch_max_iosize)
	5189	max_io_size = speculative_prefetch_max_iosize;
	5190	#else
	5191	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
	5192	if (max_io_size > speculative_prefetch_max_iosize)
	5193	max_io_size = speculative_prefetch_max_iosize;
	5194	}
	5195	#endif
	5196
	5197	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_START,
	5198	(int)f_offset, resid, (int)filesize, 0, 0);
	5199
	5200	while (resid && f_offset < filesize && retval == 0) {
	5201	/*
	5202	* compute the size of the upl needed to encompass
	5203	* the requested read... limit each call to cluster_io
	5204	* to the maximum UPL size... cluster_io will clip if
	5205	* this exceeds the maximum io_size for the device,
	5206	* make sure to account for
	5207	* a starting offset that's not page aligned
	5208	*/
	5209	start_offset = (int)(f_offset & PAGE_MASK_64);
	5210	upl_f_offset = f_offset - (off_t)start_offset;
	5211	max_size = filesize - f_offset;
	5212
	5213	if (resid < max_size)
	5214	io_size = resid;
	5215	else
	5216	io_size = max_size;
	5217
	5218	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	5219	if ((uint32_t)upl_size > max_io_size)
	5220	upl_size = max_io_size;
	5221
	5222	skip_range = 0;
	5223	/*
	5224	* return the number of contiguously present pages in the cache
	5225	* starting at upl_f_offset within the file
	5226	*/
	5227	ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
	5228
	5229	if (skip_range) {
	5230	/*
	5231	* skip over pages already present in the cache
	5232	*/
	5233	io_size = skip_range - start_offset;
	5234
	5235	f_offset += io_size;
	5236	resid -= io_size;
	5237
	5238	if (skip_range == upl_size)
	5239	continue;
	5240	/*
	5241	* have to issue some real I/O
	5242	* at this point, we know it's starting on a page boundary
	5243	* because we've skipped over at least the first page in the request
	5244	*/
	5245	start_offset = 0;
	5246	upl_f_offset += skip_range;
	5247	upl_size -= skip_range;
	5248	}
	5249	pages_in_upl = upl_size / PAGE_SIZE;
	5250
	5251	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_START,
	5252	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	5253
	5254	kret = ubc_create_upl_kernel(vp,
	5255	upl_f_offset,
	5256	upl_size,
	5257	&upl,
	5258	&pl,
	5259	UPL_RET_ONLY_ABSENT \| UPL_SET_LITE,
	5260	VM_KERN_MEMORY_FILE);
	5261	if (kret != KERN_SUCCESS)
	5262	return(retval);
	5263	issued_io = 0;
	5264
	5265	/*
	5266	* before we start marching forward, we must make sure we end on
	5267	* a present page, otherwise we will be working with a freed
	5268	* upl
	5269	*/
	5270	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	5271	if (upl_page_present(pl, last_pg))
	5272	break;
	5273	}
	5274	pages_in_upl = last_pg + 1;
	5275
	5276
	5277	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_END,
	5278	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	5279
	5280
	5281	for (last_pg = 0; last_pg < pages_in_upl; ) {
	5282	/*
	5283	* scan from the beginning of the upl looking for the first
	5284	* page that is present.... this will become the first page in
	5285	* the request we're going to make to 'cluster_io'... if all
	5286	* of the pages are absent, we won't call through to 'cluster_io'
	5287	*/
	5288	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	5289	if (upl_page_present(pl, start_pg))
	5290	break;
	5291	}
	5292
	5293	/*
	5294	* scan from the starting present page looking for an absent
	5295	* page before the end of the upl is reached, if we
	5296	* find one, then it will terminate the range of pages being
	5297	* presented to 'cluster_io'
	5298	*/
	5299	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	5300	if (!upl_page_present(pl, last_pg))
	5301	break;
	5302	}
	5303
	5304	if (last_pg > start_pg) {
	5305	/*
	5306	* we found a range of pages that must be filled
	5307	* if the last page in this range is the last page of the file
	5308	* we may have to clip the size of it to keep from reading past
	5309	* the end of the last physical block associated with the file
	5310	*/
	5311	upl_offset = start_pg * PAGE_SIZE;
	5312	io_size = (last_pg - start_pg) * PAGE_SIZE;
	5313
	5314	if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
	5315	io_size = filesize - (upl_f_offset + upl_offset);
	5316
	5317	/*
	5318	* issue an asynchronous read to cluster_io
	5319	*/
	5320	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	5321	CL_ASYNC \| CL_READ \| CL_COMMIT \| CL_AGE \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	5322
	5323	issued_io = 1;
	5324	}
	5325	}
	5326	if (issued_io == 0)
	5327	ubc_upl_abort(upl, 0);
	5328
	5329	io_size = upl_size - start_offset;
	5330
	5331	if (io_size > resid)
	5332	io_size = resid;
	5333	f_offset += io_size;
	5334	resid -= io_size;
	5335	}
	5336
	5337	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_END,
	5338	(int)f_offset, resid, retval, 0, 0);
	5339
	5340	return(retval);
	5341	}
	5342
	5343
	5344	int
	5345	cluster_push(vnode_t vp, int flags)
	5346	{
	5347	return cluster_push_ext(vp, flags, NULL, NULL);
	5348	}
	5349
	5350
	5351	int
	5352	cluster_push_ext(vnode_t vp, int flags, int (callback)(buf_t, void ), void *callback_arg)
	5353	{
	5354	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
	5355	}
	5356
	5357	/* write errors via err, but return the number of clusters written */
	5358	int
	5359	cluster_push_err(vnode_t vp, int flags, int (callback)(buf_t, void ), void callback_arg, int err)
	5360	{
	5361	int retval;
	5362	int my_sparse_wait = 0;
	5363	struct cl_writebehind *wbp;
	5364
	5365	if (err)
	5366	*err = 0;
	5367
	5368	if ( !UBCINFOEXISTS(vp)) {
	5369	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
	5370	return (0);
	5371	}
	5372	/* return if deferred write is set */
	5373	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
	5374	return (0);
	5375	}
	5376	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
	5377	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
	5378	return (0);
	5379	}
	5380	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
	5381	lck_mtx_unlock(&wbp->cl_lockw);
	5382
	5383	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
	5384	return(0);
	5385	}
	5386	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_START,
	5387	wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
	5388
	5389	/*
	5390	* if we have an fsync in progress, we don't want to allow any additional
	5391	* sync/fsync/close(s) to occur until it finishes.
	5392	* note that its possible for writes to continue to occur to this file
	5393	* while we're waiting and also once the fsync starts to clean if we're
	5394	* in the sparse map case
	5395	*/
	5396	while (wbp->cl_sparse_wait) {
	5397	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
	5398
	5399	msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
	5400
	5401	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	5402	}
	5403	if (flags & IO_SYNC) {
	5404	my_sparse_wait = 1;
	5405	wbp->cl_sparse_wait = 1;
	5406
	5407	/*
	5408	* this is an fsync (or equivalent)... we must wait for any existing async
	5409	* cleaning operations to complete before we evaulate the current state
	5410	* and finish cleaning... this insures that all writes issued before this
	5411	* fsync actually get cleaned to the disk before this fsync returns
	5412	*/
	5413	while (wbp->cl_sparse_pushes) {
	5414	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) \| DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
	5415
	5416	msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
	5417
	5418	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) \| DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	5419	}
	5420	}
	5421	if (wbp->cl_scmap) {
	5422	void *scmap;
	5423
	5424	if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
	5425
	5426	scmap = wbp->cl_scmap;
	5427	wbp->cl_scmap = NULL;
	5428
	5429	wbp->cl_sparse_pushes++;
	5430
	5431	lck_mtx_unlock(&wbp->cl_lockw);
	5432
	5433	retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
	5434
	5435	lck_mtx_lock(&wbp->cl_lockw);
	5436
	5437	wbp->cl_sparse_pushes--;
	5438
	5439	if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
	5440	wakeup((caddr_t)&wbp->cl_sparse_pushes);
	5441	} else {
	5442	retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
	5443	}
	5444	if (err)
	5445	*err = retval;
	5446	retval = 1;
	5447	} else {
	5448	retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err);
	5449	}
	5450	lck_mtx_unlock(&wbp->cl_lockw);
	5451
	5452	if (flags & IO_SYNC)
	5453	(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
	5454
	5455	if (my_sparse_wait) {
	5456	/*
	5457	* I'm the owner of the serialization token
	5458	* clear it and wakeup anyone that is waiting
	5459	* for me to finish
	5460	*/
	5461	lck_mtx_lock(&wbp->cl_lockw);
	5462
	5463	wbp->cl_sparse_wait = 0;
	5464	wakeup((caddr_t)&wbp->cl_sparse_wait);
	5465
	5466	lck_mtx_unlock(&wbp->cl_lockw);
	5467	}
	5468	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_END,
	5469	wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
	5470
	5471	return (retval);
	5472	}
	5473
	5474
	5475	__private_extern__ void
	5476	cluster_release(struct ubc_info *ubc)
	5477	{
	5478	struct cl_writebehind *wbp;
	5479	struct cl_readahead *rap;
	5480
	5481	if ((wbp = ubc->cl_wbehind)) {
	5482
	5483	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
	5484
	5485	if (wbp->cl_scmap)
	5486	vfs_drt_control(&(wbp->cl_scmap), 0);
	5487	} else {
	5488	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, ubc, 0, 0, 0, 0);
	5489	}
	5490
	5491	rap = ubc->cl_rahead;
	5492
	5493	if (wbp != NULL) {
	5494	lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
	5495	FREE_ZONE((void )wbp, sizeof wbp, M_CLWRBEHIND);
	5496	}
	5497	if ((rap = ubc->cl_rahead)) {
	5498	lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
	5499	FREE_ZONE((void )rap, sizeof rap, M_CLRDAHEAD);
	5500	}
	5501	ubc->cl_rahead = NULL;
	5502	ubc->cl_wbehind = NULL;
	5503
	5504	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_END, ubc, rap, wbp, 0, 0);
	5505	}
	5506
	5507
	5508	static int
	5509	cluster_try_push(struct cl_writebehind wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (callback)(buf_t, void ), void callback_arg, int *err)
	5510	{
	5511	int cl_index;
	5512	int cl_index1;
	5513	int min_index;
	5514	int cl_len;
	5515	int cl_pushed = 0;
	5516	struct cl_wextent l_clusters[MAX_CLUSTERS];
	5517	u_int max_cluster_pgcount;
	5518	int error = 0;
	5519
	5520	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	5521	/*
	5522	* the write behind context exists and has
	5523	* already been locked...
	5524	*/
	5525	if (wbp->cl_number == 0)
	5526	/*
	5527	* no clusters to push
	5528	* return number of empty slots
	5529	*/
	5530	return (MAX_CLUSTERS);
	5531
	5532	/*
	5533	* make a local 'sorted' copy of the clusters
	5534	* and clear wbp->cl_number so that new clusters can
	5535	* be developed
	5536	*/
	5537	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	5538	for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
	5539	if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
	5540	continue;
	5541	if (min_index == -1)
	5542	min_index = cl_index1;
	5543	else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
	5544	min_index = cl_index1;
	5545	}
	5546	if (min_index == -1)
	5547	break;
	5548
	5549	l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
	5550	l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
	5551	l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
	5552
	5553	wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	5554	}
	5555	wbp->cl_number = 0;
	5556
	5557	cl_len = cl_index;
	5558
	5559	/* skip switching to the sparse cluster mechanism if on diskimage */
	5560	if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) &&
	5561	!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) {
	5562	int i;
	5563
	5564	/*
	5565	* determine if we appear to be writing the file sequentially
	5566	* if not, by returning without having pushed any clusters
	5567	* we will cause this vnode to be pushed into the sparse cluster mechanism
	5568	* used for managing more random I/O patterns
	5569	*
	5570	* we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
	5571	* that's why we're in try_push with PUSH_DELAY...
	5572	*
	5573	* check to make sure that all the clusters except the last one are 'full'... and that each cluster
	5574	* is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
	5575	* so we can just make a simple pass through, up to, but not including the last one...
	5576	* note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
	5577	* are sequential
	5578	*
	5579	* we let the last one be partial as long as it was adjacent to the previous one...
	5580	* we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
	5581	* of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
	5582	*/
	5583	for (i = 0; i < MAX_CLUSTERS - 1; i++) {
	5584	if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
	5585	goto dont_try;
	5586	if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
	5587	goto dont_try;
	5588	}
	5589	}
	5590	for (cl_index = 0; cl_index < cl_len; cl_index++) {
	5591	int flags;
	5592	struct cl_extent cl;
	5593	int retval;
	5594
	5595	flags = io_flags & (IO_PASSIVE\|IO_CLOSE);
	5596
	5597	/*
	5598	* try to push each cluster in turn...
	5599	*/
	5600	if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
	5601	flags \|= IO_NOCACHE;
	5602
	5603	if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
	5604	flags \|= IO_PASSIVE;
	5605
	5606	if (push_flag & PUSH_SYNC)
	5607	flags \|= IO_SYNC;
	5608
	5609	cl.b_addr = l_clusters[cl_index].b_addr;
	5610	cl.e_addr = l_clusters[cl_index].e_addr;
	5611
	5612	retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
	5613
	5614	if (error == 0 && retval)
	5615	error = retval;
	5616
	5617	l_clusters[cl_index].b_addr = 0;
	5618	l_clusters[cl_index].e_addr = 0;
	5619
	5620	cl_pushed++;
	5621
	5622	if ( !(push_flag & PUSH_ALL) )
	5623	break;
	5624	}
	5625	if (err)
	5626	*err = error;
	5627
	5628	dont_try:
	5629	if (cl_len > cl_pushed) {
	5630	/*
	5631	* we didn't push all of the clusters, so
	5632	* lets try to merge them back in to the vnode
	5633	*/
	5634	if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
	5635	/*
	5636	* we picked up some new clusters while we were trying to
	5637	* push the old ones... this can happen because I've dropped
	5638	* the vnode lock... the sum of the
	5639	* leftovers plus the new cluster count exceeds our ability
	5640	* to represent them, so switch to the sparse cluster mechanism
	5641	*
	5642	* collect the active public clusters...
	5643	*/
	5644	sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
	5645
	5646	for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
	5647	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
	5648	continue;
	5649	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	5650	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	5651	wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
	5652
	5653	cl_index1++;
	5654	}
	5655	/*
	5656	* update the cluster count
	5657	*/
	5658	wbp->cl_number = cl_index1;
	5659
	5660	/*
	5661	* and collect the original clusters that were moved into the
	5662	* local storage for sorting purposes
	5663	*/
	5664	sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
	5665
	5666	} else {
	5667	/*
	5668	* we've got room to merge the leftovers back in
	5669	* just append them starting at the next 'hole'
	5670	* represented by wbp->cl_number
	5671	*/
	5672	for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
	5673	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
	5674	continue;
	5675
	5676	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	5677	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	5678	wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
	5679
	5680	cl_index1++;
	5681	}
	5682	/*
	5683	* update the cluster count
	5684	*/
	5685	wbp->cl_number = cl_index1;
	5686	}
	5687	}
	5688	return (MAX_CLUSTERS - wbp->cl_number);
	5689	}
	5690
	5691
	5692
	5693	static int
	5694	cluster_push_now(vnode_t vp, struct cl_extent cl, off_t EOF, int flags, int (callback)(buf_t, void ), void callback_arg)
	5695	{
	5696	upl_page_info_t *pl;
	5697	upl_t upl;
	5698	vm_offset_t upl_offset;
	5699	int upl_size;
	5700	off_t upl_f_offset;
	5701	int pages_in_upl;
	5702	int start_pg;
	5703	int last_pg;
	5704	int io_size;
	5705	int io_flags;
	5706	int upl_flags;
	5707	int bflag;
	5708	int size;
	5709	int error = 0;
	5710	int retval;
	5711	kern_return_t kret;
	5712
	5713	if (flags & IO_PASSIVE)
	5714	bflag = CL_PASSIVE;
	5715	else
	5716	bflag = 0;
	5717
	5718	if (flags & IO_SKIP_ENCRYPTION)
	5719	bflag \|= CL_ENCRYPTED;
	5720
	5721	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_START,
	5722	(int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
	5723
	5724	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
	5725	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 0, 0, 0, 0);
	5726
	5727	return (0);
	5728	}
	5729	upl_size = pages_in_upl * PAGE_SIZE;
	5730	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	5731
	5732	if (upl_f_offset + upl_size >= EOF) {
	5733
	5734	if (upl_f_offset >= EOF) {
	5735	/*
	5736	* must have truncated the file and missed
	5737	* clearing a dangling cluster (i.e. it's completely
	5738	* beyond the new EOF
	5739	*/
	5740	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 1, 0, 0, 0);
	5741
	5742	return(0);
	5743	}
	5744	size = EOF - upl_f_offset;
	5745
	5746	upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	5747	pages_in_upl = upl_size / PAGE_SIZE;
	5748	} else
	5749	size = upl_size;
	5750
	5751	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, size, 0, 0, 0);
	5752
	5753	/*
	5754	* by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	5755	*
	5756	* - only pages that are currently dirty are returned... these are the ones we need to clean
	5757	* - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	5758	* - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	5759	* - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	5760	* someone dirties this page while the I/O is in progress, we don't lose track of the new state
	5761	*
	5762	* when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	5763	*/
	5764
	5765	if ((vp->v_flag & VNOCACHE_DATA) \|\| (flags & IO_NOCACHE))
	5766	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE \| UPL_WILL_BE_DUMPED;
	5767	else
	5768	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE;
	5769
	5770	kret = ubc_create_upl_kernel(vp,
	5771	upl_f_offset,
	5772	upl_size,
	5773	&upl,
	5774	&pl,
	5775	upl_flags,
	5776	VM_KERN_MEMORY_FILE);
	5777	if (kret != KERN_SUCCESS)
	5778	panic("cluster_push: failed to get pagelist");
	5779
	5780	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
	5781
	5782	/*
	5783	* since we only asked for the dirty pages back
	5784	* it's possible that we may only get a few or even none, so...
	5785	* before we start marching forward, we must make sure we know
	5786	* where the last present page is in the UPL, otherwise we could
	5787	* end up working with a freed upl due to the FREE_ON_EMPTY semantics
	5788	* employed by commit_range and abort_range.
	5789	*/
	5790	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	5791	if (upl_page_present(pl, last_pg))
	5792	break;
	5793	}
	5794	pages_in_upl = last_pg + 1;
	5795
	5796	if (pages_in_upl == 0) {
	5797	ubc_upl_abort(upl, 0);
	5798
	5799	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 2, 0, 0, 0);
	5800	return(0);
	5801	}
	5802
	5803	for (last_pg = 0; last_pg < pages_in_upl; ) {
	5804	/*
	5805	* find the next dirty page in the UPL
	5806	* this will become the first page in the
	5807	* next I/O to generate
	5808	*/
	5809	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	5810	if (upl_dirty_page(pl, start_pg))
	5811	break;
	5812	if (upl_page_present(pl, start_pg))
	5813	/*
	5814	* RET_ONLY_DIRTY will return non-dirty 'precious' pages
	5815	* just release these unchanged since we're not going
	5816	* to steal them or change their state
	5817	*/
	5818	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	5819	}
	5820	if (start_pg >= pages_in_upl)
	5821	/*
	5822	* done... no more dirty pages to push
	5823	*/
	5824	break;
	5825	if (start_pg > last_pg)
	5826	/*
	5827	* skipped over some non-dirty pages
	5828	*/
	5829	size -= ((start_pg - last_pg) * PAGE_SIZE);
	5830
	5831	/*
	5832	* find a range of dirty pages to write
	5833	*/
	5834	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	5835	if (!upl_dirty_page(pl, last_pg))
	5836	break;
	5837	}
	5838	upl_offset = start_pg * PAGE_SIZE;
	5839
	5840	io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
	5841
	5842	io_flags = CL_THROTTLE \| CL_COMMIT \| CL_AGE \| bflag;
	5843
	5844	if ( !(flags & IO_SYNC))
	5845	io_flags \|= CL_ASYNC;
	5846
	5847	if (flags & IO_CLOSE)
	5848	io_flags \|= CL_CLOSE;
	5849
	5850	if (flags & IO_NOCACHE)
	5851	io_flags \|= CL_NOCACHE;
	5852
	5853	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	5854	io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	5855
	5856	if (error == 0 && retval)
	5857	error = retval;
	5858
	5859	size -= io_size;
	5860	}
	5861	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 3, 0, 0, 0);
	5862
	5863	return(error);
	5864	}
	5865
	5866
	5867	/*
	5868	* sparse_cluster_switch is called with the write behind lock held
	5869	*/
	5870	static void
	5871	sparse_cluster_switch(struct cl_writebehind wbp, vnode_t vp, off_t EOF, int (callback)(buf_t, void ), void callback_arg)
	5872	{
	5873	int cl_index;
	5874
	5875	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
	5876
	5877	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	5878	int flags;
	5879	struct cl_extent cl;
	5880
	5881	for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
	5882
	5883	if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
	5884	if (flags & UPL_POP_DIRTY) {
	5885	cl.e_addr = cl.b_addr + 1;
	5886
	5887	sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
	5888	}
	5889	}
	5890	}
	5891	}
	5892	wbp->cl_number = 0;
	5893
	5894	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
	5895	}
	5896
	5897
	5898	/*
	5899	* sparse_cluster_push must be called with the write-behind lock held if the scmap is
	5900	* still associated with the write-behind context... however, if the scmap has been disassociated
	5901	* from the write-behind context (the cluster_push case), the wb lock is not held
	5902	*/
	5903	static int
	5904	sparse_cluster_push(void *scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (callback)(buf_t, void ), void callback_arg)
	5905	{
	5906	struct cl_extent cl;
	5907	off_t offset;
	5908	u_int length;
	5909	int error = 0;
	5910
	5911	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
	5912
	5913	if (push_flag & PUSH_ALL)
	5914	vfs_drt_control(scmap, 1);
	5915
	5916	for (;;) {
	5917	int retval;
	5918	if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
	5919	break;
	5920
	5921	cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
	5922	cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
	5923
	5924	retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE\|IO_CLOSE), callback, callback_arg);
	5925	if (error == 0 && retval)
	5926	error = retval;
	5927
	5928	if ( !(push_flag & PUSH_ALL) )
	5929	break;
	5930	}
	5931	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
	5932
	5933	return error;
	5934	}
	5935
	5936
	5937	/*
	5938	* sparse_cluster_add is called with the write behind lock held
	5939	*/
	5940	static void
	5941	sparse_cluster_add(void *scmap, vnode_t vp, struct cl_extent cl, off_t EOF, int (callback)(buf_t, void ), void *callback_arg)
	5942	{
	5943	u_int new_dirty;
	5944	u_int length;
	5945	off_t offset;
	5946
	5947	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
	5948
	5949	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	5950	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
	5951
	5952	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
	5953	/*
	5954	* no room left in the map
	5955	* only a partial update was done
	5956	* push out some pages and try again
	5957	*/
	5958	sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);
	5959
	5960	offset += (new_dirty * PAGE_SIZE_64);
	5961	length -= (new_dirty * PAGE_SIZE);
	5962	}
	5963	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
	5964	}
	5965
	5966
	5967	static int
	5968	cluster_align_phys_io(vnode_t vp, struct uio uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (callback)(buf_t, void ), void callback_arg)
	5969	{
	5970	upl_page_info_t *pl;
	5971	upl_t upl;
	5972	addr64_t ubc_paddr;
	5973	kern_return_t kret;
	5974	int error = 0;
	5975	int did_read = 0;
	5976	int abort_flags;
	5977	int upl_flags;
	5978	int bflag;
	5979
	5980	if (flags & IO_PASSIVE)
	5981	bflag = CL_PASSIVE;
	5982	else
	5983	bflag = 0;
	5984
	5985	if (flags & IO_NOCACHE)
	5986	bflag \|= CL_NOCACHE;
	5987
	5988	upl_flags = UPL_SET_LITE;
	5989
	5990	if ( !(flags & CL_READ) ) {
	5991	/*
	5992	* "write" operation: let the UPL subsystem know
	5993	* that we intend to modify the buffer cache pages
	5994	* we're gathering.
	5995	*/
	5996	upl_flags \|= UPL_WILL_MODIFY;
	5997	} else {
	5998	/*
	5999	* indicate that there is no need to pull the
	6000	* mapping for this page... we're only going
	6001	* to read from it, not modify it.
	6002	*/
	6003	upl_flags \|= UPL_FILE_IO;
	6004	}
	6005	kret = ubc_create_upl_kernel(vp,
	6006	uio->uio_offset & ~PAGE_MASK_64,
	6007	PAGE_SIZE,
	6008	&upl,
	6009	&pl,
	6010	upl_flags,
	6011	VM_KERN_MEMORY_FILE);
	6012
	6013	if (kret != KERN_SUCCESS)
	6014	return(EINVAL);
	6015
	6016	if (!upl_valid_page(pl, 0)) {
	6017	/*
	6018	* issue a synchronous read to cluster_io
	6019	*/
	6020	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	6021	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6022	if (error) {
	6023	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	6024
	6025	return(error);
	6026	}
	6027	did_read = 1;
	6028	}
	6029	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
	6030
	6031	/*
	6032	* NOTE: There is no prototype for the following in BSD. It, and the definitions
	6033	* of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	6034	* osfmk/ppc/mappings.h. They are not included here because there appears to be no
	6035	* way to do so without exporting them to kexts as well.
	6036	*/
	6037	if (flags & CL_READ)
	6038	// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsnk); /* Copy physical to physical and flush the destination */
	6039	copypv(ubc_paddr, usr_paddr, xsize, 2 \| 1 \| 4); /* Copy physical to physical and flush the destination */
	6040	else
	6041	// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsrc); /* Copy physical to physical and flush the source */
	6042	copypv(usr_paddr, ubc_paddr, xsize, 2 \| 1 \| 8); /* Copy physical to physical and flush the source */
	6043
	6044	if ( !(flags & CL_READ) \|\| (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
	6045	/*
	6046	* issue a synchronous write to cluster_io
	6047	*/
	6048	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	6049	bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6050	}
	6051	if (error == 0)
	6052	uio_update(uio, (user_size_t)xsize);
	6053
	6054	if (did_read)
	6055	abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	6056	else
	6057	abort_flags = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	6058
	6059	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
	6060
	6061	return (error);
	6062	}
	6063
	6064	int
	6065	cluster_copy_upl_data(struct uio uio, upl_t upl, int upl_offset, int io_resid)
	6066	{
	6067	int pg_offset;
	6068	int pg_index;
	6069	int csize;
	6070	int segflg;
	6071	int retval = 0;
	6072	int xsize;
	6073	upl_page_info_t *pl;
	6074	int dirty_count;
	6075
	6076	xsize = *io_resid;
	6077
	6078	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	6079	(int)uio->uio_offset, upl_offset, xsize, 0, 0);
	6080
	6081	segflg = uio->uio_segflg;
	6082
	6083	switch(segflg) {
	6084
	6085	case UIO_USERSPACE32:
	6086	case UIO_USERISPACE32:
	6087	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	6088	break;
	6089
	6090	case UIO_USERSPACE:
	6091	case UIO_USERISPACE:
	6092	uio->uio_segflg = UIO_PHYS_USERSPACE;
	6093	break;
	6094
	6095	case UIO_USERSPACE64:
	6096	case UIO_USERISPACE64:
	6097	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	6098	break;
	6099
	6100	case UIO_SYSSPACE:
	6101	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	6102	break;
	6103
	6104	}
	6105	pl = ubc_upl_pageinfo(upl);
	6106
	6107	pg_index = upl_offset / PAGE_SIZE;
	6108	pg_offset = upl_offset & PAGE_MASK;
	6109	csize = min(PAGE_SIZE - pg_offset, xsize);
	6110
	6111	dirty_count = 0;
	6112	while (xsize && retval == 0) {
	6113	addr64_t paddr;
	6114
	6115	paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
	6116	if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE))
	6117	dirty_count++;
	6118
	6119	retval = uiomove64(paddr, csize, uio);
	6120
	6121	pg_index += 1;
	6122	pg_offset = 0;
	6123	xsize -= csize;
	6124	csize = min(PAGE_SIZE, xsize);
	6125	}
	6126	*io_resid = xsize;
	6127
	6128	uio->uio_segflg = segflg;
	6129
	6130	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	6131	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6132	(int)uio->uio_offset, xsize, retval, segflg, 0);
	6133
	6134	return (retval);
	6135	}
	6136
	6137
	6138	int
	6139	cluster_copy_ubc_data(vnode_t vp, struct uio uio, int io_resid, int mark_dirty)
	6140	{
	6141
	6142	return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
	6143	}
	6144
	6145
	6146	static int
	6147	cluster_copy_ubc_data_internal(vnode_t vp, struct uio uio, int io_resid, int mark_dirty, int take_reference)
	6148	{
	6149	int segflg;
	6150	int io_size;
	6151	int xsize;
	6152	int start_offset;
	6153	int retval = 0;
	6154	memory_object_control_t control;
	6155
	6156	io_size = *io_resid;
	6157
	6158	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	6159	(int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
	6160
	6161	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	6162
	6163	if (control == MEMORY_OBJECT_CONTROL_NULL) {
	6164	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6165	(int)uio->uio_offset, io_size, retval, 3, 0);
	6166
	6167	return(0);
	6168	}
	6169	segflg = uio->uio_segflg;
	6170
	6171	switch(segflg) {
	6172
	6173	case UIO_USERSPACE32:
	6174	case UIO_USERISPACE32:
	6175	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	6176	break;
	6177
	6178	case UIO_USERSPACE64:
	6179	case UIO_USERISPACE64:
	6180	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	6181	break;
	6182
	6183	case UIO_USERSPACE:
	6184	case UIO_USERISPACE:
	6185	uio->uio_segflg = UIO_PHYS_USERSPACE;
	6186	break;
	6187
	6188	case UIO_SYSSPACE:
	6189	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	6190	break;
	6191	}
	6192
	6193	if ( (io_size = *io_resid) ) {
	6194	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	6195	xsize = uio_resid(uio);
	6196
	6197	retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
	6198	start_offset, io_size, mark_dirty, take_reference);
	6199	xsize -= uio_resid(uio);
	6200	io_size -= xsize;
	6201	}
	6202	uio->uio_segflg = segflg;
	6203	*io_resid = io_size;
	6204
	6205	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6206	(int)uio->uio_offset, io_size, retval, 0x80000000 \| segflg, 0);
	6207
	6208	return(retval);
	6209	}
	6210
	6211
	6212	int
	6213	is_file_clean(vnode_t vp, off_t filesize)
	6214	{
	6215	off_t f_offset;
	6216	int flags;
	6217	int total_dirty = 0;
	6218
	6219	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
	6220	if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
	6221	if (flags & UPL_POP_DIRTY) {
	6222	total_dirty++;
	6223	}
	6224	}
	6225	}
	6226	if (total_dirty)
	6227	return(EINVAL);
	6228
	6229	return (0);
	6230	}
	6231
	6232
	6233
	6234	/*
	6235	* Dirty region tracking/clustering mechanism.
	6236	*
	6237	* This code (vfs_drt_*) provides a mechanism for tracking and clustering
	6238	* dirty regions within a larger space (file). It is primarily intended to
	6239	* support clustering in large files with many dirty areas.
	6240	*
	6241	* The implementation assumes that the dirty regions are pages.
	6242	*
	6243	* To represent dirty pages within the file, we store bit vectors in a
	6244	* variable-size circular hash.
	6245	*/
	6246
	/*
	 * Bitvector size.  This determines the number of pages we group in a
	 * single hashtable entry.  Each hashtable entry is aligned to this
	 * size within the file.
	 */
	#define DRT_BITVECTOR_PAGES		((1024 * 1024) / PAGE_SIZE)

	/*
	 * File offset handling.
	 *
	 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
	 * the correct formula is  (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
	 */
	#define DRT_ADDRESS_MASK		(~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
	#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

	/*
	 * Hashtable address field handling.
	 *
	 * The low-order bits of the hashtable address are used to conserve
	 * space.
	 *
	 * DRT_HASH_COUNT_MASK must be large enough to store the range
	 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
	 * to indicate that the bucket is actually unoccupied.
	 *
	 * The statement-like macros are wrapped in do { } while (0) with no
	 * trailing semicolon, so that "MACRO(...);" is exactly one statement
	 * and remains safe in unbraced if/else bodies.
	 */
	#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
	#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
		do {												\
			(scm)->scm_hashtable[(i)].dhe_control =							\
			    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
		} while (0)
	#define DRT_HASH_COUNT_MASK		0x1ff
	#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
	#define DRT_HASH_SET_COUNT(scm, i, c)										\
		do {													\
			(scm)->scm_hashtable[(i)].dhe_control =								\
			    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
		} while (0)
	#define DRT_HASH_CLEAR(scm, i)						\
		do {								\
			(scm)->scm_hashtable[(i)].dhe_control =	0;		\
		} while (0)
	/* a bucket whose count field holds the all-ones value is unoccupied */
	#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
	#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
	#define DRT_HASH_COPY(oscm, oi, scm, i)									\
		do {												\
			(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
			DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
		} while (0)


	/*
	 * Hash table moduli.
	 *
	 * Since the hashtable entry's size is dependent on the size of
	 * the bitvector, and since the hashtable size is constrained to
	 * both being prime and fitting within the desired allocation
	 * size, these values need to be manually determined.
	 *
	 * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
	 *
	 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
	 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
	 */
	#define DRT_HASH_SMALL_MODULUS	23
	#define DRT_HASH_LARGE_MODULUS	401

	/*
	 * Physical memory required before the large hash modulus is permitted.
	 *
	 * On small memory systems, the large hash modulus can lead to physical
	 * memory starvation, so we avoid using it there.
	 */
	#define DRT_HASH_LARGE_MEMORY_REQUIRED	(1024LL * 1024LL * 1024LL)	/* 1GiB */

	#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
	#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
	6325
	6326	/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
	6327
	/*
	 * Hashtable bitvector handling.
	 *
	 * Bitvector fields are 32 bits long.
	 *
	 * The single-bit masks are built from an unsigned constant: bit 31 of a
	 * word is reachable (bit % 32 == 31), and shifting a signed 1 into the
	 * sign bit is undefined behaviour.
	 */

	#define DRT_HASH_SET_BIT(scm, i, bit)				\
		((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

	#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
		((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

	#define DRT_HASH_TEST_BIT(scm, i, bit)				\
		((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

	/* zero the DRT_BITVECTOR_PAGES bits of entry 'i' that are in use */
	#define DRT_BITVECTOR_CLEAR(scm, i)				\
		bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

	/* copy the in-use part of the bitvector from entry 'oi' of 'oscm' to entry 'i' of 'scm' */
	#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
		bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
		    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
		    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	6350
	6351
	6352
	/*
	 * Hashtable entry.
	 *
	 * dhe_control packs the aligned file address and the dirty-page count
	 * of the entry (see the DRT_HASH_* accessors).
	 *
	 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
	 * DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE)
	 * Since PAGE_SIZE is only known at boot time,
	 *	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
	 *	-declare dhe_bitvector array for largest possible length
	 *
	 * The macro body is fully parenthesized so that it can be used safely
	 * inside larger expressions.
	 */
	#define MAX_DRT_BITVECTOR_PAGES	((1024 * 1024) / (4 * 1024))

	struct vfs_drt_hashentry {
		u_int64_t	dhe_control;
		u_int32_t	dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
	};

	/*
	 * Dirty Region Tracking structure.
	 *
	 * The hashtable is allocated entirely inside the DRT structure.
	 *
	 * The hash is a simple circular prime modulus arrangement, the structure
	 * is resized from small to large if it overflows.
	 */

	struct vfs_drt_clustermap {
		u_int32_t		scm_magic;	/* sanity/detection */
	#define DRT_SCM_MAGIC		0x12020003
		u_int32_t		scm_modulus;	/* current ring size */
		u_int32_t		scm_buckets;	/* number of occupied buckets */
		u_int32_t		scm_lastclean;	/* last entry we cleaned */
		u_int32_t		scm_iskips;	/* number of slot skips */

		/* trailing table; its storage comes from the enclosing allocation */
		struct vfs_drt_hashentry scm_hashtable[];
	};


	/* initial slot for 'addr', and the slot following 'addr' on the ring */
	#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
	#define DRT_HASH_NEXT(scm, addr)	(((addr) + 1) % (scm)->scm_modulus)
	6392
	/*
	 * Debugging codes and arguments.
	 *
	 * The trailing comment on each code lists the arguments traced with it.
	 */
	#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
	#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
	#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
	#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
	#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* offset, length, dirty */
									/* 0, setcount */
									/* 1 (clean, no map) */
									/* 2 (map alloc fail) */
									/* 3, resid (partial) */
	#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
	#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets, lastclean, iskips */
	6409
	6410
	6411	static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
	6412	static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
	6413	static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	6414	u_int64_t offset, int *indexp);
	6415	static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	6416	u_int64_t offset,
	6417	int *indexp,
	6418	int recursed);
	6419	static kern_return_t vfs_drt_do_mark_pages(
	6420	void **cmapp,
	6421	u_int64_t offset,
	6422	u_int length,
	6423	u_int *setcountp,
	6424	int dirty);
	6425	static void vfs_drt_trace(
	6426	struct vfs_drt_clustermap *cmap,
	6427	int code,
	6428	int arg1,
	6429	int arg2,
	6430	int arg3,
	6431	int arg4);
	6432
	6433
	6434	/*
	6435	* Allocate and initialise a sparse cluster map.
	6436	*
	6437	* Will allocate a new map, resize or compact an existing map.
	6438	*
	6439	* XXX we should probably have at least one intermediate map size,
	6440	* as the 1:16 ratio seems a bit drastic.
	6441	*/
	6442	static kern_return_t
	6443	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
	6444	{
	6445	struct vfs_drt_clustermap cmap, ocmap;
	6446	kern_return_t kret;
	6447	u_int64_t offset;
	6448	u_int32_t i;
	6449	int nsize, active_buckets, index, copycount;
	6450
	6451	ocmap = NULL;
	6452	if (cmapp != NULL)
	6453	ocmap = *cmapp;
	6454
	6455	/*
	6456	* Decide on the size of the new map.
	6457	*/
	6458	if (ocmap == NULL) {
	6459	nsize = DRT_HASH_SMALL_MODULUS;
	6460	} else {
	6461	/* count the number of active buckets in the old map */
	6462	active_buckets = 0;
	6463	for (i = 0; i < ocmap->scm_modulus; i++) {
	6464	if (!DRT_HASH_VACANT(ocmap, i) &&
	6465	(DRT_HASH_GET_COUNT(ocmap, i) != 0))
	6466	active_buckets++;
	6467	}
	6468	/*
	6469	* If we're currently using the small allocation, check to
	6470	* see whether we should grow to the large one.
	6471	*/
	6472	if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
	6473	/*
	6474	* If the ring is nearly full and we are allowed to
	6475	* use the large modulus, upgrade.
	6476	*/
	6477	if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
	6478	(max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
	6479	nsize = DRT_HASH_LARGE_MODULUS;
	6480	} else {
	6481	nsize = DRT_HASH_SMALL_MODULUS;
	6482	}
	6483	} else {
	6484	/* already using the large modulus */
	6485	nsize = DRT_HASH_LARGE_MODULUS;
	6486	/*
	6487	* If the ring is completely full, there's
	6488	* nothing useful for us to do. Behave as
	6489	* though we had compacted into the new
	6490	* array and return.
	6491	*/
	6492	if (active_buckets >= DRT_HASH_LARGE_MODULUS)
	6493	return(KERN_SUCCESS);
	6494	}
	6495	}
	6496
	6497	/*
	6498	* Allocate and initialise the new map.
	6499	*/
	6500
	6501	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	6502	(nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
	6503	if (kret != KERN_SUCCESS)
	6504	return(kret);
	6505	cmap->scm_magic = DRT_SCM_MAGIC;
	6506	cmap->scm_modulus = nsize;
	6507	cmap->scm_buckets = 0;
	6508	cmap->scm_lastclean = 0;
	6509	cmap->scm_iskips = 0;
	6510	for (i = 0; i < cmap->scm_modulus; i++) {
	6511	DRT_HASH_CLEAR(cmap, i);
	6512	DRT_HASH_VACATE(cmap, i);
	6513	DRT_BITVECTOR_CLEAR(cmap, i);
	6514	}
	6515
	6516	/*
	6517	* If there's an old map, re-hash entries from it into the new map.
	6518	*/
	6519	copycount = 0;
	6520	if (ocmap != NULL) {
	6521	for (i = 0; i < ocmap->scm_modulus; i++) {
	6522	/* skip empty buckets */
	6523	if (DRT_HASH_VACANT(ocmap, i) \|\|
	6524	(DRT_HASH_GET_COUNT(ocmap, i) == 0))
	6525	continue;
	6526	/* get new index */
	6527	offset = DRT_HASH_GET_ADDRESS(ocmap, i);
	6528	kret = vfs_drt_get_index(&cmap, offset, &index, 1);
	6529	if (kret != KERN_SUCCESS) {
	6530	/* XXX need to bail out gracefully here */
	6531	panic("vfs_drt: new cluster map mysteriously too small");
	6532	index = 0;
	6533	}
	6534	/* copy */
	6535	DRT_HASH_COPY(ocmap, i, cmap, index);
	6536	copycount++;
	6537	}
	6538	}
	6539
	6540	/* log what we've done */
	6541	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
	6542
	6543	/*
	6544	* It's important to ensure that *cmapp always points to
	6545	* a valid map, so we must overwrite it before freeing
	6546	* the old map.
	6547	*/
	6548	*cmapp = cmap;
	6549	if (ocmap != NULL) {
	6550	/* emit stats into trace buffer */
	6551	vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
	6552	ocmap->scm_modulus,
	6553	ocmap->scm_buckets,
	6554	ocmap->scm_lastclean,
	6555	ocmap->scm_iskips);
	6556
	6557	vfs_drt_free_map(ocmap);
	6558	}
	6559	return(KERN_SUCCESS);
	6560	}
	6561
	6562
	6563	/*
	6564	* Free a sparse cluster map.
	6565	*/
	6566	static kern_return_t
	6567	vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
	6568	{
	6569	kmem_free(kernel_map, (vm_offset_t)cmap,
	6570	(cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	6571	return(KERN_SUCCESS);
	6572	}
	6573
	6574
	6575	/*
	6576	* Find the hashtable slot currently occupied by an entry for the supplied offset.
	6577	*/
	6578	static kern_return_t
	6579	vfs_drt_search_index(struct vfs_drt_clustermap cmap, u_int64_t offset, int indexp)
	6580	{
	6581	int index;
	6582	u_int32_t i;
	6583
	6584	offset = DRT_ALIGN_ADDRESS(offset);
	6585	index = DRT_HASH(cmap, offset);
	6586
	6587	/* traverse the hashtable */
	6588	for (i = 0; i < cmap->scm_modulus; i++) {
	6589
	6590	/*
	6591	* If the slot is vacant, we can stop.
	6592	*/
	6593	if (DRT_HASH_VACANT(cmap, index))
	6594	break;
	6595
	6596	/*
	6597	* If the address matches our offset, we have success.
	6598	*/
	6599	if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
	6600	*indexp = index;
	6601	return(KERN_SUCCESS);
	6602	}
	6603
	6604	/*
	6605	* Move to the next slot, try again.
	6606	*/
	6607	index = DRT_HASH_NEXT(cmap, index);
	6608	}
	6609	/*
	6610	* It's not there.
	6611	*/
	6612	return(KERN_FAILURE);
	6613	}
	6614
	6615	/*
	6616	* Find the hashtable slot for the supplied offset. If we haven't allocated
	6617	* one yet, allocate one and populate the address field. Note that it will
	6618	* not have a nonzero page count and thus will still technically be free, so
	6619	* in the case where we are called to clean pages, the slot will remain free.
	6620	*/
	6621	static kern_return_t
	6622	vfs_drt_get_index(struct vfs_drt_clustermap *cmapp, u_int64_t offset, int indexp, int recursed)
	6623	{
	6624	struct vfs_drt_clustermap *cmap;
	6625	kern_return_t kret;
	6626	u_int32_t index;
	6627	u_int32_t i;
	6628
	6629	cmap = *cmapp;
	6630
	6631	/* look for an existing entry */
	6632	kret = vfs_drt_search_index(cmap, offset, indexp);
	6633	if (kret == KERN_SUCCESS)
	6634	return(kret);
	6635
	6636	/* need to allocate an entry */
	6637	offset = DRT_ALIGN_ADDRESS(offset);
	6638	index = DRT_HASH(cmap, offset);
	6639
	6640	/* scan from the index forwards looking for a vacant slot */
	6641	for (i = 0; i < cmap->scm_modulus; i++) {
	6642	/* slot vacant? */
	6643	if (DRT_HASH_VACANT(cmap, index) \|\| DRT_HASH_GET_COUNT(cmap,index) == 0) {
	6644	cmap->scm_buckets++;
	6645	if (index < cmap->scm_lastclean)
	6646	cmap->scm_lastclean = index;
	6647	DRT_HASH_SET_ADDRESS(cmap, index, offset);
	6648	DRT_HASH_SET_COUNT(cmap, index, 0);
	6649	DRT_BITVECTOR_CLEAR(cmap, index);
	6650	*indexp = index;
	6651	vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
	6652	return(KERN_SUCCESS);
	6653	}
	6654	cmap->scm_iskips += i;
	6655	index = DRT_HASH_NEXT(cmap, index);
	6656	}
	6657
	6658	/*
	6659	* We haven't found a vacant slot, so the map is full. If we're not
	6660	* already recursed, try reallocating/compacting it.
	6661	*/
	6662	if (recursed)
	6663	return(KERN_FAILURE);
	6664	kret = vfs_drt_alloc_map(cmapp);
	6665	if (kret == KERN_SUCCESS) {
	6666	/* now try to insert again */
	6667	kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	6668	}
	6669	return(kret);
	6670	}
	6671
	6672	/*
	6673	* Implementation of set dirty/clean.
	6674	*
	6675	* In the 'clean' case, not finding a map is OK.
	6676	*/
	6677	static kern_return_t
	6678	vfs_drt_do_mark_pages(
	6679	void **private,
	6680	u_int64_t offset,
	6681	u_int length,
	6682	u_int *setcountp,
	6683	int dirty)
	6684	{
	6685	struct vfs_drt_clustermap cmap, *cmapp;
	6686	kern_return_t kret;
	6687	int i, index, pgoff, pgcount, setcount, ecount;
	6688
	6689	cmapp = (struct vfs_drt_clustermap **)private;
	6690	cmap = *cmapp;
	6691
	6692	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
	6693
	6694	if (setcountp != NULL)
	6695	*setcountp = 0;
	6696
	6697	/* allocate a cluster map if we don't already have one */
	6698	if (cmap == NULL) {
	6699	/* no cluster map, nothing to clean */
	6700	if (!dirty) {
	6701	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 1, 0, 0, 0);
	6702	return(KERN_SUCCESS);
	6703	}
	6704	kret = vfs_drt_alloc_map(cmapp);
	6705	if (kret != KERN_SUCCESS) {
	6706	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 2, 0, 0, 0);
	6707	return(kret);
	6708	}
	6709	}
	6710	setcount = 0;
	6711
	6712	/*
	6713	* Iterate over the length of the region.
	6714	*/
	6715	while (length > 0) {
	6716	/*
	6717	* Get the hashtable index for this offset.
	6718	*
	6719	* XXX this will add blank entries if we are clearing a range
	6720	* that hasn't been dirtied.
	6721	*/
	6722	kret = vfs_drt_get_index(cmapp, offset, &index, 0);
	6723	cmap = cmapp; / may have changed! */
	6724	/* this may be a partial-success return */
	6725	if (kret != KERN_SUCCESS) {
	6726	if (setcountp != NULL)
	6727	*setcountp = setcount;
	6728	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 3, (int)length, 0, 0);
	6729
	6730	return(kret);
	6731	}
	6732
	6733	/*
	6734	* Work out how many pages we're modifying in this
	6735	* hashtable entry.
	6736	*/
	6737	pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
	6738	pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
	6739
	6740	/*
	6741	* Iterate over pages, dirty/clearing as we go.
	6742	*/
	6743	ecount = DRT_HASH_GET_COUNT(cmap, index);
	6744	for (i = 0; i < pgcount; i++) {
	6745	if (dirty) {
	6746	if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	6747	DRT_HASH_SET_BIT(cmap, index, pgoff + i);
	6748	ecount++;
	6749	setcount++;
	6750	}
	6751	} else {
	6752	if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	6753	DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
	6754	ecount--;
	6755	setcount++;
	6756	}
	6757	}
	6758	}
	6759	DRT_HASH_SET_COUNT(cmap, index, ecount);
	6760
	6761	offset += pgcount * PAGE_SIZE;
	6762	length -= pgcount * PAGE_SIZE;
	6763	}
	6764	if (setcountp != NULL)
	6765	*setcountp = setcount;
	6766
	6767	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 0, setcount, 0, 0);
	6768
	6769	return(KERN_SUCCESS);
	6770	}
	6771
	6772	/*
	6773	* Mark a set of pages as dirty/clean.
	6774	*
	6775	* This is a public interface.
	6776	*
	6777	* cmapp
	6778	* Pointer to storage suitable for holding a pointer. Note that
	6779	* this must either be NULL or a value set by this function.
	6780	*
	6781	* size
	6782	* Current file size in bytes.
	6783	*
	6784	* offset
	6785	* Offset of the first page to be marked as dirty, in bytes. Must be
	6786	* page-aligned.
	6787	*
	6788	* length
	6789	* Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
	6790	*
	6791	* setcountp
	6792	* Number of pages newly marked dirty by this call (optional).
	6793	*
	6794	* Returns KERN_SUCCESS if all the pages were successfully marked.
	6795	*/
	6796	static kern_return_t
	6797	vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, u_int setcountp)
	6798	{
	6799	/* XXX size unused, drop from interface */
	6800	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
	6801	}
	6802
	#if 0
	/*
	 * Counterpart of vfs_drt_mark_pages that clears the range
	 * (dirty == 0, no set count reported).  Currently compiled out.
	 */
	static kern_return_t
	vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
	{
		kern_return_t	kret;

		kret = vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);

		return kret;
	}
	#endif
	6810
	6811	/*
	6812	* Get a cluster of dirty pages.
	6813	*
	6814	* This is a public interface.
	6815	*
	6816	* cmapp
	6817	* Pointer to storage managed by drt_mark_pages. Note that this must
	6818	* be NULL or a value set by drt_mark_pages.
	6819	*
	6820	* offsetp
	6821	* Returns the byte offset into the file of the first page in the cluster.
	6822	*
	6823	* lengthp
	6824	* Returns the length in bytes of the cluster of dirty pages.
	6825	*
	6826	* Returns success if a cluster was found. If KERN_FAILURE is returned, there
	6827	* are no dirty pages meeting the minmum size criteria. Private storage will
	6828	* be released if there are no more dirty pages left in the map
	6829	*
	6830	*/
	6831	static kern_return_t
	6832	vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp)
	6833	{
	6834	struct vfs_drt_clustermap *cmap;
	6835	u_int64_t offset;
	6836	u_int length;
	6837	u_int32_t j;
	6838	int index, i, fs, ls;
	6839
	6840	/* sanity */
	6841	if ((cmapp == NULL) \|\| (*cmapp == NULL))
	6842	return(KERN_FAILURE);
	6843	cmap = *cmapp;
	6844
	6845	/* walk the hashtable */
	6846	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
	6847	index = DRT_HASH(cmap, offset);
	6848
	6849	if (DRT_HASH_VACANT(cmap, index) \|\| (DRT_HASH_GET_COUNT(cmap, index) == 0))
	6850	continue;
	6851
	6852	/* scan the bitfield for a string of bits */
	6853	fs = -1;
	6854
	6855	for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
	6856	if (DRT_HASH_TEST_BIT(cmap, index, i)) {
	6857	fs = i;
	6858	break;
	6859	}
	6860	}
	6861	if (fs == -1) {
	6862	/* didn't find any bits set */
	6863	panic("vfs_drt: entry summary count > 0 but no bits set in map");
	6864	}
	6865	for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
	6866	if (!DRT_HASH_TEST_BIT(cmap, index, i))
	6867	break;
	6868	}
	6869
	6870	/* compute offset and length, mark pages clean */
	6871	offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
	6872	length = ls * PAGE_SIZE;
	6873	vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
	6874	cmap->scm_lastclean = index;
	6875
	6876	/* return successful */
	6877	*offsetp = (off_t)offset;
	6878	*lengthp = length;
	6879
	6880	vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
	6881	return(KERN_SUCCESS);
	6882	}
	6883	/*
	6884	* We didn't find anything... hashtable is empty
	6885	* emit stats into trace buffer and
	6886	* then free it
	6887	*/
	6888	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	6889	cmap->scm_modulus,
	6890	cmap->scm_buckets,
	6891	cmap->scm_lastclean,
	6892	cmap->scm_iskips);
	6893
	6894	vfs_drt_free_map(cmap);
	6895	*cmapp = NULL;
	6896
	6897	return(KERN_FAILURE);
	6898	}
	6899
	6900
	6901	static kern_return_t
	6902	vfs_drt_control(void **cmapp, int op_type)
	6903	{
	6904	struct vfs_drt_clustermap *cmap;
	6905
	6906	/* sanity */
	6907	if ((cmapp == NULL) \|\| (*cmapp == NULL))
	6908	return(KERN_FAILURE);
	6909	cmap = *cmapp;
	6910
	6911	switch (op_type) {
	6912	case 0:
	6913	/* emit stats into trace buffer */
	6914	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	6915	cmap->scm_modulus,
	6916	cmap->scm_buckets,
	6917	cmap->scm_lastclean,
	6918	cmap->scm_iskips);
	6919
	6920	vfs_drt_free_map(cmap);
	6921	*cmapp = NULL;
	6922	break;
	6923
	6924	case 1:
	6925	cmap->scm_lastclean = 0;
	6926	break;
	6927	}
	6928	return(KERN_SUCCESS);
	6929	}
	6930
	6931
	6932
	6933	/*
	6934	* Emit a summary of the state of the clustermap into the trace buffer
	6935	* along with some caller-provided data.
	6936	*/
	6937	#if KDEBUG
	6938	static void
	6939	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
	6940	{
	6941	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
	6942	}
	6943	#else
	6944	static void
	6945	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	6946	__unused int arg1, __unused int arg2, __unused int arg3,
	6947	__unused int arg4)
	6948	{
	6949	}
	6950	#endif
	6951
	6952	#if 0
	6953	/*
	6954	* Perform basic sanity check on the hash entry summary count
	6955	* vs. the actual bits set in the entry.
	6956	*/
	6957	static void
	6958	vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
	6959	{
	6960	int index, i;
	6961	int bits_on;
	6962
	6963	for (index = 0; index < cmap->scm_modulus; index++) {
	6964	if (DRT_HASH_VACANT(cmap, index))
	6965	continue;
	6966
	6967	for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
	6968	if (DRT_HASH_TEST_BIT(cmap, index, i))
	6969	bits_on++;
	6970	}
	6971	if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
	6972	panic("bits_on = %d, index = %d\n", bits_on, index);
	6973	}
	6974	}
	6975	#endif