git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1993
	31	* The Regents of the University of California. All rights reserved.
	32	*
	33	* Redistribution and use in source and binary forms, with or without
	34	* modification, are permitted provided that the following conditions
	35	* are met:
	36	* 1. Redistributions of source code must retain the above copyright
	37	* notice, this list of conditions and the following disclaimer.
	38	* 2. Redistributions in binary form must reproduce the above copyright
	39	* notice, this list of conditions and the following disclaimer in the
	40	* documentation and/or other materials provided with the distribution.
	41	* 3. All advertising materials mentioning features or use of this software
	42	* must display the following acknowledgement:
	43	* This product includes software developed by the University of
	44	* California, Berkeley and its contributors.
	45	* 4. Neither the name of the University nor the names of its contributors
	46	* may be used to endorse or promote products derived from this software
	47	* without specific prior written permission.
	48	*
	49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	59	* SUCH DAMAGE.
	60	*
	61	* @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
	62	*/
	63
	64	#include <sys/param.h>
	65	#include <sys/proc_internal.h>
	66	#include <sys/buf_internal.h>
	67	#include <sys/mount_internal.h>
	68	#include <sys/vnode_internal.h>
	69	#include <sys/trace.h>
	70	#include <kern/kalloc.h>
	71	#include <sys/time.h>
	72	#include <sys/kernel.h>
	73	#include <sys/resourcevar.h>
	74	#include <miscfs/specfs/specdev.h>
	75	#include <sys/uio_internal.h>
	76	#include <libkern/libkern.h>
	77	#include <machine/machine_routines.h>
	78
	79	#include <sys/ubc_internal.h>
	80	#include <vm/vnode_pager.h>
	81
	82	#include <mach/mach_types.h>
	83	#include <mach/memory_object_types.h>
	84	#include <mach/vm_map.h>
	85	#include <mach/upl.h>
	86	#include <kern/task.h>
	87	#include <kern/policy_internal.h>
	88
	89	#include <vm/vm_kern.h>
	90	#include <vm/vm_map.h>
	91	#include <vm/vm_pageout.h>
	92	#include <vm/vm_fault.h>
	93
	94	#include <sys/kdebug.h>
	95	#include <libkern/OSAtomic.h>
	96
	97	#include <sys/sdt.h>
	98
	99	#include <stdbool.h>
	100
	101	#include <vfs/vfs_disk_conditioner.h>
	102
	103	#if 0
	104	#undef KERNEL_DEBUG
	105	#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
	106	#endif
	107
	108
	/*
	 * CL_* flag bits.  Every constant below is a distinct single bit.
	 */
	#define CL_READ           0x00001
	#define CL_WRITE          0x00002
	#define CL_ASYNC          0x00004
	#define CL_COMMIT         0x00008
	#define CL_PAGEOUT        0x00010
	#define CL_AGE            0x00020
	#define CL_NOZERO         0x00040
	#define CL_PAGEIN         0x00080
	#define CL_DEV_MEMORY     0x00100
	#define CL_PRESERVE       0x00200
	#define CL_THROTTLE       0x00400
	#define CL_KEEPCACHED     0x00800
	#define CL_DIRECT_IO      0x01000
	#define CL_PASSIVE        0x02000
	#define CL_IOSTREAMING    0x04000
	#define CL_CLOSE          0x08000
	#define CL_ENCRYPTED      0x10000
	#define CL_RAW_ENCRYPTED  0x20000
	#define CL_NOCACHE        0x40000

	/* Limits for vector UPLs. */
	#define MAX_VECTOR_UPL_ELEMENTS  8
	#define MAX_VECTOR_UPL_SIZE      (2 * MAX_UPL_SIZE_BYTES)

	/*
	 * Sentinel stored in the last buffer's b_trans_next by cluster_wait_IO
	 * and recognised by cluster_iodone, which then issues a wakeup.
	 */
	#define CLUSTER_IO_WAITING       ((buf_t)1)
	133
	134	extern upl_t vector_upl_create(vm_offset_t);
	135	extern boolean_t vector_upl_is_valid(upl_t);
	136	extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
	137	extern void vector_upl_set_pagelist(upl_t);
	138	extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
	139
	140	struct clios {
	141	lck_mtx_t io_mtxp;
	142	u_int io_completed; /* amount of io that has currently completed */
	143	u_int io_issued; /* amount of io that was successfully issued */
	144	int io_error; /* error code of first error encountered */
	145	int io_wanted; /* someone is sleeping waiting for a change in state */
	146	};
	147
	148	struct cl_direct_read_lock {
	149	LIST_ENTRY(cl_direct_read_lock) chain;
	150	int32_t ref_count;
	151	vnode_t vp;
	152	lck_rw_t rw_lock;
	153	};
	154
	155	#define CL_DIRECT_READ_LOCK_BUCKETS 61
	156
	157	static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
	158	cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
	159
	160	static LCK_GRP_DECLARE(cl_mtx_grp, "cluster I/O");
	161	static LCK_MTX_DECLARE(cl_transaction_mtxp, &cl_mtx_grp);
	162	static LCK_SPIN_DECLARE(cl_direct_read_spin_lock, &cl_mtx_grp);
	163
	164	static ZONE_DECLARE(cl_rd_zone, "cluster_read",
	165	sizeof(struct cl_readahead), ZC_ZFREE_CLEARMEM \| ZC_NOENCRYPT);
	166
	167	static ZONE_DECLARE(cl_wr_zone, "cluster_write",
	168	sizeof(struct cl_writebehind), ZC_ZFREE_CLEARMEM \| ZC_NOENCRYPT);
	169
	170	#define IO_UNKNOWN 0
	171	#define IO_DIRECT 1
	172	#define IO_CONTIG 2
	173	#define IO_COPY 3
	174
	175	#define PUSH_DELAY 0x01
	176	#define PUSH_ALL 0x02
	177	#define PUSH_SYNC 0x04
	178
	179
	180	static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
	181	static void cluster_wait_IO(buf_t cbp_head, int async);
	182	static void cluster_complete_transaction(buf_t cbp_head, void callback_arg, int *retval, int flags, int needwait);
	183
	184	static int cluster_io_type(struct uio uio, int io_type, u_int32_t *io_length, u_int32_t min_length);
	185
	186	static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	187	int flags, buf_t real_bp, struct clios iostate, int ()(buf_t, void ), void callback_arg);
	188	static int cluster_iodone(buf_t bp, void *callback_arg);
	189	static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
	190	static int cluster_is_throttled(vnode_t vp);
	191
	192	static void cluster_iostate_wait(struct clios iostate, u_int target, const char wait_name);
	193
	194	static void cluster_syncup(vnode_t vp, off_t newEOF, int ()(buf_t, void ), void *callback_arg, int flags);
	195
	196	static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
	197	static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio uio, int io_resid, int mark_dirty, int take_reference);
	198
	199	static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
	200	int ()(buf_t, void ), void *callback_arg) __attribute__((noinline));
	201	static int cluster_read_direct(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	202	int flags, int ()(buf_t, void ), void *callback_arg) __attribute__((noinline));
	203	static int cluster_read_contig(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	204	int ()(buf_t, void ), void *callback_arg, int flags) __attribute__((noinline));
	205
	206	static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
	207	off_t headOff, off_t tailOff, int flags, int ()(buf_t, void ), void *callback_arg) __attribute__((noinline));
	208	static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
	209	int write_type, u_int32_t write_length, int flags, int ()(buf_t, void ), void *callback_arg) __attribute__((noinline));
	210	static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
	211	int write_type, u_int32_t write_length, int ()(buf_t, void ), void *callback_arg, int bflag) __attribute__((noinline));
	212
	213	static void cluster_update_state_internal(vnode_t vp, struct cl_extent cl, int flags, boolean_t defer_writes, boolean_t first_pass,
	214	off_t write_off, int write_cnt, off_t newEOF, int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated);
	215
	216	static int cluster_align_phys_io(vnode_t vp, struct uio uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int ()(buf_t, void ), void callback_arg);
	217
	218	static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (callback)(buf_t, void ), void *callback_arg, int bflag);
	219	static void cluster_read_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead ra,
	220	int (callback)(buf_t, void ), void *callback_arg, int bflag);
	221
	222	static int cluster_push_now(vnode_t vp, struct cl_extent , off_t EOF, int flags, int ()(buf_t, void ), void callback_arg, boolean_t vm_ioitiated);
	223
	224	static int cluster_try_push(struct cl_writebehind , vnode_t vp, off_t EOF, int push_flag, int flags, int ()(buf_t, void *),
	225	void callback_arg, int err, boolean_t vm_initiated);
	226
	227	static int sparse_cluster_switch(struct cl_writebehind , vnode_t vp, off_t EOF, int ()(buf_t, void ), void callback_arg, boolean_t vm_initiated);
	228	static int sparse_cluster_push(struct cl_writebehind , void *cmapp, vnode_t vp, off_t EOF, int push_flag,
	229	int io_flags, int ()(buf_t, void ), void *callback_arg, boolean_t vm_initiated);
	230	static int sparse_cluster_add(struct cl_writebehind , void cmapp, vnode_t vp, struct cl_extent , off_t EOF,
	231	int ()(buf_t, void ), void *callback_arg, boolean_t vm_initiated);
	232
	233	static kern_return_t vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, u_int setcountp);
	234	static kern_return_t vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp);
	235	static kern_return_t vfs_drt_control(void **cmapp, int op_type);
	236	static kern_return_t vfs_get_scmap_push_behavior_internal(void *cmapp, int push_flag);
	237
	238
	239	/*
	240	* For throttled IO to check whether
	241	* a block is cached by the boot cache
	242	* and thus it can avoid delaying the IO.
	243	*
	244	* bootcache_contains_block is initially
	245	* NULL. The BootCache will set it while
	246	* the cache is active and clear it when
	247	* the cache is jettisoned.
	248	*
	249	* Returns 0 if the block is not
	250	* contained in the cache, 1 if it is
	251	* contained.
	252	*
	253	* The function pointer remains valid
	254	* after the cache has been evicted even
	255	* if bootcache_contains_block has been
	256	* cleared.
	257	*
	258	* See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
	259	*/
	260	int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
	261
	262
	263	/*
	264	* limit the internal I/O size so that we
	265	* can represent it in a 32 bit int
	266	*/
	267	#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
	268	#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
	269	#define MAX_VECTS 16
	270	/*
	271	* The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
	272	* allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
	273	* we have not historically allowed the write to bypass the UBC.
	274	*/
	275	#define MIN_DIRECT_WRITE_SIZE (16384)
	276
	277	#define WRITE_THROTTLE 6
	278	#define WRITE_THROTTLE_SSD 2
	279	#define WRITE_BEHIND 1
	280	#define WRITE_BEHIND_SSD 1
	281
	282	#if !defined(XNU_TARGET_OS_OSX)
	283	#define PREFETCH 1
	284	#define PREFETCH_SSD 1
	285	uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a specluative read-ahead */
	286	uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead */
	287	#else /* XNU_TARGET_OS_OSX */
	288	#define PREFETCH 3
	289	#define PREFETCH_SSD 2
	290	uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */
	291	uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/
	292	#endif /* ! XNU_TARGET_OS_OSX */
	293
	294
	295	#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
	296	#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
	297	#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
	298
	299	int speculative_reads_disabled = 0;
	300
	301	/*
	302	* throttle the number of async writes that
	303	* can be outstanding on a single vnode
	304	* before we issue a synchronous write
	305	*/
	306	#define THROTTLE_MAXCNT 0
	307
	308	uint32_t throttle_max_iosize = (128 * 1024);
	309
	310	#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
	311
	312	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
	313
	314
	315	void
	316	cluster_init(void)
	317	{
	318	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
	319	LIST_INIT(&cl_direct_read_locks[i]);
	320	}
	321	}
	322
	323
	324	uint32_t
	325	cluster_max_io_size(mount_t mp, int type)
	326	{
	327	uint32_t max_io_size;
	328	uint32_t segcnt;
	329	uint32_t maxcnt;
	330
	331	switch (type) {
	332	case CL_READ:
	333	segcnt = mp->mnt_segreadcnt;
	334	maxcnt = mp->mnt_maxreadcnt;
	335	break;
	336	case CL_WRITE:
	337	segcnt = mp->mnt_segwritecnt;
	338	maxcnt = mp->mnt_maxwritecnt;
	339	break;
	340	default:
	341	segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
	342	maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
	343	break;
	344	}
	345	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
	346	/*
	347	* don't allow a size beyond the max UPL size we can create
	348	*/
	349	segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	350	}
	351	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
	352
	353	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
	354	/*
	355	* don't allow a size smaller than the old fixed limit
	356	*/
	357	max_io_size = MAX_UPL_TRANSFER_BYTES;
	358	} else {
	359	/*
	360	* make sure the size specified is a multiple of PAGE_SIZE
	361	*/
	362	max_io_size &= ~PAGE_MASK;
	363	}
	364	return max_io_size;
	365	}
	366
	367
	368
	369
	/* CLW_* flag bits; cluster_get_wbp() tests CLW_ALLOCATE and CLW_RETURNLOCKED. */
	#define CLW_ALLOCATE      0x01
	#define CLW_RETURNLOCKED  0x02
	#define CLW_IONOCACHE     0x04
	#define CLW_IOPASSIVE     0x08
	374
	375	/*
	376	* if the read ahead context doesn't yet exist,
	377	* allocate and initialize it...
	378	* the vnode lock serializes multiple callers
	379	* during the actual assignment... first one
	380	* to grab the lock wins... the other callers
	381	* will release the now unnecessary storage
	382	*
	383	* once the context is present, try to grab (but don't block on)
	384	* the lock associated with it... if someone
	385	* else currently owns it, than the read
	386	* will run without read-ahead. this allows
	387	* multiple readers to run in parallel and
	388	* since there's only 1 read ahead context,
	389	* there's no real loss in only allowing 1
	390	* reader to have read-ahead enabled.
	391	*/
	392	static struct cl_readahead *
	393	cluster_get_rap(vnode_t vp)
	394	{
	395	struct ubc_info *ubc;
	396	struct cl_readahead *rap;
	397
	398	ubc = vp->v_ubcinfo;
	399
	400	if ((rap = ubc->cl_rahead) == NULL) {
	401	rap = zalloc_flags(cl_rd_zone, Z_WAITOK \| Z_ZERO);
	402	rap->cl_lastr = -1;
	403	lck_mtx_init(&rap->cl_lockr, &cl_mtx_grp, LCK_ATTR_NULL);
	404
	405	vnode_lock(vp);
	406
	407	if (ubc->cl_rahead == NULL) {
	408	ubc->cl_rahead = rap;
	409	} else {
	410	lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
	411	zfree(cl_rd_zone, rap);
	412	rap = ubc->cl_rahead;
	413	}
	414	vnode_unlock(vp);
	415	}
	416	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
	417	return rap;
	418	}
	419
	420	return (struct cl_readahead *)NULL;
	421	}
	422
	423
	424	/*
	425	* if the write behind context doesn't yet exist,
	426	* and CLW_ALLOCATE is specified, allocate and initialize it...
	427	* the vnode lock serializes multiple callers
	428	* during the actual assignment... first one
	429	* to grab the lock wins... the other callers
	430	* will release the now unnecessary storage
	431	*
	432	* if CLW_RETURNLOCKED is set, grab (blocking if necessary)
	433	* the lock associated with the write behind context before
	434	* returning
	435	*/
	436
	437	static struct cl_writebehind *
	438	cluster_get_wbp(vnode_t vp, int flags)
	439	{
	440	struct ubc_info *ubc;
	441	struct cl_writebehind *wbp;
	442
	443	ubc = vp->v_ubcinfo;
	444
	445	if ((wbp = ubc->cl_wbehind) == NULL) {
	446	if (!(flags & CLW_ALLOCATE)) {
	447	return (struct cl_writebehind *)NULL;
	448	}
	449
	450	wbp = zalloc_flags(cl_wr_zone, Z_WAITOK \| Z_ZERO);
	451
	452	lck_mtx_init(&wbp->cl_lockw, &cl_mtx_grp, LCK_ATTR_NULL);
	453
	454	vnode_lock(vp);
	455
	456	if (ubc->cl_wbehind == NULL) {
	457	ubc->cl_wbehind = wbp;
	458	} else {
	459	lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
	460	zfree(cl_wr_zone, wbp);
	461	wbp = ubc->cl_wbehind;
	462	}
	463	vnode_unlock(vp);
	464	}
	465	if (flags & CLW_RETURNLOCKED) {
	466	lck_mtx_lock(&wbp->cl_lockw);
	467	}
	468
	469	return wbp;
	470	}
	471
	472
	473	static void
	474	cluster_syncup(vnode_t vp, off_t newEOF, int (callback)(buf_t, void ), void *callback_arg, int flags)
	475	{
	476	struct cl_writebehind *wbp;
	477
	478	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
	479	if (wbp->cl_number) {
	480	lck_mtx_lock(&wbp->cl_lockw);
	481
	482	cluster_try_push(wbp, vp, newEOF, PUSH_ALL \| flags, 0, callback, callback_arg, NULL, FALSE);
	483
	484	lck_mtx_unlock(&wbp->cl_lockw);
	485	}
	486	}
	487	}
	488
	489
	490	static int
	491	cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
	492	{
	493	daddr64_t blkno;
	494	size_t io_size;
	495	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
	496
	497	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
	498	if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ \| VNODE_BLOCKMAP_NO_TRACK, NULL)) {
	499	return 0;
	500	}
	501
	502	if (io_size == 0) {
	503	return 0;
	504	}
	505
	506	if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
	507	return 1;
	508	}
	509	}
	510	return 0;
	511	}
	512
	513
	514	static int
	515	cluster_is_throttled(vnode_t vp)
	516	{
	517	return throttle_io_will_be_throttled(-1, vp->v_mount);
	518	}
	519
	520
	521	static void
	522	cluster_iostate_wait(struct clios iostate, u_int target, const char wait_name)
	523	{
	524	lck_mtx_lock(&iostate->io_mtxp);
	525
	526	while ((iostate->io_issued - iostate->io_completed) > target) {
	527	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) \| DBG_FUNC_START,
	528	iostate->io_issued, iostate->io_completed, target, 0, 0);
	529
	530	iostate->io_wanted = 1;
	531	msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
	532
	533	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) \| DBG_FUNC_END,
	534	iostate->io_issued, iostate->io_completed, target, 0, 0);
	535	}
	536	lck_mtx_unlock(&iostate->io_mtxp);
	537	}
	538
	539	static void
	540	cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
	541	upl_offset_t upl_offset, upl_size_t size)
	542	{
	543	if (!size) {
	544	return;
	545	}
	546
	547	upl_t associated_upl = upl_associated_upl(upl);
	548
	549	if (!associated_upl) {
	550	return;
	551	}
	552
	553	#if 0
	554	printf("1: %d %d\n", upl_offset, upl_offset + size);
	555	#endif
	556
	557	/*
	558	* The associated UPL is page aligned to file offsets whereas the
	559	* UPL it's attached to has different alignment requirements. The
	560	* upl_offset that we have refers to @upl. The code that follows
	561	* has to deal with the first and last pages in this transaction
	562	* which might straddle pages in the associated UPL. To keep
	563	* track of these pages, we use the mark bits: if the mark bit is
	564	* set, we know another transaction has completed its part of that
	565	* page and so we can unlock that page here.
	566	*
	567	* The following illustrates what we have to deal with:
	568	*
	569	* MEM u <------------ 1 PAGE ------------> e
	570	* +-------------+----------------------+-----------------
	571	* \| \|######################\|#################
	572	* +-------------+----------------------+-----------------
	573	* FILE \| <--- a ---> o <------------ 1 PAGE ------------>
	574	*
	575	* So here we show a write to offset @o. The data that is to be
	576	* written is in a buffer that is not page aligned; it has offset
	577	* @a in the page. The upl that carries the data starts in memory
	578	* at @u. The associated upl starts in the file at offset @o. A
	579	* transaction will always end on a page boundary (like @e above)
	580	* except for the very last transaction in the group. We cannot
	581	* unlock the page at @o in the associated upl until both the
	582	* transaction ending at @e and the following transaction (that
	583	* starts at @e) has completed.
	584	*/
	585
	586	/*
	587	* We record whether or not the two UPLs are aligned as the mark
	588	* bit in the first page of @upl.
	589	*/
	590	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	591	bool is_unaligned = upl_page_get_mark(pl, 0);
	592
	593	if (is_unaligned) {
	594	upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
	595
	596	upl_offset_t upl_end = upl_offset + size;
	597	assert(upl_end >= PAGE_SIZE);
	598
	599	upl_size_t assoc_upl_size = upl_get_size(associated_upl);
	600
	601	/*
	602	* In the very first transaction in the group, upl_offset will
	603	* not be page aligned, but after that it will be and in that
	604	* case we want the preceding page in the associated UPL hence
	605	* the minus one.
	606	*/
	607	assert(upl_offset);
	608	if (upl_offset) {
	609	upl_offset = trunc_page_32(upl_offset - 1);
	610	}
	611
	612	lck_mtx_lock_spin(&iostate->io_mtxp);
	613
	614	// Look at the first page...
	615	if (upl_offset
	616	&& !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
	617	/*
	618	* The first page isn't marked so let another transaction
	619	* completion handle it.
	620	*/
	621	upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
	622	upl_offset += PAGE_SIZE;
	623	}
	624
	625	// And now the last page...
	626
	627	/*
	628	* This needs to be > rather than >= because if it's equal, it
	629	* means there's another transaction that is sharing the last
	630	* page.
	631	*/
	632	if (upl_end > assoc_upl_size) {
	633	upl_end = assoc_upl_size;
	634	} else {
	635	upl_end = trunc_page_32(upl_end);
	636	const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
	637
	638	if (!upl_page_get_mark(assoc_pl, last_pg)) {
	639	/*
	640	* The last page isn't marked so mark the page and let another
	641	* transaction completion handle it.
	642	*/
	643	upl_page_set_mark(assoc_pl, last_pg, true);
	644	upl_end -= PAGE_SIZE;
	645	}
	646	}
	647
	648	lck_mtx_unlock(&iostate->io_mtxp);
	649
	650	#if 0
	651	printf("2: %d %d\n", upl_offset, upl_end);
	652	#endif
	653
	654	if (upl_end <= upl_offset) {
	655	return;
	656	}
	657
	658	size = upl_end - upl_offset;
	659	} else {
	660	assert(!(upl_offset & PAGE_MASK));
	661	assert(!(size & PAGE_MASK));
	662	}
	663
	664	boolean_t empty;
	665
	666	/*
	667	* We can unlock these pages now and as this is for a
	668	* direct/uncached write, we want to dump the pages too.
	669	*/
	670	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
	671	UPL_ABORT_DUMP_PAGES, &empty);
	672
	673	assert(!kr);
	674
	675	if (!kr && empty) {
	676	upl_set_associated_upl(upl, NULL);
	677	upl_deallocate(associated_upl);
	678	}
	679	}
	680
	681	static int
	682	cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
	683	{
	684	int upl_abort_code = 0;
	685	int page_in = 0;
	686	int page_out = 0;
	687
	688	if ((io_flags & (B_PHYS \| B_CACHE)) == (B_PHYS \| B_CACHE)) {
	689	/*
	690	* direct write of any flavor, or a direct read that wasn't aligned
	691	*/
	692	ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	693	} else {
	694	if (io_flags & B_PAGEIO) {
	695	if (io_flags & B_READ) {
	696	page_in = 1;
	697	} else {
	698	page_out = 1;
	699	}
	700	}
	701	if (io_flags & B_CACHE) {
	702	/*
	703	* leave pages in the cache unchanged on error
	704	*/
	705	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	706	} else if (((io_flags & B_READ) == 0) && ((error != ENXIO) \|\| vnode_isswap(vp))) {
	707	/*
	708	* transient error on pageout/write path... leave pages unchanged
	709	*/
	710	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	711	} else if (page_in) {
	712	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR;
	713	} else {
	714	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	715	}
	716
	717	ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	718	}
	719	return upl_abort_code;
	720	}
	721
	722
	723	static int
	724	cluster_iodone(buf_t bp, void *callback_arg)
	725	{
	726	int b_flags;
	727	int error;
	728	int total_size;
	729	int total_resid;
	730	int upl_offset;
	731	int zero_offset;
	732	int pg_offset = 0;
	733	int commit_size = 0;
	734	int upl_flags = 0;
	735	int transaction_size = 0;
	736	upl_t upl;
	737	buf_t cbp;
	738	buf_t cbp_head;
	739	buf_t cbp_next;
	740	buf_t real_bp;
	741	vnode_t vp;
	742	struct clios *iostate;
	743	boolean_t transaction_complete = FALSE;
	744
	745	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
	746
	747	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_START,
	748	cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	749
	750	if (cbp_head->b_trans_next \|\| !(cbp_head->b_flags & B_EOT)) {
	751	lck_mtx_lock_spin(&cl_transaction_mtxp);
	752
	753	bp->b_flags \|= B_TDONE;
	754
	755	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	756	/*
	757	* all I/O requests that are part of this transaction
	758	* have to complete before we can process it
	759	*/
	760	if (!(cbp->b_flags & B_TDONE)) {
	761	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	762	cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
	763
	764	lck_mtx_unlock(&cl_transaction_mtxp);
	765
	766	return 0;
	767	}
	768
	769	if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
	770	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	771	cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
	772
	773	lck_mtx_unlock(&cl_transaction_mtxp);
	774	wakeup(cbp);
	775
	776	return 0;
	777	}
	778
	779	if (cbp->b_flags & B_EOT) {
	780	transaction_complete = TRUE;
	781	}
	782	}
	783	lck_mtx_unlock(&cl_transaction_mtxp);
	784
	785	if (transaction_complete == FALSE) {
	786	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	787	cbp_head, 0, 0, 0, 0);
	788	return 0;
	789	}
	790	}
	791	error = 0;
	792	total_size = 0;
	793	total_resid = 0;
	794
	795	cbp = cbp_head;
	796	vp = cbp->b_vp;
	797	upl_offset = cbp->b_uploffset;
	798	upl = cbp->b_upl;
	799	b_flags = cbp->b_flags;
	800	real_bp = cbp->b_real_bp;
	801	zero_offset = cbp->b_validend;
	802	iostate = (struct clios *)cbp->b_iostate;
	803
	804	if (real_bp) {
	805	real_bp->b_dev = cbp->b_dev;
	806	}
	807
	808	while (cbp) {
	809	if ((cbp->b_flags & B_ERROR) && error == 0) {
	810	error = cbp->b_error;
	811	}
	812
	813	total_resid += cbp->b_resid;
	814	total_size += cbp->b_bcount;
	815
	816	cbp_next = cbp->b_trans_next;
	817
	818	if (cbp_next == NULL) {
	819	/*
	820	* compute the overall size of the transaction
	821	* in case we created one that has 'holes' in it
	822	* 'total_size' represents the amount of I/O we
	823	* did, not the span of the transaction w/r to the UPL
	824	*/
	825	transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
	826	}
	827
	828	if (cbp != cbp_head) {
	829	free_io_buf(cbp);
	830	}
	831
	832	cbp = cbp_next;
	833	}
	834
	835	if (ISSET(b_flags, B_COMMIT_UPL)) {
	836	cluster_handle_associated_upl(iostate,
	837	cbp_head->b_upl,
	838	upl_offset,
	839	transaction_size);
	840	}
	841
	842	if (error == 0 && total_resid) {
	843	error = EIO;
	844	}
	845
	846	if (error == 0) {
	847	int (cliodone_func)(buf_t, void ) = (int ()(buf_t, void ))(cbp_head->b_cliodone);
	848
	849	if (cliodone_func != NULL) {
	850	cbp_head->b_bcount = transaction_size;
	851
	852	error = (*cliodone_func)(cbp_head, callback_arg);
	853	}
	854	}
	855	if (zero_offset) {
	856	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	857	}
	858
	859	free_io_buf(cbp_head);
	860
	861	if (iostate) {
	862	int need_wakeup = 0;
	863
	864	/*
	865	* someone has issued multiple I/Os asynchrounsly
	866	* and is waiting for them to complete (streaming)
	867	*/
	868	lck_mtx_lock_spin(&iostate->io_mtxp);
	869
	870	if (error && iostate->io_error == 0) {
	871	iostate->io_error = error;
	872	}
	873
	874	iostate->io_completed += total_size;
	875
	876	if (iostate->io_wanted) {
	877	/*
	878	* someone is waiting for the state of
	879	* this io stream to change
	880	*/
	881	iostate->io_wanted = 0;
	882	need_wakeup = 1;
	883	}
	884	lck_mtx_unlock(&iostate->io_mtxp);
	885
	886	if (need_wakeup) {
	887	wakeup((caddr_t)&iostate->io_wanted);
	888	}
	889	}
	890
	891	if (b_flags & B_COMMIT_UPL) {
	892	pg_offset = upl_offset & PAGE_MASK;
	893	commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	894
	895	if (error) {
	896	upl_set_iodone_error(upl, error);
	897
	898	upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
	899	} else {
	900	upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
	901
	902	if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
	903	upl_flags \|= UPL_COMMIT_SET_DIRTY;
	904	}
	905
	906	if (b_flags & B_AGE) {
	907	upl_flags \|= UPL_COMMIT_INACTIVATE;
	908	}
	909
	910	ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
	911	}
	912	}
	913	if (real_bp) {
	914	if (error) {
	915	real_bp->b_flags \|= B_ERROR;
	916	real_bp->b_error = error;
	917	}
	918	real_bp->b_resid = total_resid;
	919
	920	buf_biodone(real_bp);
	921	}
	922	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	923	upl, upl_offset - pg_offset, commit_size, (error << 24) \| upl_flags, 0);
	924
	925	return error;
	926	}
	927
	928
	929	uint32_t
	930	cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
	931	{
	932	if (cluster_is_throttled(vp)) {
	933	*limit = THROTTLE_MAX_IOSIZE;
	934	return 1;
	935	}
	936	return 0;
	937	}
	938
	939
	940	void
	941	cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
	942	{
	943	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_START,
	944	upl_offset, size, bp, 0, 0);
	945
	946	if (bp == NULL \|\| bp->b_datap == 0) {
	947	upl_page_info_t *pl;
	948	addr64_t zero_addr;
	949
	950	pl = ubc_upl_pageinfo(upl);
	951
	952	if (upl_device_page(pl) == TRUE) {
	953	zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
	954
	955	bzero_phys_nc(zero_addr, size);
	956	} else {
	957	while (size) {
	958	int page_offset;
	959	int page_index;
	960	int zero_cnt;
	961
	962	page_index = upl_offset / PAGE_SIZE;
	963	page_offset = upl_offset & PAGE_MASK;
	964
	965	zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
	966	zero_cnt = min(PAGE_SIZE - page_offset, size);
	967
	968	bzero_phys(zero_addr, zero_cnt);
	969
	970	size -= zero_cnt;
	971	upl_offset += zero_cnt;
	972	}
	973	}
	974	} else {
	975	bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
	976	}
	977
	978	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_END,
	979	upl_offset, size, 0, 0, 0);
	980	}
	981
	982
	983	static void
	984	cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
	985	{
	986	cbp_head->b_validend = zero_offset;
	987	cbp_tail->b_flags \|= B_EOT;
	988	}
	989
	990	static void
	991	cluster_wait_IO(buf_t cbp_head, int async)
	992	{
	993	buf_t cbp;
	994
	995	if (async) {
	996	/*
	997	* Async callback completion will not normally generate a
	998	* wakeup upon I/O completion. To get woken up, we set
	999	* b_trans_next (which is safe for us to modify) on the last
	1000	* buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
	1001	* to wake us up when all buffers as part of this transaction
	1002	* are completed. This is done under the umbrella of
	1003	* cl_transaction_mtxp which is also taken in cluster_iodone.
	1004	*/
	1005	bool done = true;
	1006	buf_t last = NULL;
	1007
	1008	lck_mtx_lock_spin(&cl_transaction_mtxp);
	1009
	1010	for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
	1011	if (!ISSET(cbp->b_flags, B_TDONE)) {
	1012	done = false;
	1013	}
	1014	}
	1015
	1016	if (!done) {
	1017	last->b_trans_next = CLUSTER_IO_WAITING;
	1018
	1019	DTRACE_IO1(wait__start, buf_t, last);
	1020	do {
	1021	msleep(last, &cl_transaction_mtxp, PSPIN \| (PRIBIO + 1), "cluster_wait_IO", NULL);
	1022
	1023	/*
	1024	* We should only have been woken up if all the
	1025	* buffers are completed, but just in case...
	1026	*/
	1027	done = true;
	1028	for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
	1029	if (!ISSET(cbp->b_flags, B_TDONE)) {
	1030	done = false;
	1031	break;
	1032	}
	1033	}
	1034	} while (!done);
	1035	DTRACE_IO1(wait__done, buf_t, last);
	1036
	1037	last->b_trans_next = NULL;
	1038	}
	1039
	1040	lck_mtx_unlock(&cl_transaction_mtxp);
	1041	} else { // !async
	1042	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	1043	buf_biowait(cbp);
	1044	}
	1045	}
	1046	}
	1047
	1048	static void
	1049	cluster_complete_transaction(buf_t cbp_head, void callback_arg, int *retval, int flags, int needwait)
	1050	{
	1051	buf_t cbp;
	1052	int error;
	1053	boolean_t isswapout = FALSE;
	1054
	1055	/*
	1056	* cluster_complete_transaction will
	1057	* only be called if we've issued a complete chain in synchronous mode
	1058	* or, we've already done a cluster_wait_IO on an incomplete chain
	1059	*/
	1060	if (needwait) {
	1061	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
	1062	buf_biowait(cbp);
	1063	}
	1064	}
	1065	/*
	1066	* we've already waited on all of the I/Os in this transaction,
	1067	* so mark all of the buf_t's in this transaction as B_TDONE
	1068	* so that cluster_iodone sees the transaction as completed
	1069	*/
	1070	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
	1071	cbp->b_flags \|= B_TDONE;
	1072	}
	1073	cbp = *cbp_head;
	1074
	1075	if ((flags & (CL_ASYNC \| CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
	1076	isswapout = TRUE;
	1077	}
	1078
	1079	error = cluster_iodone(cbp, callback_arg);
	1080
	1081	if (!(flags & CL_ASYNC) && error && *retval == 0) {
	1082	if (((flags & (CL_PAGEOUT \| CL_KEEPCACHED)) != CL_PAGEOUT) \|\| (error != ENXIO)) {
	1083	*retval = error;
	1084	} else if (isswapout == TRUE) {
	1085	*retval = error;
	1086	}
	1087	}
	1088	*cbp_head = (buf_t)NULL;
	1089	}
	1090
	1091
	1092	static int
	1093	cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	1094	int flags, buf_t real_bp, struct clios iostate, int (callback)(buf_t, void ), void callback_arg)
	1095	{
	1096	buf_t cbp;
	1097	u_int size;
	1098	u_int io_size;
	1099	int io_flags;
	1100	int bmap_flags;
	1101	int error = 0;
	1102	int retval = 0;
	1103	buf_t cbp_head = NULL;
	1104	buf_t cbp_tail = NULL;
	1105	int trans_count = 0;
	1106	int max_trans_count;
	1107	u_int pg_count;
	1108	int pg_offset;
	1109	u_int max_iosize;
	1110	u_int max_vectors;
	1111	int priv;
	1112	int zero_offset = 0;
	1113	int async_throttle = 0;
	1114	mount_t mp;
	1115	vm_offset_t upl_end_offset;
	1116	boolean_t need_EOT = FALSE;
	1117
	1118	/*
	1119	* we currently don't support buffers larger than a page
	1120	*/
	1121	if (real_bp && non_rounded_size > PAGE_SIZE) {
	1122	panic("%s(): Called with real buffer of size %d bytes which "
	1123	"is greater than the maximum allowed size of "
	1124	"%d bytes (the system PAGE_SIZE).\n",
	1125	__FUNCTION__, non_rounded_size, PAGE_SIZE);
	1126	}
	1127
	1128	mp = vp->v_mount;
	1129
	1130	/*
	1131	* we don't want to do any funny rounding of the size for IO requests
	1132	* coming through the DIRECT or CONTIGUOUS paths... those pages don't
	1133	* belong to us... we can't extend (nor do we need to) the I/O to fill
	1134	* out a page
	1135	*/
	1136	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY \| CL_DIRECT_IO))) {
	1137	/*
	1138	* round the requested size up so that this I/O ends on a
	1139	* page boundary in case this is a 'write'... if the filesystem
	1140	* has blocks allocated to back the page beyond the EOF, we want to
	1141	* make sure to write out the zero's that are sitting beyond the EOF
	1142	* so that in case the filesystem doesn't explicitly zero this area
	1143	* if a hole is created via a lseek/write beyond the current EOF,
	1144	* it will return zeros when it's read back from the disk. If the
	1145	* physical allocation doesn't extend for the whole page, we'll
	1146	* only write/read from the disk up to the end of this allocation
	1147	* via the extent info returned from the VNOP_BLOCKMAP call.
	1148	*/
	1149	pg_offset = upl_offset & PAGE_MASK;
	1150
	1151	size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	1152	} else {
	1153	/*
	1154	* anyone advertising a blocksize of 1 byte probably
	1155	* can't deal with us rounding up the request size
	1156	* AFP is one such filesystem/device
	1157	*/
	1158	size = non_rounded_size;
	1159	}
	1160	upl_end_offset = upl_offset + size;
	1161
	1162	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
	1163
	1164	/*
	1165	* Set the maximum transaction size to the maximum desired number of
	1166	* buffers.
	1167	*/
	1168	max_trans_count = 8;
	1169	if (flags & CL_DEV_MEMORY) {
	1170	max_trans_count = 16;
	1171	}
	1172
	1173	if (flags & CL_READ) {
	1174	io_flags = B_READ;
	1175	bmap_flags = VNODE_READ;
	1176
	1177	max_iosize = mp->mnt_maxreadcnt;
	1178	max_vectors = mp->mnt_segreadcnt;
	1179	} else {
	1180	io_flags = B_WRITE;
	1181	bmap_flags = VNODE_WRITE;
	1182
	1183	max_iosize = mp->mnt_maxwritecnt;
	1184	max_vectors = mp->mnt_segwritecnt;
	1185	}
	1186	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
	1187
	1188	/*
	1189	* make sure the maximum iosize is a
	1190	* multiple of the page size
	1191	*/
	1192	max_iosize &= ~PAGE_MASK;
	1193
	1194	/*
	1195	* Ensure the maximum iosize is sensible.
	1196	*/
	1197	if (!max_iosize) {
	1198	max_iosize = PAGE_SIZE;
	1199	}
	1200
	1201	if (flags & CL_THROTTLE) {
	1202	if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
	1203	if (max_iosize > THROTTLE_MAX_IOSIZE) {
	1204	max_iosize = THROTTLE_MAX_IOSIZE;
	1205	}
	1206	async_throttle = THROTTLE_MAXCNT;
	1207	} else {
	1208	if ((flags & CL_DEV_MEMORY)) {
	1209	async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
	1210	} else {
	1211	u_int max_cluster;
	1212	u_int max_cluster_size;
	1213	u_int scale;
	1214
	1215	if (vp->v_mount->mnt_minsaturationbytecount) {
	1216	max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
	1217
	1218	scale = 1;
	1219	} else {
	1220	max_cluster_size = MAX_CLUSTER_SIZE(vp);
	1221
	1222	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
	1223	scale = WRITE_THROTTLE_SSD;
	1224	} else {
	1225	scale = WRITE_THROTTLE;
	1226	}
	1227	}
	1228	if (max_iosize > max_cluster_size) {
	1229	max_cluster = max_cluster_size;
	1230	} else {
	1231	max_cluster = max_iosize;
	1232	}
	1233
	1234	if (size < max_cluster) {
	1235	max_cluster = size;
	1236	}
	1237
	1238	if (flags & CL_CLOSE) {
	1239	scale += MAX_CLUSTERS;
	1240	}
	1241
	1242	async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
	1243	}
	1244	}
	1245	}
	1246	if (flags & CL_AGE) {
	1247	io_flags \|= B_AGE;
	1248	}
	1249	if (flags & (CL_PAGEIN \| CL_PAGEOUT)) {
	1250	io_flags \|= B_PAGEIO;
	1251	}
	1252	if (flags & (CL_IOSTREAMING)) {
	1253	io_flags \|= B_IOSTREAMING;
	1254	}
	1255	if (flags & CL_COMMIT) {
	1256	io_flags \|= B_COMMIT_UPL;
	1257	}
	1258	if (flags & CL_DIRECT_IO) {
	1259	io_flags \|= B_PHYS;
	1260	}
	1261	if (flags & (CL_PRESERVE \| CL_KEEPCACHED)) {
	1262	io_flags \|= B_CACHE;
	1263	}
	1264	if (flags & CL_PASSIVE) {
	1265	io_flags \|= B_PASSIVE;
	1266	}
	1267	if (flags & CL_ENCRYPTED) {
	1268	io_flags \|= B_ENCRYPTED_IO;
	1269	}
	1270
	1271	if (vp->v_flag & VSYSTEM) {
	1272	io_flags \|= B_META;
	1273	}
	1274
	1275	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
	1276	/*
	1277	* then we are going to end up
	1278	* with a page that we can't complete (the file size wasn't a multiple
	1279	* of PAGE_SIZE and we're trying to read to the end of the file
	1280	* so we'll go ahead and zero out the portion of the page we can't
	1281	* read in from the file
	1282	*/
	1283	zero_offset = (int)(upl_offset + non_rounded_size);
	1284	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
	1285	assert(ISSET(flags, CL_COMMIT));
	1286
	1287	// For a direct/uncached write, we need to lock pages...
	1288
	1289	upl_t cached_upl;
	1290
	1291	/*
	1292	* Create a UPL to lock the pages in the cache whilst the
	1293	* write is in progress.
	1294	*/
	1295	ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
	1296	NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
	1297
	1298	/*
	1299	* Attach this UPL to the other UPL so that we can find it
	1300	* later.
	1301	*/
	1302	upl_set_associated_upl(upl, cached_upl);
	1303
	1304	if (upl_offset & PAGE_MASK) {
	1305	/*
	1306	* The two UPLs are not aligned, so mark the first page in
	1307	* @upl so that cluster_handle_associated_upl can handle
	1308	* it accordingly.
	1309	*/
	1310	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	1311	upl_page_set_mark(pl, 0, true);
	1312	}
	1313	}
	1314
	1315	while (size) {
	1316	daddr64_t blkno;
	1317	daddr64_t lblkno;
	1318	u_int io_size_wanted;
	1319	size_t io_size_tmp;
	1320
	1321	if (size > max_iosize) {
	1322	io_size = max_iosize;
	1323	} else {
	1324	io_size = size;
	1325	}
	1326
	1327	io_size_wanted = io_size;
	1328	io_size_tmp = (size_t)io_size;
	1329
	1330	if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
	1331	break;
	1332	}
	1333
	1334	if (io_size_tmp > io_size_wanted) {
	1335	io_size = io_size_wanted;
	1336	} else {
	1337	io_size = (u_int)io_size_tmp;
	1338	}
	1339
	1340	if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
	1341	real_bp->b_blkno = blkno;
	1342	}
	1343
	1344	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) \| DBG_FUNC_NONE,
	1345	(int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);
	1346
	1347	if (io_size == 0) {
	1348	/*
	1349	* vnop_blockmap didn't return an error... however, it did
	1350	* return an extent size of 0 which means we can't
	1351	* make forward progress on this I/O... a hole in the
	1352	* file would be returned as a blkno of -1 with a non-zero io_size
	1353	* a real extent is returned with a blkno != -1 and a non-zero io_size
	1354	*/
	1355	error = EINVAL;
	1356	break;
	1357	}
	1358	if (!(flags & CL_READ) && blkno == -1) {
	1359	off_t e_offset;
	1360	int pageout_flags;
	1361
	1362	if (upl_get_internal_vectorupl(upl)) {
	1363	panic("Vector UPLs should not take this code-path\n");
	1364	}
	1365	/*
	1366	* we're writing into a 'hole'
	1367	*/
	1368	if (flags & CL_PAGEOUT) {
	1369	/*
	1370	* if we got here via cluster_pageout
	1371	* then just error the request and return
	1372	* the 'hole' should already have been covered
	1373	*/
	1374	error = EINVAL;
	1375	break;
	1376	}
	1377	/*
	1378	* we can get here if the cluster code happens to
	1379	* pick up a page that was dirtied via mmap vs
	1380	* a 'write' and the page targets a 'hole'...
	1381	* i.e. the writes to the cluster were sparse
	1382	* and the file was being written for the first time
	1383	*
	1384	* we can also get here if the filesystem supports
	1385	* 'holes' that are less than PAGE_SIZE.... because
	1386	* we can't know if the range in the page that covers
	1387	* the 'hole' has been dirtied via an mmap or not,
	1388	* we have to assume the worst and try to push the
	1389	* entire page to storage.
	1390	*
	1391	* Try paging out the page individually before
	1392	* giving up entirely and dumping it (the pageout
	1393	* path will insure that the zero extent accounting
	1394	* has been taken care of before we get back into cluster_io)
	1395	*
	1396	* go direct to vnode_pageout so that we don't have to
	1397	* unbusy the page from the UPL... we used to do this
	1398	* so that we could call ubc_msync, but that results
	1399	* in a potential deadlock if someone else races us to acquire
	1400	* that page and wins and in addition needs one of the pages
	1401	* we're continuing to hold in the UPL
	1402	*/
	1403	pageout_flags = UPL_MSYNC \| UPL_VNODE_PAGER \| UPL_NESTED_PAGEOUT;
	1404
	1405	if (!(flags & CL_ASYNC)) {
	1406	pageout_flags \|= UPL_IOSYNC;
	1407	}
	1408	if (!(flags & CL_COMMIT)) {
	1409	pageout_flags \|= UPL_NOCOMMIT;
	1410	}
	1411
	1412	if (cbp_head) {
	1413	buf_t prev_cbp;
	1414	uint32_t bytes_in_last_page;
	1415
	1416	/*
	1417	* first we have to wait for the the current outstanding I/Os
	1418	* to complete... EOT hasn't been set yet on this transaction
	1419	* so the pages won't be released
	1420	*/
	1421	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1422
	1423	bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
	1424	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	1425	bytes_in_last_page += cbp->b_bcount;
	1426	}
	1427	bytes_in_last_page &= PAGE_MASK;
	1428
	1429	while (bytes_in_last_page) {
	1430	/*
	1431	* we've got a transcation that
	1432	* includes the page we're about to push out through vnode_pageout...
	1433	* find the bp's in the list which intersect this page and either
	1434	* remove them entirely from the transaction (there could be multiple bp's), or
	1435	* round it's iosize down to the page boundary (there can only be one)...
	1436	*
	1437	* find the last bp in the list and act on it
	1438	*/
	1439	for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
	1440	prev_cbp = cbp;
	1441	}
	1442
	1443	if (bytes_in_last_page >= cbp->b_bcount) {
	1444	/*
	1445	* this buf no longer has any I/O associated with it
	1446	*/
	1447	bytes_in_last_page -= cbp->b_bcount;
	1448	cbp->b_bcount = 0;
	1449
	1450	free_io_buf(cbp);
	1451
	1452	if (cbp == cbp_head) {
	1453	assert(bytes_in_last_page == 0);
	1454	/*
	1455	* the buf we just freed was the only buf in
	1456	* this transaction... so there's no I/O to do
	1457	*/
	1458	cbp_head = NULL;
	1459	cbp_tail = NULL;
	1460	} else {
	1461	/*
	1462	* remove the buf we just freed from
	1463	* the transaction list
	1464	*/
	1465	prev_cbp->b_trans_next = NULL;
	1466	cbp_tail = prev_cbp;
	1467	}
	1468	} else {
	1469	/*
	1470	* this is the last bp that has I/O
	1471	* intersecting the page of interest
	1472	* only some of the I/O is in the intersection
	1473	* so clip the size but keep it in the transaction list
	1474	*/
	1475	cbp->b_bcount -= bytes_in_last_page;
	1476	cbp_tail = cbp;
	1477	bytes_in_last_page = 0;
	1478	}
	1479	}
	1480	if (cbp_head) {
	1481	/*
	1482	* there was more to the current transaction
	1483	* than just the page we are pushing out via vnode_pageout...
	1484	* mark it as finished and complete it... we've already
	1485	* waited for the I/Os to complete above in the call to cluster_wait_IO
	1486	*/
	1487	cluster_EOT(cbp_head, cbp_tail, 0);
	1488
	1489	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
	1490
	1491	trans_count = 0;
	1492	}
	1493	}
	1494	if (vnode_pageout(vp, upl, (upl_offset_t)trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
	1495	error = EINVAL;
	1496	}
	1497	e_offset = round_page_64(f_offset + 1);
	1498	io_size = (u_int)(e_offset - f_offset);
	1499
	1500	f_offset += io_size;
	1501	upl_offset += io_size;
	1502
	1503	if (size >= io_size) {
	1504	size -= io_size;
	1505	} else {
	1506	size = 0;
	1507	}
	1508	/*
	1509	* keep track of how much of the original request
	1510	* that we've actually completed... non_rounded_size
	1511	* may go negative due to us rounding the request
	1512	* to a page size multiple (i.e. size > non_rounded_size)
	1513	*/
	1514	non_rounded_size -= io_size;
	1515
	1516	if (non_rounded_size <= 0) {
	1517	/*
	1518	* we've transferred all of the data in the original
	1519	* request, but we were unable to complete the tail
	1520	* of the last page because the file didn't have
	1521	* an allocation to back that portion... this is ok.
	1522	*/
	1523	size = 0;
	1524	}
	1525	if (error) {
	1526	if (size == 0) {
	1527	flags &= ~CL_COMMIT;
	1528	}
	1529	break;
	1530	}
	1531	continue;
	1532	}
	1533	lblkno = (daddr64_t)(f_offset / 0x1000);
	1534	/*
	1535	* we have now figured out how much I/O we can do - this is in 'io_size'
	1536	* pg_offset is the starting point in the first page for the I/O
	1537	* pg_count is the number of full and partial pages that 'io_size' encompasses
	1538	*/
	1539	pg_offset = upl_offset & PAGE_MASK;
	1540
	1541	if (flags & CL_DEV_MEMORY) {
	1542	/*
	1543	* treat physical requests as one 'giant' page
	1544	*/
	1545	pg_count = 1;
	1546	} else {
	1547	pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1548	}
	1549
	1550	if ((flags & CL_READ) && blkno == -1) {
	1551	vm_offset_t commit_offset;
	1552	int bytes_to_zero;
	1553	int complete_transaction_now = 0;
	1554
	1555	/*
	1556	* if we're reading and blkno == -1, then we've got a
	1557	* 'hole' in the file that we need to deal with by zeroing
	1558	* out the affected area in the upl
	1559	*/
	1560	if (io_size >= (u_int)non_rounded_size) {
	1561	/*
	1562	* if this upl contains the EOF and it is not a multiple of PAGE_SIZE
	1563	* than 'zero_offset' will be non-zero
	1564	* if the 'hole' returned by vnop_blockmap extends all the way to the eof
	1565	* (indicated by the io_size finishing off the I/O request for this UPL)
	1566	* than we're not going to issue an I/O for the
	1567	* last page in this upl... we need to zero both the hole and the tail
	1568	* of the page beyond the EOF, since the delayed zero-fill won't kick in
	1569	*/
	1570	bytes_to_zero = non_rounded_size;
	1571	if (!(flags & CL_NOZERO)) {
	1572	bytes_to_zero = (int)((((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset);
	1573	}
	1574
	1575	zero_offset = 0;
	1576	} else {
	1577	bytes_to_zero = io_size;
	1578	}
	1579
	1580	pg_count = 0;
	1581
	1582	cluster_zero(upl, (upl_offset_t)upl_offset, bytes_to_zero, real_bp);
	1583
	1584	if (cbp_head) {
	1585	int pg_resid;
	1586
	1587	/*
	1588	* if there is a current I/O chain pending
	1589	* then the first page of the group we just zero'd
	1590	* will be handled by the I/O completion if the zero
	1591	* fill started in the middle of the page
	1592	*/
	1593	commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1594
	1595	pg_resid = (int)(commit_offset - upl_offset);
	1596
	1597	if (bytes_to_zero >= pg_resid) {
	1598	/*
	1599	* the last page of the current I/O
	1600	* has been completed...
	1601	* compute the number of fully zero'd
	1602	* pages that are beyond it
	1603	* plus the last page if its partial
	1604	* and we have no more I/O to issue...
	1605	* otherwise a partial page is left
	1606	* to begin the next I/O
	1607	*/
	1608	if ((int)io_size >= non_rounded_size) {
	1609	pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1610	} else {
	1611	pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
	1612	}
	1613
	1614	complete_transaction_now = 1;
	1615	}
	1616	} else {
	1617	/*
	1618	* no pending I/O to deal with
	1619	* so, commit all of the fully zero'd pages
	1620	* plus the last page if its partial
	1621	* and we have no more I/O to issue...
	1622	* otherwise a partial page is left
	1623	* to begin the next I/O
	1624	*/
	1625	if ((int)io_size >= non_rounded_size) {
	1626	pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1627	} else {
	1628	pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
	1629	}
	1630
	1631	commit_offset = upl_offset & ~PAGE_MASK;
	1632	}
	1633
	1634	// Associated UPL is currently only used in the direct write path
	1635	assert(!upl_associated_upl(upl));
	1636
	1637	if ((flags & CL_COMMIT) && pg_count) {
	1638	ubc_upl_commit_range(upl, (upl_offset_t)commit_offset,
	1639	pg_count * PAGE_SIZE,
	1640	UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY);
	1641	}
	1642	upl_offset += io_size;
	1643	f_offset += io_size;
	1644	size -= io_size;
	1645
	1646	/*
	1647	* keep track of how much of the original request
	1648	* that we've actually completed... non_rounded_size
	1649	* may go negative due to us rounding the request
	1650	* to a page size multiple (i.e. size > non_rounded_size)
	1651	*/
	1652	non_rounded_size -= io_size;
	1653
	1654	if (non_rounded_size <= 0) {
	1655	/*
	1656	* we've transferred all of the data in the original
	1657	* request, but we were unable to complete the tail
	1658	* of the last page because the file didn't have
	1659	* an allocation to back that portion... this is ok.
	1660	*/
	1661	size = 0;
	1662	}
	1663	if (cbp_head && (complete_transaction_now \|\| size == 0)) {
	1664	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1665
	1666	cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
	1667
	1668	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
	1669
	1670	trans_count = 0;
	1671	}
	1672	continue;
	1673	}
	1674	if (pg_count > max_vectors) {
	1675	if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
	1676	io_size = PAGE_SIZE - pg_offset;
	1677	pg_count = 1;
	1678	} else {
	1679	io_size -= (pg_count - max_vectors) * PAGE_SIZE;
	1680	pg_count = max_vectors;
	1681	}
	1682	}
	1683	/*
	1684	* If the transaction is going to reach the maximum number of
	1685	* desired elements, truncate the i/o to the nearest page so
	1686	* that the actual i/o is initiated after this buffer is
	1687	* created and added to the i/o chain.
	1688	*
	1689	* I/O directed to physically contiguous memory
	1690	* doesn't have a requirement to make sure we 'fill' a page
	1691	*/
	1692	if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
	1693	((upl_offset + io_size) & PAGE_MASK)) {
	1694	vm_offset_t aligned_ofs;
	1695
	1696	aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
	1697	/*
	1698	* If the io_size does not actually finish off even a
	1699	* single page we have to keep adding buffers to the
	1700	* transaction despite having reached the desired limit.
	1701	*
	1702	* Eventually we get here with the page being finished
	1703	* off (and exceeded) and then we truncate the size of
	1704	* this i/o request so that it is page aligned so that
	1705	* we can finally issue the i/o on the transaction.
	1706	*/
	1707	if (aligned_ofs > upl_offset) {
	1708	io_size = (u_int)(aligned_ofs - upl_offset);
	1709	pg_count--;
	1710	}
	1711	}
	1712
	1713	if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
	1714	/*
	1715	* if we're not targeting a virtual device i.e. a disk image
	1716	* it's safe to dip into the reserve pool since real devices
	1717	* can complete this I/O request without requiring additional
	1718	* bufs from the alloc_io_buf pool
	1719	*/
	1720	priv = 1;
	1721	} else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT) && !cbp_head) {
	1722	/*
	1723	* Throttle the speculative IO
	1724	*
	1725	* We can only throttle this if it is the first iobuf
	1726	* for the transaction. alloc_io_buf implements
	1727	* additional restrictions for diskimages anyway.
	1728	*/
	1729	priv = 0;
	1730	} else {
	1731	priv = 1;
	1732	}
	1733
	1734	cbp = alloc_io_buf(vp, priv);
	1735
	1736	if (flags & CL_PAGEOUT) {
	1737	u_int i;
	1738
	1739	/*
	1740	* since blocks are in offsets of 0x1000, scale
	1741	* iteration to (PAGE_SIZE * pg_count) of blks.
	1742	*/
	1743	for (i = 0; i < (PAGE_SIZE * pg_count) / 0x1000; i++) {
	1744	if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
	1745	panic("BUSY bp found in cluster_io");
	1746	}
	1747	}
	1748	}
	1749	if (flags & CL_ASYNC) {
	1750	if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
	1751	panic("buf_setcallback failed\n");
	1752	}
	1753	}
	1754	cbp->b_cliodone = (void *)callback;
	1755	cbp->b_flags \|= io_flags;
	1756	if (flags & CL_NOCACHE) {
	1757	cbp->b_attr.ba_flags \|= BA_NOCACHE;
	1758	}
	1759
	1760	cbp->b_lblkno = lblkno;
	1761	cbp->b_blkno = blkno;
	1762	cbp->b_bcount = io_size;
	1763
	1764	if (buf_setupl(cbp, upl, (uint32_t)upl_offset)) {
	1765	panic("buf_setupl failed\n");
	1766	}
	1767	#if CONFIG_IOSCHED
	1768	upl_set_blkno(upl, upl_offset, io_size, blkno);
	1769	#endif
	1770	cbp->b_trans_next = (buf_t)NULL;
	1771
	1772	if ((cbp->b_iostate = (void *)iostate)) {
	1773	/*
	1774	* caller wants to track the state of this
	1775	* io... bump the amount issued against this stream
	1776	*/
	1777	iostate->io_issued += io_size;
	1778	}
	1779
	1780	if (flags & CL_READ) {
	1781	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) \| DBG_FUNC_NONE,
	1782	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	1783	} else {
	1784	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) \| DBG_FUNC_NONE,
	1785	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	1786	}
	1787
	1788	if (cbp_head) {
	1789	cbp_tail->b_trans_next = cbp;
	1790	cbp_tail = cbp;
	1791	} else {
	1792	cbp_head = cbp;
	1793	cbp_tail = cbp;
	1794
	1795	if ((cbp_head->b_real_bp = real_bp)) {
	1796	real_bp = (buf_t)NULL;
	1797	}
	1798	}
	1799	(buf_t )(&cbp->b_trans_head) = cbp_head;
	1800
	1801	trans_count++;
	1802
	1803	upl_offset += io_size;
	1804	f_offset += io_size;
	1805	size -= io_size;
	1806	/*
	1807	* keep track of how much of the original request
	1808	* that we've actually completed... non_rounded_size
	1809	* may go negative due to us rounding the request
	1810	* to a page size multiple (i.e. size > non_rounded_size)
	1811	*/
	1812	non_rounded_size -= io_size;
	1813
	1814	if (non_rounded_size <= 0) {
	1815	/*
	1816	* we've transferred all of the data in the original
	1817	* request, but we were unable to complete the tail
	1818	* of the last page because the file didn't have
	1819	* an allocation to back that portion... this is ok.
	1820	*/
	1821	size = 0;
	1822	}
	1823	if (size == 0) {
	1824	/*
	1825	* we have no more I/O to issue, so go
	1826	* finish the final transaction
	1827	*/
	1828	need_EOT = TRUE;
	1829	} else if (((flags & CL_DEV_MEMORY) \|\| (upl_offset & PAGE_MASK) == 0) &&
	1830	((flags & CL_ASYNC) \|\| trans_count > max_trans_count)) {
	1831	/*
	1832	* I/O directed to physically contiguous memory...
	1833	* which doesn't have a requirement to make sure we 'fill' a page
	1834	* or...
	1835	* the current I/O we've prepared fully
	1836	* completes the last page in this request
	1837	* and ...
	1838	* it's either an ASYNC request or
	1839	* we've already accumulated more than 8 I/O's into
	1840	* this transaction so mark it as complete so that
	1841	* it can finish asynchronously or via the cluster_complete_transaction
	1842	* below if the request is synchronous
	1843	*/
	1844	need_EOT = TRUE;
	1845	}
	1846	if (need_EOT == TRUE) {
	1847	cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
	1848	}
	1849
	1850	if (flags & CL_THROTTLE) {
	1851	(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
	1852	}
	1853
	1854	if (!(io_flags & B_READ)) {
	1855	vnode_startwrite(vp);
	1856	}
	1857
	1858	if (flags & CL_RAW_ENCRYPTED) {
	1859	/*
	1860	* User requested raw encrypted bytes.
	1861	* Twiddle the bit in the ba_flags for the buffer
	1862	*/
	1863	cbp->b_attr.ba_flags \|= BA_RAW_ENCRYPTED_IO;
	1864	}
	1865
	1866	(void) VNOP_STRATEGY(cbp);
	1867
	1868	if (need_EOT == TRUE) {
	1869	if (!(flags & CL_ASYNC)) {
	1870	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
	1871	}
	1872
	1873	need_EOT = FALSE;
	1874	trans_count = 0;
	1875	cbp_head = NULL;
	1876	}
	1877	}
	1878	if (error) {
	1879	int abort_size;
	1880
	1881	io_size = 0;
	1882
	1883	if (cbp_head) {
	1884	/*
	1885	* Wait until all of the outstanding I/O
	1886	* for this partial transaction has completed
	1887	*/
	1888	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1889
	1890	/*
	1891	* Rewind the upl offset to the beginning of the
	1892	* transaction.
	1893	*/
	1894	upl_offset = cbp_head->b_uploffset;
	1895	}
	1896
	1897	if (ISSET(flags, CL_COMMIT)) {
	1898	cluster_handle_associated_upl(iostate, upl,
	1899	(upl_offset_t)upl_offset,
	1900	(upl_size_t)(upl_end_offset - upl_offset));
	1901	}
	1902
	1903	// Free all the IO buffers in this transaction
	1904	for (cbp = cbp_head; cbp;) {
	1905	buf_t cbp_next;
	1906
	1907	size += cbp->b_bcount;
	1908	io_size += cbp->b_bcount;
	1909
	1910	cbp_next = cbp->b_trans_next;
	1911	free_io_buf(cbp);
	1912	cbp = cbp_next;
	1913	}
	1914
	1915	if (iostate) {
	1916	int need_wakeup = 0;
	1917
	1918	/*
	1919	* update the error condition for this stream
	1920	* since we never really issued the io
	1921	* just go ahead and adjust it back
	1922	*/
	1923	lck_mtx_lock_spin(&iostate->io_mtxp);
	1924
	1925	if (iostate->io_error == 0) {
	1926	iostate->io_error = error;
	1927	}
	1928	iostate->io_issued -= io_size;
	1929
	1930	if (iostate->io_wanted) {
	1931	/*
	1932	* someone is waiting for the state of
	1933	* this io stream to change
	1934	*/
	1935	iostate->io_wanted = 0;
	1936	need_wakeup = 1;
	1937	}
	1938	lck_mtx_unlock(&iostate->io_mtxp);
	1939
	1940	if (need_wakeup) {
	1941	wakeup((caddr_t)&iostate->io_wanted);
	1942	}
	1943	}
	1944
	1945	if (flags & CL_COMMIT) {
	1946	int upl_flags;
	1947
	1948	pg_offset = upl_offset & PAGE_MASK;
	1949	abort_size = (int)((upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK);
	1950
	1951	upl_flags = cluster_ioerror(upl, (int)(upl_offset - pg_offset),
	1952	abort_size, error, io_flags, vp);
	1953
	1954	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) \| DBG_FUNC_NONE,
	1955	upl, upl_offset - pg_offset, abort_size, (error << 24) \| upl_flags, 0);
	1956	}
	1957	if (retval == 0) {
	1958	retval = error;
	1959	}
	1960	} else if (cbp_head) {
	1961	panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
	1962	}
	1963
	1964	if (real_bp) {
	1965	/*
	1966	* can get here if we either encountered an error
	1967	* or we completely zero-filled the request and
	1968	* no I/O was issued
	1969	*/
	1970	if (error) {
	1971	real_bp->b_flags \|= B_ERROR;
	1972	real_bp->b_error = error;
	1973	}
	1974	buf_biodone(real_bp);
	1975	}
	1976	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
	1977
	1978	return retval;
	1979	}
	1980
	/*
	 * Zero all of the vector-UPL run bookkeeping variables in one chained
	 * assignment.  The variables must be in scope at the point of use, and
	 * the expansion already ends with its own ';'.
	 */
	#define reset_vector_run_state()        \
		issueVectorUPL =                \
		vector_upl_offset =             \
		vector_upl_index =              \
		vector_upl_iosize =             \
		vector_upl_size = 0;
	1983
	1984	static int
	1985	vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
	1986	int io_flag, buf_t real_bp, struct clios iostate, int (callback)(buf_t, void ), void callback_arg)
	1987	{
	1988	vector_upl_set_pagelist(vector_upl);
	1989
	1990	if (io_flag & CL_READ) {
	1991	if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
	1992	io_flag &= ~CL_PRESERVE; /don't zero fill/
	1993	} else {
	1994	io_flag \|= CL_PRESERVE; /zero fill/
	1995	}
	1996	}
	1997	return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
	1998	}
	1999
	2000	static int
	2001	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (callback)(buf_t, void ), void *callback_arg, int bflag)
	2002	{
	2003	int pages_in_prefetch;
	2004
	2005	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_START,
	2006	(int)f_offset, size, (int)filesize, 0, 0);
	2007
	2008	if (f_offset >= filesize) {
	2009	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	2010	(int)f_offset, 0, 0, 0, 0);
	2011	return 0;
	2012	}
	2013	if ((off_t)size > (filesize - f_offset)) {
	2014	size = (u_int)(filesize - f_offset);
	2015	}
	2016	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
	2017
	2018	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
	2019
	2020	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	2021	(int)f_offset + size, pages_in_prefetch, 0, 1, 0);
	2022
	2023	return pages_in_prefetch;
	2024	}
	2025
	2026
	2027
	2028	static void
	2029	cluster_read_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead rap, int (callback)(buf_t, void ), void *callback_arg,
	2030	int bflag)
	2031	{
	2032	daddr64_t r_addr;
	2033	off_t f_offset;
	2034	int size_of_prefetch;
	2035	u_int max_prefetch;
	2036
	2037
	2038	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_START,
	2039	(int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
	2040
	2041	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
	2042	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2043	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
	2044	return;
	2045	}
	2046	if (rap->cl_lastr == -1 \|\| (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
	2047	rap->cl_ralen = 0;
	2048	rap->cl_maxra = 0;
	2049
	2050	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2051	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
	2052
	2053	return;
	2054	}
	2055	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
	2056
	2057	if (max_prefetch > speculative_prefetch_max) {
	2058	max_prefetch = speculative_prefetch_max;
	2059	}
	2060
	2061	if (max_prefetch <= PAGE_SIZE) {
	2062	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2063	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
	2064	return;
	2065	}
	2066	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
	2067	if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
	2068	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2069	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
	2070	return;
	2071	}
	2072	}
	2073	r_addr = MAX(extent->e_addr, rap->cl_maxra) + 1;
	2074	f_offset = (off_t)(r_addr * PAGE_SIZE_64);
	2075
	2076	size_of_prefetch = 0;
	2077
	2078	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
	2079
	2080	if (size_of_prefetch) {
	2081	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2082	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
	2083	return;
	2084	}
	2085	if (f_offset < filesize) {
	2086	daddr64_t read_size;
	2087
	2088	rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
	2089
	2090	read_size = (extent->e_addr + 1) - extent->b_addr;
	2091
	2092	if (read_size > rap->cl_ralen) {
	2093	if (read_size > max_prefetch / PAGE_SIZE) {
	2094	rap->cl_ralen = max_prefetch / PAGE_SIZE;
	2095	} else {
	2096	rap->cl_ralen = (int)read_size;
	2097	}
	2098	}
	2099	size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
	2100
	2101	if (size_of_prefetch) {
	2102	rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	2103	}
	2104	}
	2105	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2106	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
	2107	}
	2108
	2109
	2110	int
	2111	cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2112	int size, off_t filesize, int flags)
	2113	{
	2114	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
	2115	}
	2116
	2117
	2118	int
	2119	cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2120	int size, off_t filesize, int flags, int (callback)(buf_t, void ), void *callback_arg)
	2121	{
	2122	int io_size;
	2123	int rounded_size;
	2124	off_t max_size;
	2125	int local_flags;
	2126
	2127	local_flags = CL_PAGEOUT \| CL_THROTTLE;
	2128
	2129	if ((flags & UPL_IOSYNC) == 0) {
	2130	local_flags \|= CL_ASYNC;
	2131	}
	2132	if ((flags & UPL_NOCOMMIT) == 0) {
	2133	local_flags \|= CL_COMMIT;
	2134	}
	2135	if ((flags & UPL_KEEPCACHED)) {
	2136	local_flags \|= CL_KEEPCACHED;
	2137	}
	2138	if (flags & UPL_PAGING_ENCRYPTED) {
	2139	local_flags \|= CL_ENCRYPTED;
	2140	}
	2141
	2142
	2143	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) \| DBG_FUNC_NONE,
	2144	(int)f_offset, size, (int)filesize, local_flags, 0);
	2145
	2146	/*
	2147	* If they didn't specify any I/O, then we are done...
	2148	* we can't issue an abort because we don't know how
	2149	* big the upl really is
	2150	*/
	2151	if (size <= 0) {
	2152	return EINVAL;
	2153	}
	2154
	2155	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
	2156	if (local_flags & CL_COMMIT) {
	2157	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	2158	}
	2159	return EROFS;
	2160	}
	2161	/*
	2162	* can't page-in from a negative offset
	2163	* or if we're starting beyond the EOF
	2164	* or if the file offset isn't page aligned
	2165	* or the size requested isn't a multiple of PAGE_SIZE
	2166	*/
	2167	if (f_offset < 0 \|\| f_offset >= filesize \|\|
	2168	(f_offset & PAGE_MASK_64) \|\| (size & PAGE_MASK)) {
	2169	if (local_flags & CL_COMMIT) {
	2170	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	2171	}
	2172	return EINVAL;
	2173	}
	2174	max_size = filesize - f_offset;
	2175
	2176	if (size < max_size) {
	2177	io_size = size;
	2178	} else {
	2179	io_size = (int)max_size;
	2180	}
	2181
	2182	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2183
	2184	if (size > rounded_size) {
	2185	if (local_flags & CL_COMMIT) {
	2186	ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
	2187	UPL_ABORT_FREE_ON_EMPTY);
	2188	}
	2189	}
	2190	return cluster_io(vp, upl, upl_offset, f_offset, io_size,
	2191	local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	2192	}
	2193
	2194
	2195	int
	2196	cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2197	int size, off_t filesize, int flags)
	2198	{
	2199	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
	2200	}
	2201
	2202
	2203	int
	2204	cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2205	int size, off_t filesize, int flags, int (callback)(buf_t, void ), void *callback_arg)
	2206	{
	2207	u_int io_size;
	2208	int rounded_size;
	2209	off_t max_size;
	2210	int retval;
	2211	int local_flags = 0;
	2212
	2213	if (upl == NULL \|\| size < 0) {
	2214	panic("cluster_pagein: NULL upl passed in");
	2215	}
	2216
	2217	if ((flags & UPL_IOSYNC) == 0) {
	2218	local_flags \|= CL_ASYNC;
	2219	}
	2220	if ((flags & UPL_NOCOMMIT) == 0) {
	2221	local_flags \|= CL_COMMIT;
	2222	}
	2223	if (flags & UPL_IOSTREAMING) {
	2224	local_flags \|= CL_IOSTREAMING;
	2225	}
	2226	if (flags & UPL_PAGING_ENCRYPTED) {
	2227	local_flags \|= CL_ENCRYPTED;
	2228	}
	2229
	2230
	2231	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) \| DBG_FUNC_NONE,
	2232	(int)f_offset, size, (int)filesize, local_flags, 0);
	2233
	2234	/*
	2235	* can't page-in from a negative offset
	2236	* or if we're starting beyond the EOF
	2237	* or if the file offset isn't page aligned
	2238	* or the size requested isn't a multiple of PAGE_SIZE
	2239	*/
	2240	if (f_offset < 0 \|\| f_offset >= filesize \|\|
	2241	(f_offset & PAGE_MASK_64) \|\| (size & PAGE_MASK) \|\| (upl_offset & PAGE_MASK)) {
	2242	if (local_flags & CL_COMMIT) {
	2243	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR);
	2244	}
	2245	return EINVAL;
	2246	}
	2247	max_size = filesize - f_offset;
	2248
	2249	if (size < max_size) {
	2250	io_size = size;
	2251	} else {
	2252	io_size = (int)max_size;
	2253	}
	2254
	2255	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2256
	2257	if (size > rounded_size && (local_flags & CL_COMMIT)) {
	2258	ubc_upl_abort_range(upl, upl_offset + rounded_size,
	2259	size - rounded_size, UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR);
	2260	}
	2261
	2262	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
	2263	local_flags \| CL_READ \| CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	2264
	2265	return retval;
	2266	}
	2267
	2268
	2269	int
	2270	cluster_bp(buf_t bp)
	2271	{
	2272	return cluster_bp_ext(bp, NULL, NULL);
	2273	}
	2274
	2275
	2276	int
	2277	cluster_bp_ext(buf_t bp, int (callback)(buf_t, void ), void *callback_arg)
	2278	{
	2279	off_t f_offset;
	2280	int flags;
	2281
	2282	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) \| DBG_FUNC_START,
	2283	bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	2284
	2285	if (bp->b_flags & B_READ) {
	2286	flags = CL_ASYNC \| CL_READ;
	2287	} else {
	2288	flags = CL_ASYNC;
	2289	}
	2290	if (bp->b_flags & B_PASSIVE) {
	2291	flags \|= CL_PASSIVE;
	2292	}
	2293
	2294	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
	2295
	2296	return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
	2297	}
	2298
	2299
	2300
	2301	int
	2302	cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
	2303	{
	2304	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
	2305	}
	2306
	2307
	2308	int
	2309	cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
	2310	int xflags, int (callback)(buf_t, void ), void *callback_arg)
	2311	{
	2312	user_ssize_t cur_resid;
	2313	int retval = 0;
	2314	int flags;
	2315	int zflags;
	2316	int bflag;
	2317	int write_type = IO_COPY;
	2318	u_int32_t write_length;
	2319
	2320	flags = xflags;
	2321
	2322	if (flags & IO_PASSIVE) {
	2323	bflag = CL_PASSIVE;
	2324	} else {
	2325	bflag = 0;
	2326	}
	2327
	2328	if (vp->v_flag & VNOCACHE_DATA) {
	2329	flags \|= IO_NOCACHE;
	2330	bflag \|= CL_NOCACHE;
	2331	}
	2332	if (uio == NULL) {
	2333	/*
	2334	* no user data...
	2335	* this call is being made to zero-fill some range in the file
	2336	*/
	2337	retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
	2338
	2339	return retval;
	2340	}
	2341	/*
	2342	* do a write through the cache if one of the following is true....
	2343	* NOCACHE is not true or NODIRECT is true
	2344	* the uio request doesn't target USERSPACE
	2345	* otherwise, find out if we want the direct or contig variant for
	2346	* the first vector in the uio request
	2347	*/
	2348	if (((flags & (IO_NOCACHE \| IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
	2349	retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	2350	}
	2351
	2352	if ((flags & (IO_TAILZEROFILL \| IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
	2353	/*
	2354	* must go through the cached variant in this case
	2355	*/
	2356	write_type = IO_COPY;
	2357	}
	2358
	2359	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
	2360	switch (write_type) {
	2361	case IO_COPY:
	2362	/*
	2363	* make sure the uio_resid isn't too big...
	2364	* internally, we want to handle all of the I/O in
	2365	* chunk sizes that fit in a 32 bit int
	2366	*/
	2367	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
	2368	/*
	2369	* we're going to have to call cluster_write_copy
	2370	* more than once...
	2371	*
	2372	* only want the last call to cluster_write_copy to
	2373	* have the IO_TAILZEROFILL flag set and only the
	2374	* first call should have IO_HEADZEROFILL
	2375	*/
	2376	zflags = flags & ~IO_TAILZEROFILL;
	2377	flags &= ~IO_HEADZEROFILL;
	2378
	2379	write_length = MAX_IO_REQUEST_SIZE;
	2380	} else {
	2381	/*
	2382	* last call to cluster_write_copy
	2383	*/
	2384	zflags = flags;
	2385
	2386	write_length = (u_int32_t)cur_resid;
	2387	}
	2388	retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
	2389	break;
	2390
	2391	case IO_CONTIG:
	2392	zflags = flags & ~(IO_TAILZEROFILL \| IO_HEADZEROFILL);
	2393
	2394	if (flags & IO_HEADZEROFILL) {
	2395	/*
	2396	* only do this once per request
	2397	*/
	2398	flags &= ~IO_HEADZEROFILL;
	2399
	2400	retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
	2401	headOff, (off_t)0, zflags \| IO_HEADZEROFILL \| IO_SYNC, callback, callback_arg);
	2402	if (retval) {
	2403	break;
	2404	}
	2405	}
	2406	retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
	2407
	2408	if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
	2409	/*
	2410	* we're done with the data from the user specified buffer(s)
	2411	* and we've been requested to zero fill at the tail
	2412	* treat this as an IO_HEADZEROFILL which doesn't require a uio
	2413	* by rearranging the args and passing in IO_HEADZEROFILL
	2414	*/
	2415	retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
	2416	(off_t)0, zflags \| IO_HEADZEROFILL \| IO_SYNC, callback, callback_arg);
	2417	}
	2418	break;
	2419
	2420	case IO_DIRECT:
	2421	/*
	2422	* cluster_write_direct is never called with IO_TAILZEROFILL \|\| IO_HEADZEROFILL
	2423	*/
	2424	retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
	2425	break;
	2426
	2427	case IO_UNKNOWN:
	2428	retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	2429	break;
	2430	}
	2431	/*
	2432	* in case we end up calling cluster_write_copy (from cluster_write_direct)
	2433	* multiple times to service a multi-vector request that is not aligned properly
	2434	* we need to update the oldEOF so that we
	2435	* don't zero-fill the head of a page if we've successfully written
	2436	* data to that area... 'cluster_write_copy' will zero-fill the head of a
	2437	* page that is beyond the oldEOF if the write is unaligned... we only
	2438	* want that to happen for the very first page of the cluster_write,
	2439	* NOT the first page of each vector making up a multi-vector write.
	2440	*/
	2441	if (uio->uio_offset > oldEOF) {
	2442	oldEOF = uio->uio_offset;
	2443	}
	2444	}
	2445	return retval;
	2446	}
	2447
	2448
	2449	static int
	2450	cluster_write_direct(vnode_t vp, struct uio uio, off_t oldEOF, off_t newEOF, int write_type, u_int32_t *write_length,
	2451	int flags, int (callback)(buf_t, void ), void *callback_arg)
	2452	{
	2453	upl_t upl;
	2454	upl_page_info_t *pl;
	2455	vm_offset_t upl_offset;
	2456	vm_offset_t vector_upl_offset = 0;
	2457	u_int32_t io_req_size;
	2458	u_int32_t offset_in_file;
	2459	u_int32_t offset_in_iovbase;
	2460	u_int32_t io_size;
	2461	int io_flag = 0;
	2462	upl_size_t upl_size, vector_upl_size = 0;
	2463	vm_size_t upl_needed_size;
	2464	mach_msg_type_number_t pages_in_pl;
	2465	upl_control_flags_t upl_flags;
	2466	kern_return_t kret;
	2467	mach_msg_type_number_t i;
	2468	int force_data_sync;
	2469	int retval = 0;
	2470	int first_IO = 1;
	2471	struct clios iostate;
	2472	user_addr_t iov_base;
	2473	u_int32_t mem_alignment_mask;
	2474	u_int32_t devblocksize;
	2475	u_int32_t max_io_size;
	2476	u_int32_t max_upl_size;
	2477	u_int32_t max_vector_size;
	2478	u_int32_t bytes_outstanding_limit;
	2479	boolean_t io_throttled = FALSE;
	2480
	2481	u_int32_t vector_upl_iosize = 0;
	2482	int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	2483	off_t v_upl_uio_offset = 0;
	2484	int vector_upl_index = 0;
	2485	upl_t vector_upl = NULL;
	2486
	2487
	2488	/*
	2489	* When we enter this routine, we know
	2490	* -- the resid will not exceed iov_len
	2491	*/
	2492	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_START,
	2493	(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
	2494
	2495	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
	2496
	2497	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
	2498
	2499	io_flag = CL_ASYNC \| CL_PRESERVE \| CL_COMMIT \| CL_THROTTLE \| CL_DIRECT_IO;
	2500
	2501	if (flags & IO_PASSIVE) {
	2502	io_flag \|= CL_PASSIVE;
	2503	}
	2504
	2505	if (flags & IO_NOCACHE) {
	2506	io_flag \|= CL_NOCACHE;
	2507	}
	2508
	2509	if (flags & IO_SKIP_ENCRYPTION) {
	2510	io_flag \|= CL_ENCRYPTED;
	2511	}
	2512
	2513	iostate.io_completed = 0;
	2514	iostate.io_issued = 0;
	2515	iostate.io_error = 0;
	2516	iostate.io_wanted = 0;
	2517
	2518	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
	2519
	2520	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	2521	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	2522
	2523	if (devblocksize == 1) {
	2524	/*
	2525	* the AFP client advertises a devblocksize of 1
	2526	* however, its BLOCKMAP routine maps to physical
	2527	* blocks that are PAGE_SIZE in size...
	2528	* therefore we can't ask for I/Os that aren't page aligned
	2529	* or aren't multiples of PAGE_SIZE in size
	2530	* by setting devblocksize to PAGE_SIZE, we re-instate
	2531	* the old behavior we had before the mem_alignment_mask
	2532	* changes went in...
	2533	*/
	2534	devblocksize = PAGE_SIZE;
	2535	}
	2536
	2537	next_dwrite:
	2538	io_req_size = *write_length;
	2539	iov_base = uio_curriovbase(uio);
	2540
	2541	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	2542	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	2543
	2544	if (offset_in_file \|\| offset_in_iovbase) {
	2545	/*
	2546	* one of the 2 important offsets is misaligned
	2547	* so fire an I/O through the cache for this entire vector
	2548	*/
	2549	goto wait_for_dwrites;
	2550	}
	2551	if (iov_base & (devblocksize - 1)) {
	2552	/*
	2553	* the offset in memory must be on a device block boundary
	2554	* so that we can guarantee that we can generate an
	2555	* I/O that ends on a page boundary in cluster_io
	2556	*/
	2557	goto wait_for_dwrites;
	2558	}
	2559
	2560	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
	2561	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
	2562	int throttle_type;
	2563
	2564	if ((throttle_type = cluster_is_throttled(vp))) {
	2565	/*
	2566	* we're in the throttle window, at the very least
	2567	* we want to limit the size of the I/O we're about
	2568	* to issue
	2569	*/
	2570	if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
	2571	/*
	2572	* we're in the throttle window and at least 1 I/O
	2573	* has already been issued by a throttleable thread
	2574	* in this window, so return with EAGAIN to indicate
	2575	* to the FS issuing the cluster_write call that it
	2576	* should now throttle after dropping any locks
	2577	*/
	2578	throttle_info_update_by_mount(vp->v_mount);
	2579
	2580	io_throttled = TRUE;
	2581	goto wait_for_dwrites;
	2582	}
	2583	max_vector_size = THROTTLE_MAX_IOSIZE;
	2584	max_io_size = THROTTLE_MAX_IOSIZE;
	2585	} else {
	2586	max_vector_size = MAX_VECTOR_UPL_SIZE;
	2587	max_io_size = max_upl_size;
	2588	}
	2589
	2590	if (first_IO) {
	2591	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
	2592	first_IO = 0;
	2593	}
	2594	io_size = io_req_size & ~PAGE_MASK;
	2595	iov_base = uio_curriovbase(uio);
	2596
	2597	if (io_size > max_io_size) {
	2598	io_size = max_io_size;
	2599	}
	2600
	2601	if (useVectorUPL && (iov_base & PAGE_MASK)) {
	2602	/*
	2603	* We have an iov_base that's not page-aligned.
	2604	* Issue all I/O's that have been collected within
	2605	* this Vectored UPL.
	2606	*/
	2607	if (vector_upl_index) {
	2608	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2609	reset_vector_run_state();
	2610	}
	2611
	2612	/*
	2613	* After this point, if we are using the Vector UPL path and the base is
	2614	* not page-aligned then the UPL with that base will be the first in the vector UPL.
	2615	*/
	2616	}
	2617
	2618	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	2619	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2620
	2621	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_START,
	2622	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	2623
	2624	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	2625	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	2626	pages_in_pl = 0;
	2627	upl_size = (upl_size_t)upl_needed_size;
	2628	upl_flags = UPL_FILE_IO \| UPL_COPYOUT_FROM \| UPL_NO_SYNC \|
	2629	UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	2630
	2631	kret = vm_map_get_upl(map,
	2632	vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	2633	&upl_size,
	2634	&upl,
	2635	NULL,
	2636	&pages_in_pl,
	2637	&upl_flags,
	2638	VM_KERN_MEMORY_FILE,
	2639	force_data_sync);
	2640
	2641	if (kret != KERN_SUCCESS) {
	2642	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	2643	0, 0, 0, kret, 0);
	2644	/*
	2645	* failed to get pagelist
	2646	*
	2647	* we may have already spun some portion of this request
	2648	* off as async requests... we need to wait for the I/O
	2649	* to complete before returning
	2650	*/
	2651	goto wait_for_dwrites;
	2652	}
	2653	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	2654	pages_in_pl = upl_size / PAGE_SIZE;
	2655
	2656	for (i = 0; i < pages_in_pl; i++) {
	2657	if (!upl_valid_page(pl, i)) {
	2658	break;
	2659	}
	2660	}
	2661	if (i == pages_in_pl) {
	2662	break;
	2663	}
	2664
	2665	/*
	2666	* didn't get all the pages back that we
	2667	* needed... release this upl and try again
	2668	*/
	2669	ubc_upl_abort(upl, 0);
	2670	}
	2671	if (force_data_sync >= 3) {
	2672	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	2673	i, pages_in_pl, upl_size, kret, 0);
	2674	/*
	2675	* for some reason, we couldn't acquire a hold on all
	2676	* the pages needed in the user's address space
	2677	*
	2678	* we may have already spun some portion of this request
	2679	* off as async requests... we need to wait for the I/O
	2680	* to complete before returning
	2681	*/
	2682	goto wait_for_dwrites;
	2683	}
	2684
	2685	/*
	2686	* Consider the possibility that upl_size wasn't satisfied.
	2687	*/
	2688	if (upl_size < upl_needed_size) {
	2689	if (upl_size && upl_offset == 0) {
	2690	io_size = upl_size;
	2691	} else {
	2692	io_size = 0;
	2693	}
	2694	}
	2695	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	2696	(int)upl_offset, upl_size, (int)iov_base, io_size, 0);
	2697
	2698	if (io_size == 0) {
	2699	ubc_upl_abort(upl, 0);
	2700	/*
	2701	* we may have already spun some portion of this request
	2702	* off as async requests... we need to wait for the I/O
	2703	* to complete before returning
	2704	*/
	2705	goto wait_for_dwrites;
	2706	}
	2707
	2708	if (useVectorUPL) {
	2709	vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
	2710	if (end_off) {
	2711	issueVectorUPL = 1;
	2712	}
	2713	/*
	2714	* After this point, if we are using a vector UPL, then
	2715	* either all the UPL elements end on a page boundary OR
	2716	* this UPL is the last element because it does not end
	2717	* on a page boundary.
	2718	*/
	2719	}
	2720
	2721	/*
	2722	* we want push out these writes asynchronously so that we can overlap
	2723	* the preparation of the next I/O
	2724	* if there are already too many outstanding writes
	2725	* wait until some complete before issuing the next
	2726	*/
	2727	if (vp->v_mount->mnt_minsaturationbytecount) {
	2728	bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
	2729	} else {
	2730	bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
	2731	}
	2732
	2733	cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
	2734
	2735	if (iostate.io_error) {
	2736	/*
	2737	* one of the earlier writes we issued ran into a hard error
	2738	* don't issue any more writes, cleanup the UPL
	2739	* that was just created but not used, then
	2740	* go wait for all writes that are part of this stream
	2741	* to complete before returning the error to the caller
	2742	*/
	2743	ubc_upl_abort(upl, 0);
	2744
	2745	goto wait_for_dwrites;
	2746	}
	2747
	2748	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) \| DBG_FUNC_START,
	2749	(int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
	2750
	2751	if (!useVectorUPL) {
	2752	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
	2753	io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2754	} else {
	2755	if (!vector_upl_index) {
	2756	vector_upl = vector_upl_create(upl_offset);
	2757	v_upl_uio_offset = uio->uio_offset;
	2758	vector_upl_offset = upl_offset;
	2759	}
	2760
	2761	vector_upl_set_subupl(vector_upl, upl, upl_size);
	2762	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
	2763	vector_upl_index++;
	2764	vector_upl_iosize += io_size;
	2765	vector_upl_size += upl_size;
	2766
	2767	if (issueVectorUPL \|\| vector_upl_index == MAX_VECTOR_UPL_ELEMENTS \|\| vector_upl_size >= max_vector_size) {
	2768	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2769	reset_vector_run_state();
	2770	}
	2771	}
	2772
	2773	/*
	2774	* update the uio structure to
	2775	* reflect the I/O that we just issued
	2776	*/
	2777	uio_update(uio, (user_size_t)io_size);
	2778
	2779	/*
	2780	* in case we end up calling through to cluster_write_copy to finish
	2781	* the tail of this request, we need to update the oldEOF so that we
	2782	* don't zero-fill the head of a page if we've successfully written
	2783	* data to that area... 'cluster_write_copy' will zero-fill the head of a
	2784	* page that is beyond the oldEOF if the write is unaligned... we only
	2785	* want that to happen for the very first page of the cluster_write,
	2786	* NOT the first page of each vector making up a multi-vector write.
	2787	*/
	2788	if (uio->uio_offset > oldEOF) {
	2789	oldEOF = uio->uio_offset;
	2790	}
	2791
	2792	io_req_size -= io_size;
	2793
	2794	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) \| DBG_FUNC_END,
	2795	(int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
	2796	} /* end while */
	2797
	2798	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
	2799	retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
	2800
	2801	if (retval == 0 && *write_type == IO_DIRECT) {
	2802	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_NONE,
	2803	(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
	2804
	2805	goto next_dwrite;
	2806	}
	2807	}
	2808
	2809	wait_for_dwrites:
	2810
	2811	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
	2812	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2813	reset_vector_run_state();
	2814	}
	2815	/*
	2816	* make sure all async writes issued as part of this stream
	2817	* have completed before we return
	2818	*/
	2819	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
	2820
	2821	if (iostate.io_error) {
	2822	retval = iostate.io_error;
	2823	}
	2824
	2825	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	2826
	2827	if (io_throttled == TRUE && retval == 0) {
	2828	retval = EAGAIN;
	2829	}
	2830
	2831	if (io_req_size && retval == 0) {
	2832	/*
	2833	* we couldn't handle the tail of this request in DIRECT mode
	2834	* so fire it through the copy path
	2835	*
	2836	* note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
	2837	* so we can just pass 0 in for the headOff and tailOff
	2838	*/
	2839	if (uio->uio_offset > oldEOF) {
	2840	oldEOF = uio->uio_offset;
	2841	}
	2842
	2843	retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
	2844
	2845	*write_type = IO_UNKNOWN;
	2846	}
	2847	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_END,
	2848	(int)uio->uio_offset, io_req_size, retval, 4, 0);
	2849
	2850	return retval;
	2851	}
	2852
	2853
	2854	static int
	2855	cluster_write_contig(vnode_t vp, struct uio uio, off_t newEOF, int write_type, u_int32_t *write_length,
	2856	int (callback)(buf_t, void ), void *callback_arg, int bflag)
	2857	{
	2858	upl_page_info_t *pl;
	2859	addr64_t src_paddr = 0;
	2860	upl_t upl[MAX_VECTS];
	2861	vm_offset_t upl_offset;
	2862	u_int32_t tail_size = 0;
	2863	u_int32_t io_size;
	2864	u_int32_t xsize;
	2865	upl_size_t upl_size;
	2866	vm_size_t upl_needed_size;
	2867	mach_msg_type_number_t pages_in_pl;
	2868	upl_control_flags_t upl_flags;
	2869	kern_return_t kret;
	2870	struct clios iostate;
	2871	int error = 0;
	2872	int cur_upl = 0;
	2873	int num_upl = 0;
	2874	int n;
	2875	user_addr_t iov_base;
	2876	u_int32_t devblocksize;
	2877	u_int32_t mem_alignment_mask;
	2878
	2879	/*
	2880	* When we enter this routine, we know
	2881	* -- the io_req_size will not exceed iov_len
	2882	* -- the target address is physically contiguous
	2883	*/
	2884	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
	2885
	2886	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	2887	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	2888
	2889	iostate.io_completed = 0;
	2890	iostate.io_issued = 0;
	2891	iostate.io_error = 0;
	2892	iostate.io_wanted = 0;
	2893
	2894	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
	2895
	2896	next_cwrite:
	2897	io_size = *write_length;
	2898
	2899	iov_base = uio_curriovbase(uio);
	2900
	2901	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	2902	upl_needed_size = upl_offset + io_size;
	2903
	2904	pages_in_pl = 0;
	2905	upl_size = (upl_size_t)upl_needed_size;
	2906	upl_flags = UPL_FILE_IO \| UPL_COPYOUT_FROM \| UPL_NO_SYNC \|
	2907	UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	2908
	2909	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	2910	kret = vm_map_get_upl(map,
	2911	vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	2912	&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
	2913
	2914	if (kret != KERN_SUCCESS) {
	2915	/*
	2916	* failed to get pagelist
	2917	*/
	2918	error = EINVAL;
	2919	goto wait_for_cwrites;
	2920	}
	2921	num_upl++;
	2922
	2923	/*
	2924	* Consider the possibility that upl_size wasn't satisfied.
	2925	*/
	2926	if (upl_size < upl_needed_size) {
	2927	/*
	2928	* This is a failure in the physical memory case.
	2929	*/
	2930	error = EINVAL;
	2931	goto wait_for_cwrites;
	2932	}
	2933	pl = ubc_upl_pageinfo(upl[cur_upl]);
	2934
	2935	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
	2936
	2937	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	2938	u_int32_t head_size;
	2939
	2940	head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
	2941
	2942	if (head_size > io_size) {
	2943	head_size = io_size;
	2944	}
	2945
	2946	error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
	2947
	2948	if (error) {
	2949	goto wait_for_cwrites;
	2950	}
	2951
	2952	upl_offset += head_size;
	2953	src_paddr += head_size;
	2954	io_size -= head_size;
	2955
	2956	iov_base += head_size;
	2957	}
	2958	if ((u_int32_t)iov_base & mem_alignment_mask) {
	2959	/*
	2960	* request doesn't set up on a memory boundary
	2961	* the underlying DMA engine can handle...
	2962	* return an error instead of going through
	2963	* the slow copy path since the intent of this
	2964	* path is direct I/O from device memory
	2965	*/
	2966	error = EINVAL;
	2967	goto wait_for_cwrites;
	2968	}
	2969
	2970	tail_size = io_size & (devblocksize - 1);
	2971	io_size -= tail_size;
	2972
	2973	while (io_size && error == 0) {
	2974	if (io_size > MAX_IO_CONTIG_SIZE) {
	2975	xsize = MAX_IO_CONTIG_SIZE;
	2976	} else {
	2977	xsize = io_size;
	2978	}
	2979	/*
	2980	* request asynchronously so that we can overlap
	2981	* the preparation of the next I/O... we'll do
	2982	* the commit after all the I/O has completed
	2983	* since its all issued against the same UPL
	2984	* if there are already too many outstanding writes
	2985	* wait until some have completed before issuing the next
	2986	*/
	2987	cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
	2988
	2989	if (iostate.io_error) {
	2990	/*
	2991	* one of the earlier writes we issued ran into a hard error
	2992	* don't issue any more writes...
	2993	* go wait for all writes that are part of this stream
	2994	* to complete before returning the error to the caller
	2995	*/
	2996	goto wait_for_cwrites;
	2997	}
	2998	/*
	2999	* issue an asynchronous write to cluster_io
	3000	*/
	3001	error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
	3002	xsize, CL_DEV_MEMORY \| CL_ASYNC \| bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
	3003
	3004	if (error == 0) {
	3005	/*
	3006	* The cluster_io write completed successfully,
	3007	* update the uio structure
	3008	*/
	3009	uio_update(uio, (user_size_t)xsize);
	3010
	3011	upl_offset += xsize;
	3012	src_paddr += xsize;
	3013	io_size -= xsize;
	3014	}
	3015	}
	3016	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
	3017	error = cluster_io_type(uio, write_type, write_length, 0);
	3018
	3019	if (error == 0 && *write_type == IO_CONTIG) {
	3020	cur_upl++;
	3021	goto next_cwrite;
	3022	}
	3023	} else {
	3024	*write_type = IO_UNKNOWN;
	3025	}
	3026
	3027	wait_for_cwrites:
	3028	/*
	3029	* make sure all async writes that are part of this stream
	3030	* have completed before we proceed
	3031	*/
	3032	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
	3033
	3034	if (iostate.io_error) {
	3035	error = iostate.io_error;
	3036	}
	3037
	3038	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	3039
	3040	if (error == 0 && tail_size) {
	3041	error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
	3042	}
	3043
	3044	for (n = 0; n < num_upl; n++) {
	3045	/*
	3046	* just release our hold on each physically contiguous
	3047	* region without changing any state
	3048	*/
	3049	ubc_upl_abort(upl[n], 0);
	3050	}
	3051
	3052	return error;
	3053	}
	3054
	3055
	3056	/*
	3057	* need to avoid a race between an msync of a range of pages dirtied via mmap
	3058	* vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
	3059	* zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
	3060	*
	3061	* we should never force-zero-fill pages that are already valid in the cache...
	3062	* the entire page contains valid data (either from disk, zero-filled or dirtied
	3063	* via an mmap) so we can only do damage by trying to zero-fill
	3064	*
	3065	*/
	3066	static int
	3067	cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
	3068	{
	3069	int zero_pg_index;
	3070	boolean_t need_cluster_zero = TRUE;
	3071
	3072	if ((flags & (IO_NOZEROVALID \| IO_NOZERODIRTY))) {
	3073	bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
	3074	zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
	3075
	3076	if (upl_valid_page(pl, zero_pg_index)) {
	3077	/*
	3078	* never force zero valid pages - dirty or clean
	3079	* we'll leave these in the UPL for cluster_write_copy to deal with
	3080	*/
	3081	need_cluster_zero = FALSE;
	3082	}
	3083	}
	3084	if (need_cluster_zero == TRUE) {
	3085	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	3086	}
	3087
	3088	return bytes_to_zero;
	3089	}
	3090
	3091
	3092	void
	3093	cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
	3094	{
	3095	struct cl_extent cl;
	3096	boolean_t first_pass = TRUE;
	3097
	3098	assert(s_offset < e_offset);
	3099	assert((s_offset & PAGE_MASK_64) == 0);
	3100	assert((e_offset & PAGE_MASK_64) == 0);
	3101
	3102	cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
	3103	cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
	3104
	3105	cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
	3106	vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
	3107	}
	3108
	3109
	/*
	 * cluster_update_state_internal
	 *
	 * Fold the page extent 'cl' (b_addr/e_addr are page numbers) into the
	 * vnode's write-behind state.
	 *
	 * The write-behind lock is obtained via cluster_get_wbp() with
	 * CLW_ALLOCATE | CLW_RETURNLOCKED and is dropped on every exit path.
	 *
	 * In order, the routine:
	 *   - if a sparse cluster map (cl_scmap) exists: adds the extent to it
	 *     and returns, unless IO_NOCACHE is set, in which case the sparse
	 *     map is pushed and a new normal cluster is started;
	 *   - on the first pass (*first_pass == TRUE) updates the sequential
	 *     write tracking (cl_seq_written / cl_last_write) from
	 *     write_off / write_cnt and clears *first_pass;
	 *   - tries to merge the extent into the existing clusters; a partial
	 *     merge clips 'cl' in place (cl->b_addr or cl->e_addr is modified);
	 *   - if nothing absorbed the whole extent, optionally pushes clusters
	 *     (only when defer_writes == FALSE), then either starts a new
	 *     cluster or switches to the sparse cluster mechanism.
	 *
	 * 'flags' is consulted for IO_NOCACHE and IO_PASSIVE; callback,
	 * callback_arg, newEOF and vm_initiated are passed through to the
	 * push / sparse-cluster helpers.
	 */
	3110	static void
	3111	cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
	3112	boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
	3113	int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	3114	{
	3115	struct cl_writebehind *wbp;
	3116	int cl_index;
	3117	int ret_cluster_try_push;
	3118	u_int max_cluster_pgcount;
	3119
	3120
	3121	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	3122
	3123	/*
	3124	* take the lock to protect our accesses
	3125	* of the writebehind and sparse cluster state
	3126	*/
	3127	wbp = cluster_get_wbp(vp, CLW_ALLOCATE \| CLW_RETURNLOCKED);
	3128
	3129	if (wbp->cl_scmap) {
	3130	if (!(flags & IO_NOCACHE)) {
	3131	/*
	3132	* we've fallen into the sparse
	3133	* cluster method of delaying dirty pages
	3134	*/
	3135	sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
	3136
	3137	lck_mtx_unlock(&wbp->cl_lockw);
	3138	return;
	3139	}
	3140	/*
	3141	* must have done cached writes that fell into
	3142	* the sparse cluster mechanism... we've switched
	3143	* to uncached writes on the file, so go ahead
	3144	* and push whatever's in the sparse map
	3145	* and switch back to normal clustering
	3146	*/
	3147	wbp->cl_number = 0;
	3148
	3149	sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
	3150	/*
	3151	* no clusters of either type present at this point
	3152	* so just go directly to start_new_cluster since
	3153	* we know we need to delay this I/O since we've
	3154	* already released the pages back into the cache
	3155	* to avoid the deadlock with sparse_cluster_push
	3156	*/
	3157	goto start_new_cluster;
	3158	}
	3159	if (*first_pass == TRUE) {
	3160	if (write_off == wbp->cl_last_write) {
	3161	wbp->cl_seq_written += write_cnt;
	3162	} else {
	3163	wbp->cl_seq_written = write_cnt;
	3164	}
	3165
	3166	wbp->cl_last_write = write_off + write_cnt;
	3167
	3168	*first_pass = FALSE;
	3169	}
	3170	if (wbp->cl_number == 0) {
	3171	/*
	3172	* no clusters currently present
	3173	*/
	3174	goto start_new_cluster;
	3175	}
	3176
	3177	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	3178	/*
	3179	* check each cluster that we currently hold
	3180	* try to merge some or all of this write into
	3181	* one or more of the existing clusters... if
	3182	* any portion of the write remains, start a
	3183	* new cluster
	3184	*/
	3185	if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
	3186	/*
	3187	* the current write starts at or after the current cluster
	3188	*/
	3189	if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
	3190	/*
	3191	* we have a write that fits entirely
	3192	* within the existing cluster limits
	3193	*/
	3194	if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
	3195	/*
	3196	* update our idea of where the cluster ends
	3197	*/
	3198	wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
	3199	}
	3200	break;
	3201	}
	3202	if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
	3203	/*
	3204	* we have a write that starts in the middle of the current cluster
	3205	* but extends beyond the cluster's limit... we know this because
	3206	* of the previous checks
	3207	* we'll extend the current cluster to the max
	3208	* and update the b_addr for the current write to reflect that
	3209	* the head of it was absorbed into this cluster...
	3210	* note that we'll always have a leftover tail in this case since
	3211	* full absorption would have occurred in the clause above
	3212	*/
	3213	wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
	3214
	3215	cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
	3216	}
	3217	/*
	3218	* we come here for the case where the current write starts
	3219	* beyond the limit of the existing cluster or we have a leftover
	3220	* tail after a partial absorption
	3221	*
	3222	* in either case, we'll check the remaining clusters before
	3223	* starting a new one
	3224	*/
	3225	} else {
	3226	/*
	3227	* the current write starts in front of the cluster we're currently considering
	3228	*/
	3229	if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
	3230	/*
	3231	* we can just merge the new request into
	3232	* this cluster and leave it in the cache
	3233	* since the resulting cluster is still
	3234	* less than the maximum allowable size
	3235	*/
	3236	wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
	3237
	3238	if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
	3239	/*
	3240	* the current write completely
	3241	* envelops the existing cluster and since
	3242	* each write is limited to at most max_cluster_pgcount pages
	3243	* we can just use the start and last blocknos of the write
	3244	* to generate the cluster limits
	3245	*/
	3246	wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
	3247	}
	3248	break;
	3249	}
	3250	/*
	3251	* if we were to combine this write with the current cluster
	3252	* we would exceed the cluster size limit.... so,
	3253	* let's see if there's any overlap of the new I/O with
	3254	* the cluster we're currently considering... in fact, we'll
	3255	* stretch the cluster out to its full limit and see if we
	3256	* get an intersection with the current write
	3257	*
	3258	*/
	3259	if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
	3260	/*
	3261	* the current write extends into the proposed cluster
	3262	* clip the length of the current write after first combining its
	3263	* tail with the newly shaped cluster
	3264	*/
	3265	wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
	3266
	3267	cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
	3268	}
	3269	/*
	3270	* if we get here, there was no way to merge
	3271	* any portion of this write with this cluster
	3272	* or we could only merge part of it which
	3273	* will leave a tail...
	3274	* we'll check the remaining clusters before starting a new one
	3275	*/
	3276	}
	3277	}
	3278	if (cl_index < wbp->cl_number) {
	3279	/*
	3280	* we found an existing cluster(s) that we
	3281	* could entirely merge this I/O into
	3282	*/
	3283	goto delay_io;
	3284	}
	3285
	3286	if (defer_writes == FALSE &&
	3287	wbp->cl_number == MAX_CLUSTERS &&
	3288	wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
	3289	uint32_t n;
	3290
	3291	if (vp->v_mount->mnt_minsaturationbytecount) {
	3292	n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
	3293
	3294	if (n > MAX_CLUSTERS) {
	3295	n = MAX_CLUSTERS;
	3296	}
	3297	} else {
	3298	n = 0;
	3299	}
	3300
	3301	if (n == 0) {
	3302	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
	3303	n = WRITE_BEHIND_SSD;
	3304	} else {
	3305	n = WRITE_BEHIND;
	3306	}
	3307	}
	3308	while (n--) {
	3309	cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
	3310	}
	3311	}
	3312	if (wbp->cl_number < MAX_CLUSTERS) {
	3313	/*
	3314	* we didn't find an existing cluster to
	3315	* merge into, but there's room to start
	3316	* a new one
	3317	*/
	3318	goto start_new_cluster;
	3319	}
	3320	/*
	3321	* no existing cluster to merge with and no
	3322	* room to start a new one... we'll try
	3323	* pushing one of the existing ones... if none of
	3324	* them are able to be pushed, we'll switch
	3325	* to the sparse cluster mechanism
	3326	* cluster_try_push updates cl_number to the
	3327	* number of remaining clusters... and
	3328	* returns the number of currently unused clusters
	3329	*/
	3330	ret_cluster_try_push = 0;
	3331
	3332	/*
	3333	* if writes are not deferred, call cluster push immediately
	3334	*/
	3335	if (defer_writes == FALSE) {
	3336	ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
	3337	}
	3338	/*
	3339	* execute following regardless of writes being deferred or not
	3340	*/
	3341	if (ret_cluster_try_push == 0) {
	3342	/*
	3343	* no more room in the normal cluster mechanism
	3344	* so let's switch to the more expansive but expensive
	3345	* sparse mechanism....
	3346	*/
	3347	sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
	3348	sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
	3349
	3350	lck_mtx_unlock(&wbp->cl_lockw);
	3351	return;
	3352	}
	3353	start_new_cluster:
	3354	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
	3355	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
	3356
	3357	wbp->cl_clusters[wbp->cl_number].io_flags = 0;
	3358
	3359	if (flags & IO_NOCACHE) {
	3360	wbp->cl_clusters[wbp->cl_number].io_flags \|= CLW_IONOCACHE;
	3361	}
	3362
	3363	if (flags & IO_PASSIVE) {
	3364	wbp->cl_clusters[wbp->cl_number].io_flags \|= CLW_IOPASSIVE;
	3365	}
	3366
	3367	wbp->cl_number++;
	3368	delay_io:
	3369	lck_mtx_unlock(&wbp->cl_lockw);
	3370	return;
	3371	}
	3372
	3373
	/*
	 * cluster_write_copy
	 *
	 * Cached ("copy") write path: moves up to io_req_size bytes from 'uio'
	 * into UBC pages obtained through UPLs and zero-fills an optional head
	 * range and an optional tail range around the data.
	 *
	 * 'uio' may be NULL; in that case only head zero-fill (IO_HEADZEROFILL,
	 * from headOff up to newEOF) is performed, and the routine returns 0
	 * immediately if there is nothing to zero.
	 *
	 * Zero-fill ranges:
	 *   - head: [headOff, uio_offset) when IO_HEADZEROFILL is set; otherwise,
	 *     when the write starts beyond oldEOF on a page that begins at or
	 *     after oldEOF, the start of that page up to uio_offset;
	 *   - tail: [end of write, tailOff) when IO_TAILZEROFILL is set;
	 *     otherwise, when the write ends exactly at a non page-aligned
	 *     newEOF that is beyond oldEOF, the remainder of that page.
	 *
	 * Each loop pass handles at most max_io_size bytes (taken from
	 * cluster_max_io_size(vp->v_mount, CL_WRITE)):
	 *   1. when 'uio' is present and none of IO_SYNC / IO_HEADZEROFILL /
	 *      IO_TAILZEROFILL is set, first try cluster_copy_ubc_data_internal();
	 *   2. otherwise (or for what remains) create a UPL, synchronously
	 *      pre-read a partially written first/last page that lies below
	 *      oldEOF and is not valid, zero the head, copy the data, zero the
	 *      tail, then commit the UPL with UPL_COMMIT_SET_DIRTY;
	 *   3. hand the page extent either to cluster_push_now() (IO_SYNC) or to
	 *      cluster_update_state_internal() (delayed write).
	 *
	 * Panics if ubc_create_upl_kernel() fails.  Returns 0 on success or the
	 * first error reported by a helper.
	 */
	3374	static int
	3375	cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
	3376	off_t tailOff, int flags, int (callback)(buf_t, void ), void *callback_arg)
	3377	{
	3378	upl_page_info_t *pl;
	3379	upl_t upl;
	3380	vm_offset_t upl_offset = 0;
	3381	vm_size_t upl_size;
	3382	off_t upl_f_offset;
	3383	int pages_in_upl;
	3384	int start_offset;
	3385	int xfer_resid;
	3386	int io_size;
	3387	int io_offset;
	3388	int bytes_to_zero;
	3389	int bytes_to_move;
	3390	kern_return_t kret;
	3391	int retval = 0;
	3392	int io_resid;
	3393	long long total_size;
	3394	long long zero_cnt;
	3395	off_t zero_off;
	3396	long long zero_cnt1;
	3397	off_t zero_off1;
	3398	off_t write_off = 0;
	3399	int write_cnt = 0;
	3400	boolean_t first_pass = FALSE;
	3401	struct cl_extent cl;
	3402	int bflag;
	3403	u_int max_io_size;
	3404
	3405	if (uio) {
	3406	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	3407	(int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
	3408
	3409	io_resid = io_req_size;
	3410	} else {
	3411	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	3412	0, 0, (int)oldEOF, (int)newEOF, 0);
	3413
	3414	io_resid = 0;
	3415	}
	3416	if (flags & IO_PASSIVE) {
	3417	bflag = CL_PASSIVE;
	3418	} else {
	3419	bflag = 0;
	3420	}
	3421	if (flags & IO_NOCACHE) {
	3422	bflag \|= CL_NOCACHE;
	3423	}
	3424
	3425	if (flags & IO_SKIP_ENCRYPTION) {
	3426	bflag \|= CL_ENCRYPTED;
	3427	}
	3428
	3429	zero_cnt = 0;
	3430	zero_cnt1 = 0;
	3431	zero_off = 0;
	3432	zero_off1 = 0;
	3433
	3434	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
	3435
	3436	if (flags & IO_HEADZEROFILL) {
	3437	/*
	3438	* some filesystems (HFS is one) don't support unallocated holes within a file...
	3439	* so we zero fill the intervening space between the old EOF and the offset
	3440	* where the next chunk of real data begins.... ftruncate will also use this
	3441	* routine to zero fill to the new EOF when growing a file... in this case, the
	3442	* uio structure will not be provided
	3443	*/
	3444	if (uio) {
	3445	if (headOff < uio->uio_offset) {
	3446	zero_cnt = uio->uio_offset - headOff;
	3447	zero_off = headOff;
	3448	}
	3449	} else if (headOff < newEOF) {
	3450	zero_cnt = newEOF - headOff;
	3451	zero_off = headOff;
	3452	}
	3453	} else {
	3454	if (uio && uio->uio_offset > oldEOF) {
	3455	zero_off = uio->uio_offset & ~PAGE_MASK_64;
	3456
	3457	if (zero_off >= oldEOF) {
	3458	zero_cnt = uio->uio_offset - zero_off;
	3459
	3460	flags \|= IO_HEADZEROFILL;
	3461	}
	3462	}
	3463	}
	3464	if (flags & IO_TAILZEROFILL) {
	3465	if (uio) {
	3466	zero_off1 = uio->uio_offset + io_req_size;
	3467
	3468	if (zero_off1 < tailOff) {
	3469	zero_cnt1 = tailOff - zero_off1;
	3470	}
	3471	}
	3472	} else {
	3473	if (uio && newEOF > oldEOF) {
	3474	zero_off1 = uio->uio_offset + io_req_size;
	3475
	3476	if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
	3477	zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
	3478
	3479	flags \|= IO_TAILZEROFILL;
	3480	}
	3481	}
	3482	}
	3483	if (zero_cnt == 0 && uio == (struct uio *) 0) {
	3484	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END,
	3485	retval, 0, 0, 0, 0);
	3486	return 0;
	3487	}
	3488	if (uio) {
	3489	write_off = uio->uio_offset;
	3490	write_cnt = (int)uio_resid(uio);
	3491	/*
	3492	* delay updating the sequential write info
	3493	* in the control block until we've obtained
	3494	* the lock for it
	3495	*/
	3496	first_pass = TRUE;
	3497	}
	3498	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
	3499	/*
	3500	* for this iteration of the loop, figure out where our starting point is
	3501	*/
	3502	if (zero_cnt) {
	3503	start_offset = (int)(zero_off & PAGE_MASK_64);
	3504	upl_f_offset = zero_off - start_offset;
	3505	} else if (io_resid) {
	3506	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3507	upl_f_offset = uio->uio_offset - start_offset;
	3508	} else {
	3509	start_offset = (int)(zero_off1 & PAGE_MASK_64);
	3510	upl_f_offset = zero_off1 - start_offset;
	3511	}
	3512	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) \| DBG_FUNC_NONE,
	3513	(int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
	3514
	3515	if (total_size > max_io_size) {
	3516	total_size = max_io_size;
	3517	}
	3518
	3519	cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
	3520
	3521	if (uio && ((flags & (IO_SYNC \| IO_HEADZEROFILL \| IO_TAILZEROFILL)) == 0)) {
	3522	/*
	3523	* assumption... total_size <= io_resid
	3524	* because IO_HEADZEROFILL and IO_TAILZEROFILL not set
	3525	*/
	3526	if ((start_offset + total_size) > max_io_size) {
	3527	total_size = max_io_size - start_offset;
	3528	}
	3529	xfer_resid = (int)total_size;
	3530
	3531	retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
	3532
	3533	if (retval) {
	3534	break;
	3535	}
	3536
	3537	io_resid -= (total_size - xfer_resid);
	3538	total_size = xfer_resid;
	3539	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3540	upl_f_offset = uio->uio_offset - start_offset;
	3541
	3542	if (total_size == 0) {
	3543	if (start_offset) {
	3544	/*
	3545	* the write did not finish on a page boundary
	3546	* which will leave upl_f_offset pointing to the
	3547	* beginning of the last page written instead of
	3548	* the page beyond it... bump it in this case
	3549	* so that the cluster code records the last page
	3550	* written as dirty
	3551	*/
	3552	upl_f_offset += PAGE_SIZE_64;
	3553	}
	3554	upl_size = 0;
	3555
	3556	goto check_cluster;
	3557	}
	3558	}
	3559	/*
	3560	* compute the size of the upl needed to encompass
	3561	* the requested write... limit each call to cluster_io
	3562	* to the maximum UPL size... cluster_io will clip if
	3563	* this exceeds the maximum io_size for the device,
	3564	* make sure to account for
	3565	* a starting offset that's not page aligned
	3566	*/
	3567	upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	3568
	3569	if (upl_size > max_io_size) {
	3570	upl_size = max_io_size;
	3571	}
	3572
	3573	pages_in_upl = (int)(upl_size / PAGE_SIZE);
	3574	io_size = (int)(upl_size - start_offset);
	3575
	3576	if ((long long)io_size > total_size) {
	3577	io_size = (int)total_size;
	3578	}
	3579
	3580	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
	3581
	3582
	3583	/*
	3584	* Gather the pages from the buffer cache.
	3585	* The UPL_WILL_MODIFY flag lets the UPL subsystem know
	3586	* that we intend to modify these pages.
	3587	*/
	3588	kret = ubc_create_upl_kernel(vp,
	3589	upl_f_offset,
	3590	(int)upl_size,
	3591	&upl,
	3592	&pl,
	3593	UPL_SET_LITE \| ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
	3594	VM_KERN_MEMORY_FILE);
	3595	if (kret != KERN_SUCCESS) {
	3596	panic("cluster_write_copy: failed to get pagelist");
	3597	}
	3598
	3599	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END,
	3600	upl, (int)upl_f_offset, start_offset, 0, 0);
	3601
	3602	if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
	3603	int read_size;
	3604
	3605	/*
	3606	* we're starting in the middle of the first page of the upl
	3607	* and the page isn't currently valid, so we're going to have
	3608	* to read it in first... this is a synchronous operation
	3609	*/
	3610	read_size = PAGE_SIZE;
	3611
	3612	if ((upl_f_offset + read_size) > oldEOF) {
	3613	read_size = (int)(oldEOF - upl_f_offset);
	3614	}
	3615
	3616	retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
	3617	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	3618	if (retval) {
	3619	/*
	3620	* we had an error during the read which causes us to abort
	3621	* the current cluster_write request... before we do, we need
	3622	* to release the rest of the pages in the upl without modifying
	3623	* their state and mark the failed page in error
	3624	*/
	3625	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	3626
	3627	if (upl_size > PAGE_SIZE) {
	3628	ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size,
	3629	UPL_ABORT_FREE_ON_EMPTY);
	3630	}
	3631
	3632	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3633	upl, 0, 0, retval, 0);
	3634	break;
	3635	}
	3636	}
	3637	if ((start_offset == 0 \|\| upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
	3638	/*
	3639	* the last offset we're writing to in this upl does not end on a page
	3640	* boundary... if it's not beyond the old EOF, then we'll also need to
	3641	* pre-read this page in if it isn't already valid
	3642	*/
	3643	upl_offset = upl_size - PAGE_SIZE;
	3644
	3645	if ((upl_f_offset + start_offset + io_size) < oldEOF &&
	3646	!upl_valid_page(pl, (int)(upl_offset / PAGE_SIZE))) {
	3647	int read_size;
	3648
	3649	read_size = PAGE_SIZE;
	3650
	3651	if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
	3652	read_size = (int)(oldEOF - (upl_f_offset + upl_offset));
	3653	}
	3654
	3655	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
	3656	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	3657	if (retval) {
	3658	/*
	3659	* we had an error during the read which causes us to abort
	3660	* the current cluster_write request... before we do, we
	3661	* need to release the rest of the pages in the upl without
	3662	* modifying their state and mark the failed page in error
	3663	*/
	3664	ubc_upl_abort_range(upl, (upl_offset_t)upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	3665
	3666	if (upl_size > PAGE_SIZE) {
	3667	ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3668	}
	3669
	3670	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3671	upl, 0, 0, retval, 0);
	3672	break;
	3673	}
	3674	}
	3675	}
	3676	xfer_resid = io_size;
	3677	io_offset = start_offset;
	3678
	3679	while (zero_cnt && xfer_resid) {
	3680	if (zero_cnt < (long long)xfer_resid) {
	3681	bytes_to_zero = (int)zero_cnt;
	3682	} else {
	3683	bytes_to_zero = xfer_resid;
	3684	}
	3685
	3686	bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
	3687
	3688	xfer_resid -= bytes_to_zero;
	3689	zero_cnt -= bytes_to_zero;
	3690	zero_off += bytes_to_zero;
	3691	io_offset += bytes_to_zero;
	3692	}
	3693	if (xfer_resid && io_resid) {
	3694	u_int32_t io_requested;
	3695
	3696	bytes_to_move = min(io_resid, xfer_resid);
	3697	io_requested = bytes_to_move;
	3698
	3699	retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
	3700
	3701	if (retval) {
	3702	ubc_upl_abort_range(upl, 0, (upl_size_t)upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3703
	3704	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3705	upl, 0, 0, retval, 0);
	3706	} else {
	3707	io_resid -= bytes_to_move;
	3708	xfer_resid -= bytes_to_move;
	3709	io_offset += bytes_to_move;
	3710	}
	3711	}
	3712	while (xfer_resid && zero_cnt1 && retval == 0) {
	3713	if (zero_cnt1 < (long long)xfer_resid) {
	3714	bytes_to_zero = (int)zero_cnt1;
	3715	} else {
	3716	bytes_to_zero = xfer_resid;
	3717	}
	3718
	3719	bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
	3720
	3721	xfer_resid -= bytes_to_zero;
	3722	zero_cnt1 -= bytes_to_zero;
	3723	zero_off1 += bytes_to_zero;
	3724	io_offset += bytes_to_zero;
	3725	}
	3726	if (retval == 0) {
	3727	int do_zeroing = 1;
	3728
	3729	io_size += start_offset;
	3730
	3731	/* Force more restrictive zeroing behavior only on APFS */
	3732	if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
	3733	do_zeroing = 0;
	3734	}
	3735
	3736	if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
	3737	/*
	3738	* if we're extending the file with this write
	3739	* we'll zero fill the rest of the page so that
	3740	* if the file gets extended again in such a way as to leave a
	3741	* hole starting at this EOF, we'll have zeros in the correct spot
	3742	*/
	3743	cluster_zero(upl, io_size, (int)(upl_size - io_size), NULL);
	3744	}
	3745	/*
	3746	* release the upl now if we hold one since...
	3747	* 1) pages in it may be present in the sparse cluster map
	3748	* and may span 2 separate buckets there... if they do and
	3749	* we happen to have to flush a bucket to make room and it intersects
	3750	* this upl, a deadlock may result on page BUSY
	3751	* 2) we're delaying the I/O... from this point forward we're just updating
	3752	* the cluster state... no need to hold the pages, so commit them
	3753	* 3) IO_SYNC is set...
	3754	* because we had to ask for a UPL that provides currently non-present pages, the
	3755	* UPL has been automatically set to clear the dirty flags (both software and hardware)
	3756	* upon committing it... this is not the behavior we want since it's possible for
	3757	* pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
	3758	* we'll pick these pages back up later with the correct behavior specified.
	3759	* 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
	3760	* of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
	3761	* we hold since the flushing context is holding the cluster lock.
	3762	*/
	3763	ubc_upl_commit_range(upl, 0, (upl_size_t)upl_size,
	3764	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	3765	check_cluster:
	3766	/*
	3767	* calculate the last logical block number
	3768	* that this delayed I/O encompassed
	3769	*/
	3770	cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
	3771
	3772	if (flags & IO_SYNC) {
	3773	/*
	3774	* if the IO_SYNC flag is set then we need to bypass
	3775	* any clustering and immediately issue the I/O
	3776	*
	3777	* we don't hold the lock at this point
	3778	*
	3779	* we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
	3780	* so that we correctly deal with a change in state of the hardware modify bit...
	3781	* we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
	3782	* cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
	3783	* responsible for generating the correct sized I/O(s)
	3784	*/
	3785	retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
	3786	} else {
	3787	boolean_t defer_writes = FALSE;
	3788
	3789	if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
	3790	defer_writes = TRUE;
	3791	}
	3792
	3793	cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
	3794	write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
	3795	}
	3796	}
	3797	}
	3798	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END, retval, 0, io_resid, 0, 0);
	3799
	3800	return retval;
	3801	}
	3802
	3803
	3804
	3805	int
	3806	cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
	3807	{
	3808	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
	3809	}
	3810
	3811
	3812	int
	3813	cluster_read_ext(vnode_t vp, struct uio uio, off_t filesize, int xflags, int (callback)(buf_t, void ), void callback_arg)
	3814	{
	3815	int retval = 0;
	3816	int flags;
	3817	user_ssize_t cur_resid;
	3818	u_int32_t io_size;
	3819	u_int32_t read_length = 0;
	3820	int read_type = IO_COPY;
	3821
	3822	flags = xflags;
	3823
	3824	if (vp->v_flag & VNOCACHE_DATA) {
	3825	flags \|= IO_NOCACHE;
	3826	}
	3827	if ((vp->v_flag & VRAOFF) \|\| speculative_reads_disabled) {
	3828	flags \|= IO_RAOFF;
	3829	}
	3830
	3831	if (flags & IO_SKIP_ENCRYPTION) {
	3832	flags \|= IO_ENCRYPTED;
	3833	}
	3834
	3835	/*
	3836	* do a read through the cache if one of the following is true....
	3837	* NOCACHE is not true
	3838	* the uio request doesn't target USERSPACE
	3839	* Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	3840	* Reading encrypted data from a CP filesystem should never result in the data touching
	3841	* the UBC.
	3842	*
	3843	* otherwise, find out if we want the direct or contig variant for
	3844	* the first vector in the uio request
	3845	*/
	3846	if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) \|\| (flags & IO_ENCRYPTED)) {
	3847	retval = cluster_io_type(uio, &read_type, &read_length, 0);
	3848	}
	3849
	3850	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
	3851	switch (read_type) {
	3852	case IO_COPY:
	3853	/*
	3854	* make sure the uio_resid isn't too big...
	3855	* internally, we want to handle all of the I/O in
	3856	* chunk sizes that fit in a 32 bit int
	3857	*/
	3858	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
	3859	io_size = MAX_IO_REQUEST_SIZE;
	3860	} else {
	3861	io_size = (u_int32_t)cur_resid;
	3862	}
	3863
	3864	retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
	3865	break;
	3866
	3867	case IO_DIRECT:
	3868	retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
	3869	break;
	3870
	3871	case IO_CONTIG:
	3872	retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
	3873	break;
	3874
	3875	case IO_UNKNOWN:
	3876	retval = cluster_io_type(uio, &read_type, &read_length, 0);
	3877	break;
	3878	}
	3879	}
	3880	return retval;
	3881	}
	3882
	3883
	3884
	3885	static void
	3886	cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
	3887	{
	3888	int range;
	3889	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	3890
	3891	if ((range = last_pg - start_pg)) {
	3892	if (take_reference) {
	3893	abort_flags \|= UPL_ABORT_REFERENCE;
	3894	}
	3895
	3896	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
	3897	}
	3898	}
	3899
	3900
	3901	static int
	3902	cluster_read_copy(vnode_t vp, struct uio uio, u_int32_t io_req_size, off_t filesize, int flags, int (callback)(buf_t, void ), void callback_arg)
	3903	{
	3904	upl_page_info_t *pl;
	3905	upl_t upl;
	3906	vm_offset_t upl_offset;
	3907	u_int32_t upl_size;
	3908	off_t upl_f_offset;
	3909	int start_offset;
	3910	int start_pg;
	3911	int last_pg;
	3912	int uio_last = 0;
	3913	int pages_in_upl;
	3914	off_t max_size;
	3915	off_t last_ioread_offset;
	3916	off_t last_request_offset;
	3917	kern_return_t kret;
	3918	int error = 0;
	3919	int retval = 0;
	3920	u_int32_t size_of_prefetch;
	3921	u_int32_t xsize;
	3922	u_int32_t io_size;
	3923	u_int32_t max_rd_size;
	3924	u_int32_t max_io_size;
	3925	u_int32_t max_prefetch;
	3926	u_int rd_ahead_enabled = 1;
	3927	u_int prefetch_enabled = 1;
	3928	struct cl_readahead * rap;
	3929	struct clios iostate;
	3930	struct cl_extent extent;
	3931	int bflag;
	3932	int take_reference = 1;
	3933	int policy = IOPOL_DEFAULT;
	3934	boolean_t iolock_inited = FALSE;
	3935
	3936	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_START,
	3937	(int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
	3938
	3939	if (flags & IO_ENCRYPTED) {
	3940	panic("encrypted blocks will hit UBC!");
	3941	}
	3942
	3943	policy = throttle_get_io_policy(NULL);
	3944
	3945	if (policy == THROTTLE_LEVEL_TIER3 \|\| policy == THROTTLE_LEVEL_TIER2 \|\| (flags & IO_NOCACHE)) {
	3946	take_reference = 0;
	3947	}
	3948
	3949	if (flags & IO_PASSIVE) {
	3950	bflag = CL_PASSIVE;
	3951	} else {
	3952	bflag = 0;
	3953	}
	3954
	3955	if (flags & IO_NOCACHE) {
	3956	bflag \|= CL_NOCACHE;
	3957	}
	3958
	3959	if (flags & IO_SKIP_ENCRYPTION) {
	3960	bflag \|= CL_ENCRYPTED;
	3961	}
	3962
	3963	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	3964	max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
	3965	max_rd_size = max_prefetch;
	3966
	3967	last_request_offset = uio->uio_offset + io_req_size;
	3968
	3969	if (last_request_offset > filesize) {
	3970	last_request_offset = filesize;
	3971	}
	3972
	3973	if ((flags & (IO_RAOFF \| IO_NOCACHE)) \|\| ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
	3974	rd_ahead_enabled = 0;
	3975	rap = NULL;
	3976	} else {
	3977	if (cluster_is_throttled(vp)) {
	3978	/*
	3979	* we're in the throttle window, at the very least
	3980	* we want to limit the size of the I/O we're about
	3981	* to issue
	3982	*/
	3983	rd_ahead_enabled = 0;
	3984	prefetch_enabled = 0;
	3985
	3986	max_rd_size = THROTTLE_MAX_IOSIZE;
	3987	}
	3988	if ((rap = cluster_get_rap(vp)) == NULL) {
	3989	rd_ahead_enabled = 0;
	3990	} else {
	3991	extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
	3992	extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
	3993	}
	3994	}
	3995	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr \|\| (rap->cl_lastr + 1) == extent.b_addr)) {
	3996	/*
	3997	* determine if we already have a read-ahead in the pipe courtesy of the
	3998	* last read systemcall that was issued...
	3999	* if so, pick up it's extent to determine where we should start
	4000	* with respect to any read-ahead that might be necessary to
	4001	* garner all the data needed to complete this read systemcall
	4002	*/
	4003	last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
	4004
	4005	if (last_ioread_offset < uio->uio_offset) {
	4006	last_ioread_offset = (off_t)0;
	4007	} else if (last_ioread_offset > last_request_offset) {
	4008	last_ioread_offset = last_request_offset;
	4009	}
	4010	} else {
	4011	last_ioread_offset = (off_t)0;
	4012	}
	4013
	4014	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
	4015	max_size = filesize - uio->uio_offset;
	4016	bool leftover_upl_aborted = false;
	4017
	4018	if ((off_t)(io_req_size) < max_size) {
	4019	io_size = io_req_size;
	4020	} else {
	4021	io_size = (u_int32_t)max_size;
	4022	}
	4023
	4024	if (!(flags & IO_NOCACHE)) {
	4025	while (io_size) {
	4026	u_int32_t io_resid;
	4027	u_int32_t io_requested;
	4028
	4029	/*
	4030	* if we keep finding the pages we need already in the cache, then
	4031	* don't bother to call cluster_read_prefetch since it costs CPU cycles
	4032	* to determine that we have all the pages we need... once we miss in
	4033	* the cache and have issued an I/O, than we'll assume that we're likely
	4034	* to continue to miss in the cache and it's to our advantage to try and prefetch
	4035	*/
	4036	if (last_request_offset && last_ioread_offset && (size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset))) {
	4037	if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
	4038	/*
	4039	* we've already issued I/O for this request and
	4040	* there's still work to do and
	4041	* our prefetch stream is running dry, so issue a
	4042	* pre-fetch I/O... the I/O latency will overlap
	4043	* with the copying of the data
	4044	*/
	4045	if (size_of_prefetch > max_rd_size) {
	4046	size_of_prefetch = max_rd_size;
	4047	}
	4048
	4049	size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
	4050
	4051	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	4052
	4053	if (last_ioread_offset > last_request_offset) {
	4054	last_ioread_offset = last_request_offset;
	4055	}
	4056	}
	4057	}
	4058	/*
	4059	* limit the size of the copy we're about to do so that
	4060	* we can notice that our I/O pipe is running dry and
	4061	* get the next I/O issued before it does go dry
	4062	*/
	4063	if (last_ioread_offset && io_size > (max_io_size / 4)) {
	4064	io_resid = (max_io_size / 4);
	4065	} else {
	4066	io_resid = io_size;
	4067	}
	4068
	4069	io_requested = io_resid;
	4070
	4071	retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
	4072
	4073	xsize = io_requested - io_resid;
	4074
	4075	io_size -= xsize;
	4076	io_req_size -= xsize;
	4077
	4078	if (retval \|\| io_resid) {
	4079	/*
	4080	* if we run into a real error or
	4081	* a page that is not in the cache
	4082	* we need to leave streaming mode
	4083	*/
	4084	break;
	4085	}
	4086
	4087	if (rd_ahead_enabled && (io_size == 0 \|\| last_ioread_offset == last_request_offset)) {
	4088	/*
	4089	* we're already finished the I/O for this read request
	4090	* let's see if we should do a read-ahead
	4091	*/
	4092	cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
	4093	}
	4094	}
	4095	if (retval) {
	4096	break;
	4097	}
	4098	if (io_size == 0) {
	4099	if (rap != NULL) {
	4100	if (extent.e_addr < rap->cl_lastr) {
	4101	rap->cl_maxra = 0;
	4102	}
	4103	rap->cl_lastr = extent.e_addr;
	4104	}
	4105	break;
	4106	}
	4107	/*
	4108	* recompute max_size since cluster_copy_ubc_data_internal
	4109	* may have advanced uio->uio_offset
	4110	*/
	4111	max_size = filesize - uio->uio_offset;
	4112	}
	4113
	4114	iostate.io_completed = 0;
	4115	iostate.io_issued = 0;
	4116	iostate.io_error = 0;
	4117	iostate.io_wanted = 0;
	4118
	4119	if ((flags & IO_RETURN_ON_THROTTLE)) {
	4120	if (cluster_is_throttled(vp) == THROTTLE_NOW) {
	4121	if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
	4122	/*
	4123	* we're in the throttle window and at least 1 I/O
	4124	* has already been issued by a throttleable thread
	4125	* in this window, so return with EAGAIN to indicate
	4126	* to the FS issuing the cluster_read call that it
	4127	* should now throttle after dropping any locks
	4128	*/
	4129	throttle_info_update_by_mount(vp->v_mount);
	4130
	4131	retval = EAGAIN;
	4132	break;
	4133	}
	4134	}
	4135	}
	4136
	4137	/*
	4138	* compute the size of the upl needed to encompass
	4139	* the requested read... limit each call to cluster_io
	4140	* to the maximum UPL size... cluster_io will clip if
	4141	* this exceeds the maximum io_size for the device,
	4142	* make sure to account for
	4143	* a starting offset that's not page aligned
	4144	*/
	4145	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	4146	upl_f_offset = uio->uio_offset - (off_t)start_offset;
	4147
	4148	if (io_size > max_rd_size) {
	4149	io_size = max_rd_size;
	4150	}
	4151
	4152	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	4153
	4154	if (flags & IO_NOCACHE) {
	4155	if (upl_size > max_io_size) {
	4156	upl_size = max_io_size;
	4157	}
	4158	} else {
	4159	if (upl_size > max_io_size / 4) {
	4160	upl_size = max_io_size / 4;
	4161	upl_size &= ~PAGE_MASK;
	4162
	4163	if (upl_size == 0) {
	4164	upl_size = PAGE_SIZE;
	4165	}
	4166	}
	4167	}
	4168	pages_in_upl = upl_size / PAGE_SIZE;
	4169
	4170	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_START,
	4171	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	4172
	4173	kret = ubc_create_upl_kernel(vp,
	4174	upl_f_offset,
	4175	upl_size,
	4176	&upl,
	4177	&pl,
	4178	UPL_FILE_IO \| UPL_SET_LITE,
	4179	VM_KERN_MEMORY_FILE);
	4180	if (kret != KERN_SUCCESS) {
	4181	panic("cluster_read_copy: failed to get pagelist");
	4182	}
	4183
	4184	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_END,
	4185	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	4186
	4187	/*
	4188	* scan from the beginning of the upl looking for the first
	4189	* non-valid page.... this will become the first page in
	4190	* the request we're going to make to 'cluster_io'... if all
	4191	* of the pages are valid, we won't call through to 'cluster_io'
	4192	*/
	4193	for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
	4194	if (!upl_valid_page(pl, start_pg)) {
	4195	break;
	4196	}
	4197	}
	4198
	4199	/*
	4200	* scan from the starting invalid page looking for a valid
	4201	* page before the end of the upl is reached, if we
	4202	* find one, then it will be the last page of the request to
	4203	* 'cluster_io'
	4204	*/
	4205	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	4206	if (upl_valid_page(pl, last_pg)) {
	4207	break;
	4208	}
	4209	}
	4210
	4211	if (start_pg < last_pg) {
	4212	/*
	4213	* we found a range of 'invalid' pages that must be filled
	4214	* if the last page in this range is the last page of the file
	4215	* we may have to clip the size of it to keep from reading past
	4216	* the end of the last physical block associated with the file
	4217	*/
	4218	if (iolock_inited == FALSE) {
	4219	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
	4220
	4221	iolock_inited = TRUE;
	4222	}
	4223	upl_offset = start_pg * PAGE_SIZE;
	4224	io_size = (last_pg - start_pg) * PAGE_SIZE;
	4225
	4226	if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
	4227	io_size = (u_int32_t)(filesize - (upl_f_offset + upl_offset));
	4228	}
	4229
	4230	/*
	4231	* Find out if this needs verification, we'll have to manage the UPL
	4232	* diffrently if so. Note that this call only lets us know if
	4233	* verification is enabled on this mount point, the actual verification
	4234	* is performed in the File system.
	4235	*/
	4236	size_t verify_block_size = 0;
	4237	if ((VNOP_VERIFY(vp, start_offset, NULL, 0, &verify_block_size, VNODE_VERIFY_DEFAULT, NULL) == 0) /* && verify_block_size */) {
	4238	for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
	4239	if (!upl_valid_page(pl, uio_last)) {
	4240	break;
	4241	}
	4242	}
	4243	if (uio_last < pages_in_upl) {
	4244	/*
	4245	* there were some invalid pages beyond the valid pages
	4246	* that we didn't issue an I/O for, just release them
	4247	* unchanged now, so that any prefetch/readahed can
	4248	* include them
	4249	*/
	4250	ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
	4251	(pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	4252	leftover_upl_aborted = true;
	4253	}
	4254	}
	4255
	4256	/*
	4257	* issue an asynchronous read to cluster_io
	4258	*/
	4259
	4260	error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
	4261	io_size, CL_READ \| CL_ASYNC \| bflag, (buf_t)NULL, &iostate, callback, callback_arg);
	4262
	4263	if (rap) {
	4264	if (extent.e_addr < rap->cl_maxra) {
	4265	/*
	4266	* we've just issued a read for a block that should have been
	4267	* in the cache courtesy of the read-ahead engine... something
	4268	* has gone wrong with the pipeline, so reset the read-ahead
	4269	* logic which will cause us to restart from scratch
	4270	*/
	4271	rap->cl_maxra = 0;
	4272	}
	4273	}
	4274	}
	4275	if (error == 0) {
	4276	/*
	4277	* if the read completed successfully, or there was no I/O request
	4278	* issued, than copy the data into user land via 'cluster_upl_copy_data'
	4279	* we'll first add on any 'valid'
	4280	* pages that were present in the upl when we acquired it.
	4281	*/
	4282	u_int val_size;
	4283
	4284	if (!leftover_upl_aborted) {
	4285	for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
	4286	if (!upl_valid_page(pl, uio_last)) {
	4287	break;
	4288	}
	4289	}
	4290	if (uio_last < pages_in_upl) {
	4291	/*
	4292	* there were some invalid pages beyond the valid pages
	4293	* that we didn't issue an I/O for, just release them
	4294	* unchanged now, so that any prefetch/readahed can
	4295	* include them
	4296	*/
	4297	ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
	4298	(pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	4299	}
	4300	}
	4301
	4302	/*
	4303	* compute size to transfer this round, if io_req_size is
	4304	* still non-zero after this attempt, we'll loop around and
	4305	* set up for another I/O.
	4306	*/
	4307	val_size = (uio_last * PAGE_SIZE) - start_offset;
	4308
	4309	if (val_size > max_size) {
	4310	val_size = (u_int)max_size;
	4311	}
	4312
	4313	if (val_size > io_req_size) {
	4314	val_size = io_req_size;
	4315	}
	4316
	4317	if ((uio->uio_offset + val_size) > last_ioread_offset) {
	4318	last_ioread_offset = uio->uio_offset + val_size;
	4319	}
	4320
	4321	if ((size_of_prefetch = (u_int32_t)(last_request_offset - last_ioread_offset)) && prefetch_enabled) {
	4322	if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
	4323	/*
	4324	* if there's still I/O left to do for this request, and...
	4325	* we're not in hard throttle mode, and...
	4326	* we're close to using up the previous prefetch, then issue a
	4327	* new pre-fetch I/O... the I/O latency will overlap
	4328	* with the copying of the data
	4329	*/
	4330	if (size_of_prefetch > max_rd_size) {
	4331	size_of_prefetch = max_rd_size;
	4332	}
	4333
	4334	size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
	4335
	4336	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	4337
	4338	if (last_ioread_offset > last_request_offset) {
	4339	last_ioread_offset = last_request_offset;
	4340	}
	4341	}
	4342	} else if ((uio->uio_offset + val_size) == last_request_offset) {
	4343	/*
	4344	* this transfer will finish this request, so...
	4345	* let's try to read ahead if we're in
	4346	* a sequential access pattern and we haven't
	4347	* explicitly disabled it
	4348	*/
	4349	if (rd_ahead_enabled) {
	4350	cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
	4351	}
	4352
	4353	if (rap != NULL) {
	4354	if (extent.e_addr < rap->cl_lastr) {
	4355	rap->cl_maxra = 0;
	4356	}
	4357	rap->cl_lastr = extent.e_addr;
	4358	}
	4359	}
	4360	if (iolock_inited == TRUE) {
	4361	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4362	}
	4363
	4364	if (iostate.io_error) {
	4365	error = iostate.io_error;
	4366	} else {
	4367	u_int32_t io_requested;
	4368
	4369	io_requested = val_size;
	4370
	4371	retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
	4372
	4373	io_req_size -= (val_size - io_requested);
	4374	}
	4375	} else {
	4376	if (iolock_inited == TRUE) {
	4377	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4378	}
	4379	}
	4380	if (start_pg < last_pg) {
	4381	/*
	4382	* compute the range of pages that we actually issued an I/O for
	4383	* and either commit them as valid if the I/O succeeded
	4384	* or abort them if the I/O failed or we're not supposed to
	4385	* keep them in the cache
	4386	*/
	4387	io_size = (last_pg - start_pg) * PAGE_SIZE;
	4388
	4389	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
	4390
	4391	if (error \|\| (flags & IO_NOCACHE)) {
	4392	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
	4393	UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	4394	} else {
	4395	int commit_flags = UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY;
	4396
	4397	if (take_reference) {
	4398	commit_flags \|= UPL_COMMIT_INACTIVATE;
	4399	} else {
	4400	commit_flags \|= UPL_COMMIT_SPECULATE;
	4401	}
	4402
	4403	ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
	4404	}
	4405	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
	4406	}
	4407	if ((last_pg - start_pg) < pages_in_upl) {
	4408	/*
	4409	* the set of pages that we issued an I/O for did not encompass
	4410	* the entire upl... so just release these without modifying
	4411	* their state
	4412	*/
	4413	if (error) {
	4414	if (leftover_upl_aborted) {
	4415	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, (uio_last - start_pg) * PAGE_SIZE,
	4416	UPL_ABORT_FREE_ON_EMPTY);
	4417	} else {
	4418	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	4419	}
	4420	} else {
	4421	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START,
	4422	upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
	4423
	4424	/*
	4425	* handle any valid pages at the beginning of
	4426	* the upl... release these appropriately
	4427	*/
	4428	cluster_read_upl_release(upl, 0, start_pg, take_reference);
	4429
	4430	/*
	4431	* handle any valid pages immediately after the
	4432	* pages we issued I/O for... ... release these appropriately
	4433	*/
	4434	cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
	4435
	4436	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END, upl, -1, -1, 0, 0);
	4437	}
	4438	}
	4439	if (retval == 0) {
	4440	retval = error;
	4441	}
	4442
	4443	if (io_req_size) {
	4444	if (cluster_is_throttled(vp)) {
	4445	/*
	4446	* we're in the throttle window, at the very least
	4447	* we want to limit the size of the I/O we're about
	4448	* to issue
	4449	*/
	4450	rd_ahead_enabled = 0;
	4451	prefetch_enabled = 0;
	4452	max_rd_size = THROTTLE_MAX_IOSIZE;
	4453	} else {
	4454	if (max_rd_size == THROTTLE_MAX_IOSIZE) {
	4455	/*
	4456	* coming out of throttled state
	4457	*/
	4458	if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
	4459	if (rap != NULL) {
	4460	rd_ahead_enabled = 1;
	4461	}
	4462	prefetch_enabled = 1;
	4463	}
	4464	max_rd_size = max_prefetch;
	4465	last_ioread_offset = 0;
	4466	}
	4467	}
	4468	}
	4469	}
	4470	if (iolock_inited == TRUE) {
	4471	/*
	4472	* cluster_io returned an error after it
	4473	* had already issued some I/O. we need
	4474	* to wait for that I/O to complete before
	4475	* we can destroy the iostate mutex...
	4476	* 'retval' already contains the early error
	4477	* so no need to pick it up from iostate.io_error
	4478	*/
	4479	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4480
	4481	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	4482	}
	4483	if (rap != NULL) {
	4484	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	4485	(int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
	4486
	4487	lck_mtx_unlock(&rap->cl_lockr);
	4488	} else {
	4489	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	4490	(int)uio->uio_offset, io_req_size, 0, retval, 0);
	4491	}
	4492
	4493	return retval;
	4494	}
	4495
	4496	/*
	4497	* We don't want another read/write lock for every vnode in the system
	4498	* so we keep a hash of them here. There should never be very many of
	4499	* these around at any point in time.
	4500	*/
	4501	cl_direct_read_lock_t *
	4502	cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
	4503	{
	4504	struct cl_direct_read_locks *head
	4505	= &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
	4506	% CL_DIRECT_READ_LOCK_BUCKETS];
	4507
	4508	struct cl_direct_read_lock lck, new_lck = NULL;
	4509
	4510	for (;;) {
	4511	lck_spin_lock(&cl_direct_read_spin_lock);
	4512
	4513	LIST_FOREACH(lck, head, chain) {
	4514	if (lck->vp == vp) {
	4515	++lck->ref_count;
	4516	lck_spin_unlock(&cl_direct_read_spin_lock);
	4517	if (new_lck) {
	4518	// Someone beat us to it, ditch the allocation
	4519	lck_rw_destroy(&new_lck->rw_lock, &cl_mtx_grp);
	4520	kheap_free(KHEAP_DEFAULT, new_lck, sizeof(cl_direct_read_lock_t));
	4521	}
	4522	lck_rw_lock(&lck->rw_lock, type);
	4523	return lck;
	4524	}
	4525	}
	4526
	4527	if (new_lck) {
	4528	// Use the lock we allocated
	4529	LIST_INSERT_HEAD(head, new_lck, chain);
	4530	lck_spin_unlock(&cl_direct_read_spin_lock);
	4531	lck_rw_lock(&new_lck->rw_lock, type);
	4532	return new_lck;
	4533	}
	4534
	4535	lck_spin_unlock(&cl_direct_read_spin_lock);
	4536
	4537	// Allocate a new lock
	4538	new_lck = kheap_alloc(KHEAP_DEFAULT, sizeof(cl_direct_read_lock_t),
	4539	Z_WAITOK);
	4540	lck_rw_init(&new_lck->rw_lock, &cl_mtx_grp, LCK_ATTR_NULL);
	4541	new_lck->vp = vp;
	4542	new_lck->ref_count = 1;
	4543
	4544	// Got to go round again
	4545	}
	4546	}
	4547
	4548	void
	4549	cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
	4550	{
	4551	lck_rw_done(&lck->rw_lock);
	4552
	4553	lck_spin_lock(&cl_direct_read_spin_lock);
	4554	if (lck->ref_count == 1) {
	4555	LIST_REMOVE(lck, chain);
	4556	lck_spin_unlock(&cl_direct_read_spin_lock);
	4557	lck_rw_destroy(&lck->rw_lock, &cl_mtx_grp);
	4558	kheap_free(KHEAP_DEFAULT, lck, sizeof(cl_direct_read_lock_t));
	4559	} else {
	4560	--lck->ref_count;
	4561	lck_spin_unlock(&cl_direct_read_spin_lock);
	4562	}
	4563	}
	4564
	4565	static int
	4566	cluster_read_direct(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	4567	int flags, int (callback)(buf_t, void ), void *callback_arg)
	4568	{
	4569	upl_t upl;
	4570	upl_page_info_t *pl;
	4571	off_t max_io_size;
	4572	vm_offset_t upl_offset, vector_upl_offset = 0;
	4573	upl_size_t upl_size, vector_upl_size = 0;
	4574	vm_size_t upl_needed_size;
	4575	unsigned int pages_in_pl;
	4576	upl_control_flags_t upl_flags;
	4577	kern_return_t kret;
	4578	unsigned int i;
	4579	int force_data_sync;
	4580	int retval = 0;
	4581	int no_zero_fill = 0;
	4582	int io_flag = 0;
	4583	int misaligned = 0;
	4584	struct clios iostate;
	4585	user_addr_t iov_base;
	4586	u_int32_t io_req_size;
	4587	u_int32_t offset_in_file;
	4588	u_int32_t offset_in_iovbase;
	4589	u_int32_t io_size;
	4590	u_int32_t io_min;
	4591	u_int32_t xsize;
	4592	u_int32_t devblocksize;
	4593	u_int32_t mem_alignment_mask;
	4594	u_int32_t max_upl_size;
	4595	u_int32_t max_rd_size;
	4596	u_int32_t max_rd_ahead;
	4597	u_int32_t max_vector_size;
	4598	boolean_t io_throttled = FALSE;
	4599
	4600	u_int32_t vector_upl_iosize = 0;
	4601	int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
	4602	off_t v_upl_uio_offset = 0;
	4603	int vector_upl_index = 0;
	4604	upl_t vector_upl = NULL;
	4605	cl_direct_read_lock_t *lock = NULL;
	4606
	4607	user_addr_t orig_iov_base = 0;
	4608	user_addr_t last_iov_base = 0;
	4609	user_addr_t next_iov_base = 0;
	4610
	4611	assert(vm_map_page_shift(current_map()) >= PAGE_SHIFT);
	4612
	4613	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_START,
	4614	(int)uio->uio_offset, (int)filesize, read_type, read_length, 0);
	4615
	4616	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
	4617
	4618	max_rd_size = max_upl_size;
	4619	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
	4620
	4621	io_flag = CL_COMMIT \| CL_READ \| CL_ASYNC \| CL_NOZERO \| CL_DIRECT_IO;
	4622
	4623	if (flags & IO_PASSIVE) {
	4624	io_flag \|= CL_PASSIVE;
	4625	}
	4626
	4627	if (flags & IO_ENCRYPTED) {
	4628	io_flag \|= CL_RAW_ENCRYPTED;
	4629	}
	4630
	4631	if (flags & IO_NOCACHE) {
	4632	io_flag \|= CL_NOCACHE;
	4633	}
	4634
	4635	if (flags & IO_SKIP_ENCRYPTION) {
	4636	io_flag \|= CL_ENCRYPTED;
	4637	}
	4638
	4639	iostate.io_completed = 0;
	4640	iostate.io_issued = 0;
	4641	iostate.io_error = 0;
	4642	iostate.io_wanted = 0;
	4643
	4644	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
	4645
	4646	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	4647	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	4648
	4649	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_NONE,
	4650	(int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
	4651
	4652	if (devblocksize == 1) {
	4653	/*
	4654	* the AFP client advertises a devblocksize of 1
	4655	* however, its BLOCKMAP routine maps to physical
	4656	* blocks that are PAGE_SIZE in size...
	4657	* therefore we can't ask for I/Os that aren't page aligned
	4658	* or aren't multiples of PAGE_SIZE in size
	4659	* by setting devblocksize to PAGE_SIZE, we re-instate
	4660	* the old behavior we had before the mem_alignment_mask
	4661	* changes went in...
	4662	*/
	4663	devblocksize = PAGE_SIZE;
	4664	}
	4665
	4666	orig_iov_base = uio_curriovbase(uio);
	4667	last_iov_base = orig_iov_base;
	4668
	4669	next_dread:
	4670	io_req_size = *read_length;
	4671	iov_base = uio_curriovbase(uio);
	4672
	4673	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	4674	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	4675
	4676	if (vm_map_page_mask(current_map()) < PAGE_MASK) {
	4677	/*
	4678	* XXX TODO4K
	4679	* Direct I/O might not work as expected from a 16k kernel space
	4680	* to a 4k user space because each 4k chunk might point to
	4681	* a different 16k physical page...
	4682	* Let's go the "misaligned" way.
	4683	*/
	4684	if (!misaligned) {
	4685	DEBUG4K_VFS("forcing misaligned\n");
	4686	}
	4687	misaligned = 1;
	4688	}
	4689
	4690	if (offset_in_file \|\| offset_in_iovbase) {
	4691	/*
	4692	* one of the 2 important offsets is misaligned
	4693	* so fire an I/O through the cache for this entire vector
	4694	*/
	4695	misaligned = 1;
	4696	}
	4697	if (iov_base & (devblocksize - 1)) {
	4698	/*
	4699	* the offset in memory must be on a device block boundary
	4700	* so that we can guarantee that we can generate an
	4701	* I/O that ends on a page boundary in cluster_io
	4702	*/
	4703	misaligned = 1;
	4704	}
	4705
	4706	max_io_size = filesize - uio->uio_offset;
	4707
	4708	/*
	4709	* The user must request IO in aligned chunks. If the
	4710	* offset into the file is bad, or the userland pointer
	4711	* is non-aligned, then we cannot service the encrypted IO request.
	4712	*/
	4713	if (flags & IO_ENCRYPTED) {
	4714	if (misaligned \|\| (io_req_size & (devblocksize - 1))) {
	4715	retval = EINVAL;
	4716	}
	4717
	4718	max_io_size = roundup(max_io_size, devblocksize);
	4719	}
	4720
	4721	if ((off_t)io_req_size > max_io_size) {
	4722	io_req_size = (u_int32_t)max_io_size;
	4723	}
	4724
	4725	/*
	4726	* When we get to this point, we know...
	4727	* -- the offset into the file is on a devblocksize boundary
	4728	*/
	4729
	4730	while (io_req_size && retval == 0) {
	4731	u_int32_t io_start;
	4732
	4733	if (cluster_is_throttled(vp)) {
	4734	/*
	4735	* we're in the throttle window, at the very least
	4736	* we want to limit the size of the I/O we're about
	4737	* to issue
	4738	*/
	4739	max_rd_size = THROTTLE_MAX_IOSIZE;
	4740	max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
	4741	max_vector_size = THROTTLE_MAX_IOSIZE;
	4742	} else {
	4743	max_rd_size = max_upl_size;
	4744	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
	4745	max_vector_size = MAX_VECTOR_UPL_SIZE;
	4746	}
	4747	io_start = io_size = io_req_size;
	4748
	4749	/*
	4750	* First look for pages already in the cache
	4751	* and move them to user space. But only do this
	4752	* check if we are not retrieving encrypted data directly
	4753	* from the filesystem; those blocks should never
	4754	* be in the UBC.
	4755	*
	4756	* cluster_copy_ubc_data returns the resid
	4757	* in io_size
	4758	*/
	4759	if ((flags & IO_ENCRYPTED) == 0) {
	4760	retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
	4761	}
	4762	/*
	4763	* calculate the number of bytes actually copied
	4764	* starting size - residual
	4765	*/
	4766	xsize = io_start - io_size;
	4767
	4768	io_req_size -= xsize;
	4769
	4770	if (useVectorUPL && (xsize \|\| (iov_base & PAGE_MASK))) {
	4771	/*
	4772	* We found something in the cache or we have an iov_base that's not
	4773	* page-aligned.
	4774	*
	4775	* Issue all I/O's that have been collected within this Vectored UPL.
	4776	*/
	4777	if (vector_upl_index) {
	4778	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4779	reset_vector_run_state();
	4780	}
	4781
	4782	if (xsize) {
	4783	useVectorUPL = 0;
	4784	}
	4785
	4786	/*
	4787	* After this point, if we are using the Vector UPL path and the base is
	4788	* not page-aligned then the UPL with that base will be the first in the vector UPL.
	4789	*/
	4790	}
	4791
	4792	/*
	4793	* check to see if we are finished with this request.
	4794	*
	4795	* If we satisfied this IO already, then io_req_size will be 0.
	4796	* Otherwise, see if the IO was mis-aligned and needs to go through
	4797	* the UBC to deal with the 'tail'.
	4798	*
	4799	*/
	4800	if (io_req_size == 0 \|\| (misaligned)) {
	4801	/*
	4802	* see if there's another uio vector to
	4803	* process that's of type IO_DIRECT
	4804	*
	4805	* break out of while loop to get there
	4806	*/
	4807	break;
	4808	}
	4809	/*
	4810	* assume the request ends on a device block boundary
	4811	*/
	4812	io_min = devblocksize;
	4813
	4814	/*
	4815	* we can handle I/O's in multiples of the device block size
	4816	* however, if io_size isn't a multiple of devblocksize we
	4817	* want to clip it back to the nearest page boundary since
	4818	* we are going to have to go through cluster_read_copy to
	4819	* deal with the 'overhang'... by clipping it to a PAGE_SIZE
	4820	* multiple, we avoid asking the drive for the same physical
	4821	* blocks twice.. once for the partial page at the end of the
	4822	* request and a 2nd time for the page we read into the cache
	4823	* (which overlaps the end of the direct read) in order to
	4824	* get at the overhang bytes
	4825	*/
	4826	if (io_size & (devblocksize - 1)) {
	4827	assert(!(flags & IO_ENCRYPTED));
	4828	/*
	4829	* Clip the request to the previous page size boundary
	4830	* since request does NOT end on a device block boundary
	4831	*/
	4832	io_size &= ~PAGE_MASK;
	4833	io_min = PAGE_SIZE;
	4834	}
	4835	if (retval \|\| io_size < io_min) {
	4836	/*
	4837	* either an error or we only have the tail left to
	4838	* complete via the copy path...
	4839	* we may have already spun some portion of this request
	4840	* off as async requests... we need to wait for the I/O
	4841	* to complete before returning
	4842	*/
	4843	goto wait_for_dreads;
	4844	}
	4845
	4846	/*
	4847	* Don't re-check the UBC data if we are looking for uncached IO
	4848	* or asking for encrypted blocks.
	4849	*/
	4850	if ((flags & IO_ENCRYPTED) == 0) {
	4851	if ((xsize = io_size) > max_rd_size) {
	4852	xsize = max_rd_size;
	4853	}
	4854
	4855	io_size = 0;
	4856
	4857	if (!lock) {
	4858	/*
	4859	* We hold a lock here between the time we check the
	4860	* cache and the time we issue I/O. This saves us
	4861	* from having to lock the pages in the cache. Not
	4862	* all clients will care about this lock but some
	4863	* clients may want to guarantee stability between
	4864	* here and when the I/O is issued in which case they
	4865	* will take the lock exclusively.
	4866	*/
	4867	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
	4868	}
	4869
	4870	ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
	4871
	4872	if (io_size == 0) {
	4873	/*
	4874	* a page must have just come into the cache
	4875	* since the first page in this range is no
	4876	* longer absent, go back and re-evaluate
	4877	*/
	4878	continue;
	4879	}
	4880	}
	4881	if ((flags & IO_RETURN_ON_THROTTLE)) {
	4882	if (cluster_is_throttled(vp) == THROTTLE_NOW) {
	4883	if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
	4884	/*
	4885	* we're in the throttle window and at least 1 I/O
	4886	* has already been issued by a throttleable thread
	4887	* in this window, so return with EAGAIN to indicate
	4888	* to the FS issuing the cluster_read call that it
	4889	* should now throttle after dropping any locks
	4890	*/
	4891	throttle_info_update_by_mount(vp->v_mount);
	4892
	4893	io_throttled = TRUE;
	4894	goto wait_for_dreads;
	4895	}
	4896	}
	4897	}
	4898	if (io_size > max_rd_size) {
	4899	io_size = max_rd_size;
	4900	}
	4901
	4902	iov_base = uio_curriovbase(uio);
	4903
	4904	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	4905	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	4906
	4907	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_START,
	4908	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	4909
	4910	if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
	4911	no_zero_fill = 1;
	4912	} else {
	4913	no_zero_fill = 0;
	4914	}
	4915
	4916	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	4917	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	4918	pages_in_pl = 0;
	4919	upl_size = (upl_size_t)upl_needed_size;
	4920	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	4921	if (no_zero_fill) {
	4922	upl_flags \|= UPL_NOZEROFILL;
	4923	}
	4924	if (force_data_sync) {
	4925	upl_flags \|= UPL_FORCE_DATA_SYNC;
	4926	}
	4927
	4928	kret = vm_map_create_upl(map,
	4929	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	4930	&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
	4931
	4932	if (kret != KERN_SUCCESS) {
	4933	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4934	(int)upl_offset, upl_size, io_size, kret, 0);
	4935	/*
	4936	* failed to get pagelist
	4937	*
	4938	* we may have already spun some portion of this request
	4939	* off as async requests... we need to wait for the I/O
	4940	* to complete before returning
	4941	*/
	4942	goto wait_for_dreads;
	4943	}
	4944	pages_in_pl = upl_size / PAGE_SIZE;
	4945	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	4946
	4947	for (i = 0; i < pages_in_pl; i++) {
	4948	if (!upl_page_present(pl, i)) {
	4949	break;
	4950	}
	4951	}
	4952	if (i == pages_in_pl) {
	4953	break;
	4954	}
	4955
	4956	ubc_upl_abort(upl, 0);
	4957	}
	4958	if (force_data_sync >= 3) {
	4959	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4960	(int)upl_offset, upl_size, io_size, kret, 0);
	4961
	4962	goto wait_for_dreads;
	4963	}
	4964	/*
	4965	* Consider the possibility that upl_size wasn't satisfied.
	4966	*/
	4967	if (upl_size < upl_needed_size) {
	4968	if (upl_size && upl_offset == 0) {
	4969	io_size = upl_size;
	4970	} else {
	4971	io_size = 0;
	4972	}
	4973	}
	4974	if (io_size == 0) {
	4975	ubc_upl_abort(upl, 0);
	4976	goto wait_for_dreads;
	4977	}
	4978	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4979	(int)upl_offset, upl_size, io_size, kret, 0);
	4980
	4981	if (useVectorUPL) {
	4982	vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
	4983	if (end_off) {
	4984	issueVectorUPL = 1;
	4985	}
	4986	/*
	4987	* After this point, if we are using a vector UPL, then
	4988	* either all the UPL elements end on a page boundary OR
	4989	* this UPL is the last element because it does not end
	4990	* on a page boundary.
	4991	*/
	4992	}
	4993
	4994	/*
	4995	* request asynchronously so that we can overlap
	4996	* the preparation of the next I/O
	4997	* if there are already too many outstanding reads
	4998	* wait until some have completed before issuing the next read
	4999	*/
	5000	cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
	5001
	5002	if (iostate.io_error) {
	5003	/*
	5004	* one of the earlier reads we issued ran into a hard error
	5005	* don't issue any more reads, cleanup the UPL
	5006	* that was just created but not used, then
	5007	* go wait for any other reads to complete before
	5008	* returning the error to the caller
	5009	*/
	5010	ubc_upl_abort(upl, 0);
	5011
	5012	goto wait_for_dreads;
	5013	}
	5014	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_START,
	5015	upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
	5016
	5017	if (!useVectorUPL) {
	5018	if (no_zero_fill) {
	5019	io_flag &= ~CL_PRESERVE;
	5020	} else {
	5021	io_flag \|= CL_PRESERVE;
	5022	}
	5023
	5024	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	5025	} else {
	5026	if (!vector_upl_index) {
	5027	vector_upl = vector_upl_create(upl_offset);
	5028	v_upl_uio_offset = uio->uio_offset;
	5029	vector_upl_offset = upl_offset;
	5030	}
	5031
	5032	vector_upl_set_subupl(vector_upl, upl, upl_size);
	5033	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
	5034	vector_upl_index++;
	5035	vector_upl_size += upl_size;
	5036	vector_upl_iosize += io_size;
	5037
	5038	if (issueVectorUPL \|\| vector_upl_index == MAX_VECTOR_UPL_ELEMENTS \|\| vector_upl_size >= max_vector_size) {
	5039	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	5040	reset_vector_run_state();
	5041	}
	5042	}
	5043	last_iov_base = iov_base + io_size;
	5044
	5045	if (lock) {
	5046	// We don't need to wait for the I/O to complete
	5047	cluster_unlock_direct_read(lock);
	5048	lock = NULL;
	5049	}
	5050
	5051	/*
	5052	* update the uio structure
	5053	*/
	5054	if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
	5055	uio_update(uio, (user_size_t)max_io_size);
	5056	} else {
	5057	uio_update(uio, (user_size_t)io_size);
	5058	}
	5059
	5060	io_req_size -= io_size;
	5061
	5062	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_END,
	5063	upl, (int)uio->uio_offset, io_req_size, retval, 0);
	5064	} /* end while */
	5065
	5066	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
	5067	retval = cluster_io_type(uio, read_type, read_length, 0);
	5068
	5069	if (retval == 0 && *read_type == IO_DIRECT) {
	5070	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_NONE,
	5071	(int)uio->uio_offset, (int)filesize, read_type, read_length, 0);
	5072
	5073	goto next_dread;
	5074	}
	5075	}
	5076
	5077	wait_for_dreads:
	5078
	5079	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
	5080	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	5081	reset_vector_run_state();
	5082	}
	5083
	5084	// We don't need to wait for the I/O to complete
	5085	if (lock) {
	5086	cluster_unlock_direct_read(lock);
	5087	}
	5088
	5089	/*
	5090	* make sure all async reads that are part of this stream
	5091	* have completed before we return
	5092	*/
	5093	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
	5094
	5095	if (iostate.io_error) {
	5096	retval = iostate.io_error;
	5097	}
	5098
	5099	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	5100
	5101	if (io_throttled == TRUE && retval == 0) {
	5102	retval = EAGAIN;
	5103	}
	5104
	5105	vm_map_offset_t current_page_size, current_page_mask;
	5106	current_page_size = vm_map_page_size(current_map());
	5107	current_page_mask = vm_map_page_mask(current_map());
	5108	for (next_iov_base = orig_iov_base;
	5109	next_iov_base < last_iov_base;
	5110	next_iov_base += current_page_size) {
	5111	/*
	5112	* This is specifically done for pmap accounting purposes.
	5113	* vm_pre_fault() will call vm_fault() to enter the page into
	5114	* the pmap if there isn't _a_ physical page for that VA already.
	5115	*/
	5116	vm_pre_fault(vm_map_trunc_page(next_iov_base, current_page_mask), VM_PROT_READ);
	5117	}
	5118
	5119	if (io_req_size && retval == 0) {
	5120	/*
	5121	* we couldn't handle the tail of this request in DIRECT mode
	5122	* so fire it through the copy path
	5123	*/
	5124	if (flags & IO_ENCRYPTED) {
	5125	/*
	5126	* We cannot fall back to the copy path for encrypted I/O. If this
	5127	* happens, there is something wrong with the user buffer passed
	5128	* down.
	5129	*/
	5130	retval = EFAULT;
	5131	} else {
	5132	retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
	5133	}
	5134
	5135	*read_type = IO_UNKNOWN;
	5136	}
	5137	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_END,
	5138	(int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
	5139
	5140	return retval;
	5141	}
	5142
	5143
	5144	static int
	5145	cluster_read_contig(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	5146	int (callback)(buf_t, void ), void *callback_arg, int flags)
	5147	{
	5148	upl_page_info_t *pl;
	5149	upl_t upl[MAX_VECTS];
	5150	vm_offset_t upl_offset;
	5151	addr64_t dst_paddr = 0;
	5152	user_addr_t iov_base;
	5153	off_t max_size;
	5154	upl_size_t upl_size;
	5155	vm_size_t upl_needed_size;
	5156	mach_msg_type_number_t pages_in_pl;
	5157	upl_control_flags_t upl_flags;
	5158	kern_return_t kret;
	5159	struct clios iostate;
	5160	int error = 0;
	5161	int cur_upl = 0;
	5162	int num_upl = 0;
	5163	int n;
	5164	u_int32_t xsize;
	5165	u_int32_t io_size;
	5166	u_int32_t devblocksize;
	5167	u_int32_t mem_alignment_mask;
	5168	u_int32_t tail_size = 0;
	5169	int bflag;
	5170
	5171	if (flags & IO_PASSIVE) {
	5172	bflag = CL_PASSIVE;
	5173	} else {
	5174	bflag = 0;
	5175	}
	5176
	5177	if (flags & IO_NOCACHE) {
	5178	bflag \|= CL_NOCACHE;
	5179	}
	5180
	5181	/*
	5182	* When we enter this routine, we know
	5183	* -- the read_length will not exceed the current iov_len
	5184	* -- the target address is physically contiguous for read_length
	5185	*/
	5186	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
	5187
	5188	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	5189	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	5190
	5191	iostate.io_completed = 0;
	5192	iostate.io_issued = 0;
	5193	iostate.io_error = 0;
	5194	iostate.io_wanted = 0;
	5195
	5196	lck_mtx_init(&iostate.io_mtxp, &cl_mtx_grp, LCK_ATTR_NULL);
	5197
	5198	next_cread:
	5199	io_size = *read_length;
	5200
	5201	max_size = filesize - uio->uio_offset;
	5202
	5203	if (io_size > max_size) {
	5204	io_size = (u_int32_t)max_size;
	5205	}
	5206
	5207	iov_base = uio_curriovbase(uio);
	5208
	5209	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	5210	upl_needed_size = upl_offset + io_size;
	5211
	5212	pages_in_pl = 0;
	5213	upl_size = (upl_size_t)upl_needed_size;
	5214	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	5215
	5216
	5217	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) \| DBG_FUNC_START,
	5218	(int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
	5219
	5220	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	5221	kret = vm_map_get_upl(map,
	5222	vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	5223	&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
	5224
	5225	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) \| DBG_FUNC_END,
	5226	(int)upl_offset, upl_size, io_size, kret, 0);
	5227
	5228	if (kret != KERN_SUCCESS) {
	5229	/*
	5230	* failed to get pagelist
	5231	*/
	5232	error = EINVAL;
	5233	goto wait_for_creads;
	5234	}
	5235	num_upl++;
	5236
	5237	if (upl_size < upl_needed_size) {
	5238	/*
	5239	* The upl_size wasn't satisfied.
	5240	*/
	5241	error = EINVAL;
	5242	goto wait_for_creads;
	5243	}
	5244	pl = ubc_upl_pageinfo(upl[cur_upl]);
	5245
	5246	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
	5247
	5248	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	5249	u_int32_t head_size;
	5250
	5251	head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
	5252
	5253	if (head_size > io_size) {
	5254	head_size = io_size;
	5255	}
	5256
	5257	error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
	5258
	5259	if (error) {
	5260	goto wait_for_creads;
	5261	}
	5262
	5263	upl_offset += head_size;
	5264	dst_paddr += head_size;
	5265	io_size -= head_size;
	5266
	5267	iov_base += head_size;
	5268	}
	5269	if ((u_int32_t)iov_base & mem_alignment_mask) {
	5270	/*
	5271	* request doesn't set up on a memory boundary
	5272	* the underlying DMA engine can handle...
	5273	* return an error instead of going through
	5274	* the slow copy path since the intent of this
	5275	* path is direct I/O to device memory
	5276	*/
	5277	error = EINVAL;
	5278	goto wait_for_creads;
	5279	}
	5280
	5281	tail_size = io_size & (devblocksize - 1);
	5282
	5283	io_size -= tail_size;
	5284
	5285	while (io_size && error == 0) {
	5286	if (io_size > MAX_IO_CONTIG_SIZE) {
	5287	xsize = MAX_IO_CONTIG_SIZE;
	5288	} else {
	5289	xsize = io_size;
	5290	}
	5291	/*
	5292	* request asynchronously so that we can overlap
	5293	* the preparation of the next I/O... we'll do
	5294	* the commit after all the I/O has completed
	5295	* since its all issued against the same UPL
	5296	* if there are already too many outstanding reads
	5297	* wait until some have completed before issuing the next
	5298	*/
	5299	cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
	5300
	5301	if (iostate.io_error) {
	5302	/*
	5303	* one of the earlier reads we issued ran into a hard error
	5304	* don't issue any more reads...
	5305	* go wait for any other reads to complete before
	5306	* returning the error to the caller
	5307	*/
	5308	goto wait_for_creads;
	5309	}
	5310	error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
	5311	CL_READ \| CL_NOZERO \| CL_DEV_MEMORY \| CL_ASYNC \| bflag,
	5312	(buf_t)NULL, &iostate, callback, callback_arg);
	5313	/*
	5314	* The cluster_io read was issued successfully,
	5315	* update the uio structure
	5316	*/
	5317	if (error == 0) {
	5318	uio_update(uio, (user_size_t)xsize);
	5319
	5320	dst_paddr += xsize;
	5321	upl_offset += xsize;
	5322	io_size -= xsize;
	5323	}
	5324	}
	5325	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
	5326	error = cluster_io_type(uio, read_type, read_length, 0);
	5327
	5328	if (error == 0 && *read_type == IO_CONTIG) {
	5329	cur_upl++;
	5330	goto next_cread;
	5331	}
	5332	} else {
	5333	*read_type = IO_UNKNOWN;
	5334	}
	5335
	5336	wait_for_creads:
	5337	/*
	5338	* make sure all async reads that are part of this stream
	5339	* have completed before we proceed
	5340	*/
	5341	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
	5342
	5343	if (iostate.io_error) {
	5344	error = iostate.io_error;
	5345	}
	5346
	5347	lck_mtx_destroy(&iostate.io_mtxp, &cl_mtx_grp);
	5348
	5349	if (error == 0 && tail_size) {
	5350	error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
	5351	}
	5352
	5353	for (n = 0; n < num_upl; n++) {
	5354	/*
	5355	* just release our hold on each physically contiguous
	5356	* region without changing any state
	5357	*/
	5358	ubc_upl_abort(upl[n], 0);
	5359	}
	5360
	5361	return error;
	5362	}
	5363
	5364
	5365	static int
	5366	cluster_io_type(struct uio uio, int io_type, u_int32_t *io_length, u_int32_t min_length)
	5367	{
	5368	user_size_t iov_len;
	5369	user_addr_t iov_base = 0;
	5370	upl_t upl;
	5371	upl_size_t upl_size;
	5372	upl_control_flags_t upl_flags;
	5373	int retval = 0;
	5374
	5375	/*
	5376	* skip over any emtpy vectors
	5377	*/
	5378	uio_update(uio, (user_size_t)0);
	5379
	5380	iov_len = uio_curriovlen(uio);
	5381
	5382	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) \| DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
	5383
	5384	if (iov_len) {
	5385	iov_base = uio_curriovbase(uio);
	5386	/*
	5387	* make sure the size of the vector isn't too big...
	5388	* internally, we want to handle all of the I/O in
	5389	* chunk sizes that fit in a 32 bit int
	5390	*/
	5391	if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
	5392	upl_size = MAX_IO_REQUEST_SIZE;
	5393	} else {
	5394	upl_size = (u_int32_t)iov_len;
	5395	}
	5396
	5397	upl_flags = UPL_QUERY_OBJECT_TYPE;
	5398
	5399	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	5400	if ((vm_map_get_upl(map,
	5401	vm_map_trunc_page(iov_base, vm_map_page_mask(map)),
	5402	&upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
	5403	/*
	5404	* the user app must have passed in an invalid address
	5405	*/
	5406	retval = EFAULT;
	5407	}
	5408	if (upl_size == 0) {
	5409	retval = EFAULT;
	5410	}
	5411
	5412	*io_length = upl_size;
	5413
	5414	if (upl_flags & UPL_PHYS_CONTIG) {
	5415	*io_type = IO_CONTIG;
	5416	} else if (iov_len >= min_length) {
	5417	*io_type = IO_DIRECT;
	5418	} else {
	5419	*io_type = IO_COPY;
	5420	}
	5421	} else {
	5422	/*
	5423	* nothing left to do for this uio
	5424	*/
	5425	*io_length = 0;
	5426	*io_type = IO_UNKNOWN;
	5427	}
	5428	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) \| DBG_FUNC_END, iov_base, io_type, io_length, retval, 0);
	5429
	5430	if (*io_type == IO_DIRECT &&
	5431	vm_map_page_shift(current_map()) < PAGE_SHIFT) {
	5432	/* no direct I/O for sub-page-size address spaces */
	5433	DEBUG4K_VFS("io_type IO_DIRECT -> IO_COPY\n");
	5434	*io_type = IO_COPY;
	5435	}
	5436
	5437	return retval;
	5438	}
	5439
	5440
	5441	/*
	5442	* generate advisory I/O's in the largest chunks possible
	5443	* the completed pages will be released into the VM cache
	5444	*/
	5445	int
	5446	advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
	5447	{
	5448	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
	5449	}
	5450
	5451	int
	5452	advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (callback)(buf_t, void ), void *callback_arg, int bflag)
	5453	{
	5454	upl_page_info_t *pl;
	5455	upl_t upl;
	5456	vm_offset_t upl_offset;
	5457	int upl_size;
	5458	off_t upl_f_offset;
	5459	int start_offset;
	5460	int start_pg;
	5461	int last_pg;
	5462	int pages_in_upl;
	5463	off_t max_size;
	5464	int io_size;
	5465	kern_return_t kret;
	5466	int retval = 0;
	5467	int issued_io;
	5468	int skip_range;
	5469	uint32_t max_io_size;
	5470
	5471
	5472	if (!UBCINFOEXISTS(vp)) {
	5473	return EINVAL;
	5474	}
	5475
	5476	if (f_offset < 0 \|\| resid < 0) {
	5477	return EINVAL;
	5478	}
	5479
	5480	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	5481
	5482	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
	5483	if (max_io_size > speculative_prefetch_max_iosize) {
	5484	max_io_size = speculative_prefetch_max_iosize;
	5485	}
	5486	}
	5487
	5488	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_START,
	5489	(int)f_offset, resid, (int)filesize, 0, 0);
	5490
	5491	while (resid && f_offset < filesize && retval == 0) {
	5492	/*
	5493	* compute the size of the upl needed to encompass
	5494	* the requested read... limit each call to cluster_io
	5495	* to the maximum UPL size... cluster_io will clip if
	5496	* this exceeds the maximum io_size for the device,
	5497	* make sure to account for
	5498	* a starting offset that's not page aligned
	5499	*/
	5500	start_offset = (int)(f_offset & PAGE_MASK_64);
	5501	upl_f_offset = f_offset - (off_t)start_offset;
	5502	max_size = filesize - f_offset;
	5503
	5504	if (resid < max_size) {
	5505	io_size = resid;
	5506	} else {
	5507	io_size = (int)max_size;
	5508	}
	5509
	5510	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	5511	if ((uint32_t)upl_size > max_io_size) {
	5512	upl_size = max_io_size;
	5513	}
	5514
	5515	skip_range = 0;
	5516	/*
	5517	* return the number of contiguously present pages in the cache
	5518	* starting at upl_f_offset within the file
	5519	*/
	5520	ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
	5521
	5522	if (skip_range) {
	5523	/*
	5524	* skip over pages already present in the cache
	5525	*/
	5526	io_size = skip_range - start_offset;
	5527
	5528	f_offset += io_size;
	5529	resid -= io_size;
	5530
	5531	if (skip_range == upl_size) {
	5532	continue;
	5533	}
	5534	/*
	5535	* have to issue some real I/O
	5536	* at this point, we know it's starting on a page boundary
	5537	* because we've skipped over at least the first page in the request
	5538	*/
	5539	start_offset = 0;
	5540	upl_f_offset += skip_range;
	5541	upl_size -= skip_range;
	5542	}
	5543	pages_in_upl = upl_size / PAGE_SIZE;
	5544
	5545	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_START,
	5546	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	5547
	5548	kret = ubc_create_upl_kernel(vp,
	5549	upl_f_offset,
	5550	upl_size,
	5551	&upl,
	5552	&pl,
	5553	UPL_RET_ONLY_ABSENT \| UPL_SET_LITE,
	5554	VM_KERN_MEMORY_FILE);
	5555	if (kret != KERN_SUCCESS) {
	5556	return retval;
	5557	}
	5558	issued_io = 0;
	5559
	5560	/*
	5561	* before we start marching forward, we must make sure we end on
	5562	* a present page, otherwise we will be working with a freed
	5563	* upl
	5564	*/
	5565	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	5566	if (upl_page_present(pl, last_pg)) {
	5567	break;
	5568	}
	5569	}
	5570	pages_in_upl = last_pg + 1;
	5571
	5572
	5573	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_END,
	5574	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	5575
	5576
	5577	for (last_pg = 0; last_pg < pages_in_upl;) {
	5578	/*
	5579	* scan from the beginning of the upl looking for the first
	5580	* page that is present.... this will become the first page in
	5581	* the request we're going to make to 'cluster_io'... if all
	5582	* of the pages are absent, we won't call through to 'cluster_io'
	5583	*/
	5584	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	5585	if (upl_page_present(pl, start_pg)) {
	5586	break;
	5587	}
	5588	}
	5589
	5590	/*
	5591	* scan from the starting present page looking for an absent
	5592	* page before the end of the upl is reached, if we
	5593	* find one, then it will terminate the range of pages being
	5594	* presented to 'cluster_io'
	5595	*/
	5596	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	5597	if (!upl_page_present(pl, last_pg)) {
	5598	break;
	5599	}
	5600	}
	5601
	5602	if (last_pg > start_pg) {
	5603	/*
	5604	* we found a range of pages that must be filled
	5605	* if the last page in this range is the last page of the file
	5606	* we may have to clip the size of it to keep from reading past
	5607	* the end of the last physical block associated with the file
	5608	*/
	5609	upl_offset = start_pg * PAGE_SIZE;
	5610	io_size = (last_pg - start_pg) * PAGE_SIZE;
	5611
	5612	if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
	5613	io_size = (int)(filesize - (upl_f_offset + upl_offset));
	5614	}
	5615
	5616	/*
	5617	* issue an asynchronous read to cluster_io
	5618	*/
	5619	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	5620	CL_ASYNC \| CL_READ \| CL_COMMIT \| CL_AGE \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	5621
	5622	issued_io = 1;
	5623	}
	5624	}
	5625	if (issued_io == 0) {
	5626	ubc_upl_abort(upl, 0);
	5627	}
	5628
	5629	io_size = upl_size - start_offset;
	5630
	5631	if (io_size > resid) {
	5632	io_size = resid;
	5633	}
	5634	f_offset += io_size;
	5635	resid -= io_size;
	5636	}
	5637
	5638	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_END,
	5639	(int)f_offset, resid, retval, 0, 0);
	5640
	5641	return retval;
	5642	}
	5643
	5644
	5645	int
	5646	cluster_push(vnode_t vp, int flags)
	5647	{
	5648	return cluster_push_ext(vp, flags, NULL, NULL);
	5649	}
	5650
	5651
	5652	int
	5653	cluster_push_ext(vnode_t vp, int flags, int (callback)(buf_t, void ), void *callback_arg)
	5654	{
	5655	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
	5656	}
	5657
	5658	/* write errors via err, but return the number of clusters written */
	5659	int
	5660	cluster_push_err(vnode_t vp, int flags, int (callback)(buf_t, void ), void callback_arg, int err)
	5661	{
	5662	int retval;
	5663	int my_sparse_wait = 0;
	5664	struct cl_writebehind *wbp;
	5665	int local_err = 0;
	5666
	5667	if (err) {
	5668	*err = 0;
	5669	}
	5670
	5671	if (!UBCINFOEXISTS(vp)) {
	5672	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
	5673	return 0;
	5674	}
	5675	/* return if deferred write is set */
	5676	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
	5677	return 0;
	5678	}
	5679	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
	5680	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
	5681	return 0;
	5682	}
	5683	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
	5684	lck_mtx_unlock(&wbp->cl_lockw);
	5685
	5686	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
	5687	return 0;
	5688	}
	5689	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_START,
	5690	wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
	5691
	5692	/*
	5693	* if we have an fsync in progress, we don't want to allow any additional
	5694	* sync/fsync/close(s) to occur until it finishes.
	5695	* note that its possible for writes to continue to occur to this file
	5696	* while we're waiting and also once the fsync starts to clean if we're
	5697	* in the sparse map case
	5698	*/
	5699	while (wbp->cl_sparse_wait) {
	5700	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
	5701
	5702	msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
	5703
	5704	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	5705	}
	5706	if (flags & IO_SYNC) {
	5707	my_sparse_wait = 1;
	5708	wbp->cl_sparse_wait = 1;
	5709
	5710	/*
	5711	* this is an fsync (or equivalent)... we must wait for any existing async
	5712	* cleaning operations to complete before we evaulate the current state
	5713	* and finish cleaning... this insures that all writes issued before this
	5714	* fsync actually get cleaned to the disk before this fsync returns
	5715	*/
	5716	while (wbp->cl_sparse_pushes) {
	5717	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) \| DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
	5718
	5719	msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
	5720
	5721	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) \| DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	5722	}
	5723	}
	5724	if (wbp->cl_scmap) {
	5725	void *scmap;
	5726
	5727	if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
	5728	scmap = wbp->cl_scmap;
	5729	wbp->cl_scmap = NULL;
	5730
	5731	wbp->cl_sparse_pushes++;
	5732
	5733	lck_mtx_unlock(&wbp->cl_lockw);
	5734
	5735	retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
	5736
	5737	lck_mtx_lock(&wbp->cl_lockw);
	5738
	5739	wbp->cl_sparse_pushes--;
	5740
	5741	if (retval) {
	5742	if (wbp->cl_scmap != NULL) {
	5743	panic("cluster_push_err: Expected NULL cl_scmap\n");
	5744	}
	5745
	5746	wbp->cl_scmap = scmap;
	5747	}
	5748
	5749	if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
	5750	wakeup((caddr_t)&wbp->cl_sparse_pushes);
	5751	}
	5752	} else {
	5753	retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
	5754	}
	5755
	5756	local_err = retval;
	5757
	5758	if (err) {
	5759	*err = retval;
	5760	}
	5761	retval = 1;
	5762	} else {
	5763	retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
	5764	if (err) {
	5765	*err = local_err;
	5766	}
	5767	}
	5768	lck_mtx_unlock(&wbp->cl_lockw);
	5769
	5770	if (flags & IO_SYNC) {
	5771	(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
	5772	}
	5773
	5774	if (my_sparse_wait) {
	5775	/*
	5776	* I'm the owner of the serialization token
	5777	* clear it and wakeup anyone that is waiting
	5778	* for me to finish
	5779	*/
	5780	lck_mtx_lock(&wbp->cl_lockw);
	5781
	5782	wbp->cl_sparse_wait = 0;
	5783	wakeup((caddr_t)&wbp->cl_sparse_wait);
	5784
	5785	lck_mtx_unlock(&wbp->cl_lockw);
	5786	}
	5787	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_END,
	5788	wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
	5789
	5790	return retval;
	5791	}
	5792
	5793
	5794	__private_extern__ void
	5795	cluster_release(struct ubc_info *ubc)
	5796	{
	5797	struct cl_writebehind *wbp;
	5798	struct cl_readahead *rap;
	5799
	5800	if ((wbp = ubc->cl_wbehind)) {
	5801	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
	5802
	5803	if (wbp->cl_scmap) {
	5804	vfs_drt_control(&(wbp->cl_scmap), 0);
	5805	}
	5806	lck_mtx_destroy(&wbp->cl_lockw, &cl_mtx_grp);
	5807	zfree(cl_wr_zone, wbp);
	5808	ubc->cl_wbehind = NULL;
	5809	} else {
	5810	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, ubc, 0, 0, 0, 0);
	5811	}
	5812
	5813	if ((rap = ubc->cl_rahead)) {
	5814	lck_mtx_destroy(&rap->cl_lockr, &cl_mtx_grp);
	5815	zfree(cl_rd_zone, rap);
	5816	ubc->cl_rahead = NULL;
	5817	}
	5818
	5819	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_END, ubc, rap, wbp, 0, 0);
	5820	}
	5821
	5822
	5823	static int
	5824	cluster_try_push(struct cl_writebehind wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (callback)(buf_t, void ), void callback_arg, int *err, boolean_t vm_initiated)
	5825	{
	5826	int cl_index;
	5827	int cl_index1;
	5828	int min_index;
	5829	int cl_len;
	5830	int cl_pushed = 0;
	5831	struct cl_wextent l_clusters[MAX_CLUSTERS];
	5832	u_int max_cluster_pgcount;
	5833	int error = 0;
	5834
	5835	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	5836	/*
	5837	* the write behind context exists and has
	5838	* already been locked...
	5839	*/
	5840	if (wbp->cl_number == 0) {
	5841	/*
	5842	* no clusters to push
	5843	* return number of empty slots
	5844	*/
	5845	return MAX_CLUSTERS;
	5846	}
	5847
	5848	/*
	5849	* make a local 'sorted' copy of the clusters
	5850	* and clear wbp->cl_number so that new clusters can
	5851	* be developed
	5852	*/
	5853	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	5854	for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
	5855	if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
	5856	continue;
	5857	}
	5858	if (min_index == -1) {
	5859	min_index = cl_index1;
	5860	} else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
	5861	min_index = cl_index1;
	5862	}
	5863	}
	5864	if (min_index == -1) {
	5865	break;
	5866	}
	5867
	5868	l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
	5869	l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
	5870	l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
	5871
	5872	wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	5873	}
	5874	wbp->cl_number = 0;
	5875
	5876	cl_len = cl_index;
	5877
	5878	/* skip switching to the sparse cluster mechanism if on diskimage */
	5879	if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
	5880	!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
	5881	int i;
	5882
	5883	/*
	5884	* determine if we appear to be writing the file sequentially
	5885	* if not, by returning without having pushed any clusters
	5886	* we will cause this vnode to be pushed into the sparse cluster mechanism
	5887	* used for managing more random I/O patterns
	5888	*
	5889	* we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
	5890	* that's why we're in try_push with PUSH_DELAY...
	5891	*
	5892	* check to make sure that all the clusters except the last one are 'full'... and that each cluster
	5893	* is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
	5894	* so we can just make a simple pass through, up to, but not including the last one...
	5895	* note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
	5896	* are sequential
	5897	*
	5898	* we let the last one be partial as long as it was adjacent to the previous one...
	5899	* we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
	5900	* of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
	5901	*/
	5902	for (i = 0; i < MAX_CLUSTERS - 1; i++) {
	5903	if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
	5904	goto dont_try;
	5905	}
	5906	if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
	5907	goto dont_try;
	5908	}
	5909	}
	5910	}
	5911	if (vm_initiated == TRUE) {
	5912	lck_mtx_unlock(&wbp->cl_lockw);
	5913	}
	5914
	5915	for (cl_index = 0; cl_index < cl_len; cl_index++) {
	5916	int flags;
	5917	struct cl_extent cl;
	5918	int retval;
	5919
	5920	flags = io_flags & (IO_PASSIVE \| IO_CLOSE);
	5921
	5922	/*
	5923	* try to push each cluster in turn...
	5924	*/
	5925	if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
	5926	flags \|= IO_NOCACHE;
	5927	}
	5928
	5929	if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
	5930	flags \|= IO_PASSIVE;
	5931	}
	5932
	5933	if (push_flag & PUSH_SYNC) {
	5934	flags \|= IO_SYNC;
	5935	}
	5936
	5937	cl.b_addr = l_clusters[cl_index].b_addr;
	5938	cl.e_addr = l_clusters[cl_index].e_addr;
	5939
	5940	retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
	5941
	5942	if (retval == 0) {
	5943	cl_pushed++;
	5944
	5945	l_clusters[cl_index].b_addr = 0;
	5946	l_clusters[cl_index].e_addr = 0;
	5947	} else if (error == 0) {
	5948	error = retval;
	5949	}
	5950
	5951	if (!(push_flag & PUSH_ALL)) {
	5952	break;
	5953	}
	5954	}
	5955	if (vm_initiated == TRUE) {
	5956	lck_mtx_lock(&wbp->cl_lockw);
	5957	}
	5958
	5959	if (err) {
	5960	*err = error;
	5961	}
	5962
	5963	dont_try:
	5964	if (cl_len > cl_pushed) {
	5965	/*
	5966	* we didn't push all of the clusters, so
	5967	* lets try to merge them back in to the vnode
	5968	*/
	5969	if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
	5970	/*
	5971	* we picked up some new clusters while we were trying to
	5972	* push the old ones... this can happen because I've dropped
	5973	* the vnode lock... the sum of the
	5974	* leftovers plus the new cluster count exceeds our ability
	5975	* to represent them, so switch to the sparse cluster mechanism
	5976	*
	5977	* collect the active public clusters...
	5978	*/
	5979	sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
	5980
	5981	for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
	5982	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
	5983	continue;
	5984	}
	5985	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	5986	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	5987	wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
	5988
	5989	cl_index1++;
	5990	}
	5991	/*
	5992	* update the cluster count
	5993	*/
	5994	wbp->cl_number = cl_index1;
	5995
	5996	/*
	5997	* and collect the original clusters that were moved into the
	5998	* local storage for sorting purposes
	5999	*/
	6000	sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
	6001	} else {
	6002	/*
	6003	* we've got room to merge the leftovers back in
	6004	* just append them starting at the next 'hole'
	6005	* represented by wbp->cl_number
	6006	*/
	6007	for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
	6008	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
	6009	continue;
	6010	}
	6011
	6012	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	6013	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	6014	wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
	6015
	6016	cl_index1++;
	6017	}
	6018	/*
	6019	* update the cluster count
	6020	*/
	6021	wbp->cl_number = cl_index1;
	6022	}
	6023	}
	6024	return MAX_CLUSTERS - wbp->cl_number;
	6025	}
	6026
	6027
	6028
	6029	static int
	6030	cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
	6031	int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	6032	{
	6033	upl_page_info_t *pl;
	6034	upl_t upl;
	6035	vm_offset_t upl_offset;
	6036	int upl_size;
	6037	off_t upl_f_offset;
	6038	int pages_in_upl;
	6039	int start_pg;
	6040	int last_pg;
	6041	int io_size;
	6042	int io_flags;
	6043	int upl_flags;
	6044	int bflag;
	6045	int size;
	6046	int error = 0;
	6047	int retval;
	6048	kern_return_t kret;
	6049
	6050	if (flags & IO_PASSIVE) {
	6051	bflag = CL_PASSIVE;
	6052	} else {
	6053	bflag = 0;
	6054	}
	6055
	6056	if (flags & IO_SKIP_ENCRYPTION) {
	6057	bflag \|= CL_ENCRYPTED;
	6058	}
	6059
	6060	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_START,
	6061	(int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
	6062
	6063	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
	6064	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 0, 0, 0, 0);
	6065
	6066	return 0;
	6067	}
	6068	upl_size = pages_in_upl * PAGE_SIZE;
	6069	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	6070
	6071	if (upl_f_offset + upl_size >= EOF) {
	6072	if (upl_f_offset >= EOF) {
	6073	/*
	6074	* must have truncated the file and missed
	6075	* clearing a dangling cluster (i.e. it's completely
	6076	* beyond the new EOF
	6077	*/
	6078	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 1, 0, 0, 0);
	6079
	6080	return 0;
	6081	}
	6082	size = (int)(EOF - upl_f_offset);
	6083
	6084	upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	6085	pages_in_upl = upl_size / PAGE_SIZE;
	6086	} else {
	6087	size = upl_size;
	6088	}
	6089
	6090
	6091	if (vm_initiated) {
	6092	vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
	6093	UPL_MSYNC \| UPL_VNODE_PAGER \| UPL_KEEPCACHED, &error);
	6094
	6095	return error;
	6096	}
	6097	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, size, 0, 0, 0);
	6098
	6099	/*
	6100	* by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	6101	*
	6102	* - only pages that are currently dirty are returned... these are the ones we need to clean
	6103	* - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	6104	* - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	6105	* - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	6106	* someone dirties this page while the I/O is in progress, we don't lose track of the new state
	6107	*
	6108	* when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	6109	*/
	6110
	6111	if ((vp->v_flag & VNOCACHE_DATA) \|\| (flags & IO_NOCACHE)) {
	6112	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE \| UPL_WILL_BE_DUMPED;
	6113	} else {
	6114	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE;
	6115	}
	6116
	6117	kret = ubc_create_upl_kernel(vp,
	6118	upl_f_offset,
	6119	upl_size,
	6120	&upl,
	6121	&pl,
	6122	upl_flags,
	6123	VM_KERN_MEMORY_FILE);
	6124	if (kret != KERN_SUCCESS) {
	6125	panic("cluster_push: failed to get pagelist");
	6126	}
	6127
	6128	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
	6129
	6130	/*
	6131	* since we only asked for the dirty pages back
	6132	* it's possible that we may only get a few or even none, so...
	6133	* before we start marching forward, we must make sure we know
	6134	* where the last present page is in the UPL, otherwise we could
	6135	* end up working with a freed upl due to the FREE_ON_EMPTY semantics
	6136	* employed by commit_range and abort_range.
	6137	*/
	6138	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	6139	if (upl_page_present(pl, last_pg)) {
	6140	break;
	6141	}
	6142	}
	6143	pages_in_upl = last_pg + 1;
	6144
	6145	if (pages_in_upl == 0) {
	6146	ubc_upl_abort(upl, 0);
	6147
	6148	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 2, 0, 0, 0);
	6149	return 0;
	6150	}
	6151
	6152	for (last_pg = 0; last_pg < pages_in_upl;) {
	6153	/*
	6154	* find the next dirty page in the UPL
	6155	* this will become the first page in the
	6156	* next I/O to generate
	6157	*/
	6158	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	6159	if (upl_dirty_page(pl, start_pg)) {
	6160	break;
	6161	}
	6162	if (upl_page_present(pl, start_pg)) {
	6163	/*
	6164	* RET_ONLY_DIRTY will return non-dirty 'precious' pages
	6165	* just release these unchanged since we're not going
	6166	* to steal them or change their state
	6167	*/
	6168	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	6169	}
	6170	}
	6171	if (start_pg >= pages_in_upl) {
	6172	/*
	6173	* done... no more dirty pages to push
	6174	*/
	6175	break;
	6176	}
	6177	if (start_pg > last_pg) {
	6178	/*
	6179	* skipped over some non-dirty pages
	6180	*/
	6181	size -= ((start_pg - last_pg) * PAGE_SIZE);
	6182	}
	6183
	6184	/*
	6185	* find a range of dirty pages to write
	6186	*/
	6187	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	6188	if (!upl_dirty_page(pl, last_pg)) {
	6189	break;
	6190	}
	6191	}
	6192	upl_offset = start_pg * PAGE_SIZE;
	6193
	6194	io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
	6195
	6196	io_flags = CL_THROTTLE \| CL_COMMIT \| CL_AGE \| bflag;
	6197
	6198	if (!(flags & IO_SYNC)) {
	6199	io_flags \|= CL_ASYNC;
	6200	}
	6201
	6202	if (flags & IO_CLOSE) {
	6203	io_flags \|= CL_CLOSE;
	6204	}
	6205
	6206	if (flags & IO_NOCACHE) {
	6207	io_flags \|= CL_NOCACHE;
	6208	}
	6209
	6210	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	6211	io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6212
	6213	if (error == 0 && retval) {
	6214	error = retval;
	6215	}
	6216
	6217	size -= io_size;
	6218	}
	6219	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 3, error, 0, 0);
	6220
	6221	return error;
	6222	}
	6223
	6224
	6225	/*
	6226	* sparse_cluster_switch is called with the write behind lock held
	6227	*/
	6228	static int
	6229	sparse_cluster_switch(struct cl_writebehind wbp, vnode_t vp, off_t EOF, int (callback)(buf_t, void ), void callback_arg, boolean_t vm_initiated)
	6230	{
	6231	int cl_index;
	6232	int error;
	6233
	6234	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
	6235
	6236	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	6237	int flags;
	6238	struct cl_extent cl;
	6239
	6240	for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
	6241	if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
	6242	if (flags & UPL_POP_DIRTY) {
	6243	cl.e_addr = cl.b_addr + 1;
	6244
	6245	error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
	6246
	6247	if (error) {
	6248	break;
	6249	}
	6250	}
	6251	}
	6252	}
	6253	}
	6254	wbp->cl_number -= cl_index;
	6255
	6256	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
	6257
	6258	return error;
	6259	}
	6260
	6261
	6262	/*
	6263	* sparse_cluster_push must be called with the write-behind lock held if the scmap is
	6264	* still associated with the write-behind context... however, if the scmap has been disassociated
	6265	* from the write-behind context (the cluster_push case), the wb lock is not held
	6266	*/
	6267	static int
	6268	sparse_cluster_push(struct cl_writebehind wbp, void *scmap, vnode_t vp, off_t EOF, int push_flag,
	6269	int io_flags, int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	6270	{
	6271	struct cl_extent cl;
	6272	off_t offset;
	6273	u_int length;
	6274	void *l_scmap;
	6275	int error = 0;
	6276
	6277	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
	6278
	6279	if (push_flag & PUSH_ALL) {
	6280	vfs_drt_control(scmap, 1);
	6281	}
	6282
	6283	l_scmap = *scmap;
	6284
	6285	for (;;) {
	6286	int retval;
	6287
	6288	if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
	6289	break;
	6290	}
	6291
	6292	if (vm_initiated == TRUE) {
	6293	lck_mtx_unlock(&wbp->cl_lockw);
	6294	}
	6295
	6296	cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
	6297	cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
	6298
	6299	retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
	6300	if (error == 0 && retval) {
	6301	error = retval;
	6302	}
	6303
	6304	if (vm_initiated == TRUE) {
	6305	lck_mtx_lock(&wbp->cl_lockw);
	6306
	6307	if (*scmap != l_scmap) {
	6308	break;
	6309	}
	6310	}
	6311
	6312	if (error) {
	6313	if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
	6314	panic("Failed to restore dirty state on failure\n");
	6315	}
	6316
	6317	break;
	6318	}
	6319
	6320	if (!(push_flag & PUSH_ALL)) {
	6321	break;
	6322	}
	6323	}
	6324	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
	6325
	6326	return error;
	6327	}
	6328
	6329
	6330	/*
	6331	* sparse_cluster_add is called with the write behind lock held
	6332	*/
	6333	static int
	6334	sparse_cluster_add(struct cl_writebehind wbp, void scmap, vnode_t vp, struct cl_extent cl, off_t EOF,
	6335	int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	6336	{
	6337	u_int new_dirty;
	6338	u_int length;
	6339	off_t offset;
	6340	int error;
	6341	int push_flag = 0; /* Is this a valid value? */
	6342
	6343	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
	6344
	6345	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	6346	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
	6347
	6348	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
	6349	/*
	6350	* no room left in the map
	6351	* only a partial update was done
	6352	* push out some pages and try again
	6353	*/
	6354
	6355	if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
	6356	push_flag = 0;
	6357	}
	6358
	6359	error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
	6360
	6361	if (error) {
	6362	break;
	6363	}
	6364
	6365	offset += (new_dirty * PAGE_SIZE_64);
	6366	length -= (new_dirty * PAGE_SIZE);
	6367	}
	6368	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
	6369
	6370	return error;
	6371	}
	6372
	6373
	6374	static int
	6375	cluster_align_phys_io(vnode_t vp, struct uio uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (callback)(buf_t, void ), void callback_arg)
	6376	{
	6377	upl_page_info_t *pl;
	6378	upl_t upl;
	6379	addr64_t ubc_paddr;
	6380	kern_return_t kret;
	6381	int error = 0;
	6382	int did_read = 0;
	6383	int abort_flags;
	6384	int upl_flags;
	6385	int bflag;
	6386
	6387	if (flags & IO_PASSIVE) {
	6388	bflag = CL_PASSIVE;
	6389	} else {
	6390	bflag = 0;
	6391	}
	6392
	6393	if (flags & IO_NOCACHE) {
	6394	bflag \|= CL_NOCACHE;
	6395	}
	6396
	6397	upl_flags = UPL_SET_LITE;
	6398
	6399	if (!(flags & CL_READ)) {
	6400	/*
	6401	* "write" operation: let the UPL subsystem know
	6402	* that we intend to modify the buffer cache pages
	6403	* we're gathering.
	6404	*/
	6405	upl_flags \|= UPL_WILL_MODIFY;
	6406	} else {
	6407	/*
	6408	* indicate that there is no need to pull the
	6409	* mapping for this page... we're only going
	6410	* to read from it, not modify it.
	6411	*/
	6412	upl_flags \|= UPL_FILE_IO;
	6413	}
	6414	kret = ubc_create_upl_kernel(vp,
	6415	uio->uio_offset & ~PAGE_MASK_64,
	6416	PAGE_SIZE,
	6417	&upl,
	6418	&pl,
	6419	upl_flags,
	6420	VM_KERN_MEMORY_FILE);
	6421
	6422	if (kret != KERN_SUCCESS) {
	6423	return EINVAL;
	6424	}
	6425
	6426	if (!upl_valid_page(pl, 0)) {
	6427	/*
	6428	* issue a synchronous read to cluster_io
	6429	*/
	6430	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	6431	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6432	if (error) {
	6433	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	6434
	6435	return error;
	6436	}
	6437	did_read = 1;
	6438	}
	6439	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
	6440
	6441	/*
	6442	* NOTE: There is no prototype for the following in BSD. It, and the definitions
	6443	* of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	6444	* osfmk/ppc/mappings.h. They are not included here because there appears to be no
	6445	* way to do so without exporting them to kexts as well.
	6446	*/
	6447	if (flags & CL_READ) {
	6448	// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsnk); /* Copy physical to physical and flush the destination */
	6449	copypv(ubc_paddr, usr_paddr, xsize, 2 \| 1 \| 4); /* Copy physical to physical and flush the destination */
	6450	} else {
	6451	// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsrc); /* Copy physical to physical and flush the source */
	6452	copypv(usr_paddr, ubc_paddr, xsize, 2 \| 1 \| 8); /* Copy physical to physical and flush the source */
	6453	}
	6454	if (!(flags & CL_READ) \|\| (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
	6455	/*
	6456	* issue a synchronous write to cluster_io
	6457	*/
	6458	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	6459	bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6460	}
	6461	if (error == 0) {
	6462	uio_update(uio, (user_size_t)xsize);
	6463	}
	6464
	6465	if (did_read) {
	6466	abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	6467	} else {
	6468	abort_flags = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	6469	}
	6470
	6471	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
	6472
	6473	return error;
	6474	}
	6475
	6476	int
	6477	cluster_copy_upl_data(struct uio uio, upl_t upl, int upl_offset, int io_resid)
	6478	{
	6479	int pg_offset;
	6480	int pg_index;
	6481	int csize;
	6482	int segflg;
	6483	int retval = 0;
	6484	int xsize;
	6485	upl_page_info_t *pl;
	6486	int dirty_count;
	6487
	6488	xsize = *io_resid;
	6489
	6490	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	6491	(int)uio->uio_offset, upl_offset, xsize, 0, 0);
	6492
	6493	segflg = uio->uio_segflg;
	6494
	6495	switch (segflg) {
	6496	case UIO_USERSPACE32:
	6497	case UIO_USERISPACE32:
	6498	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	6499	break;
	6500
	6501	case UIO_USERSPACE:
	6502	case UIO_USERISPACE:
	6503	uio->uio_segflg = UIO_PHYS_USERSPACE;
	6504	break;
	6505
	6506	case UIO_USERSPACE64:
	6507	case UIO_USERISPACE64:
	6508	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	6509	break;
	6510
	6511	case UIO_SYSSPACE:
	6512	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	6513	break;
	6514	}
	6515	pl = ubc_upl_pageinfo(upl);
	6516
	6517	pg_index = upl_offset / PAGE_SIZE;
	6518	pg_offset = upl_offset & PAGE_MASK;
	6519	csize = min(PAGE_SIZE - pg_offset, xsize);
	6520
	6521	dirty_count = 0;
	6522	while (xsize && retval == 0) {
	6523	addr64_t paddr;
	6524
	6525	paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
	6526	if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
	6527	dirty_count++;
	6528	}
	6529
	6530	retval = uiomove64(paddr, csize, uio);
	6531
	6532	pg_index += 1;
	6533	pg_offset = 0;
	6534	xsize -= csize;
	6535	csize = min(PAGE_SIZE, xsize);
	6536	}
	6537	*io_resid = xsize;
	6538
	6539	uio->uio_segflg = segflg;
	6540
	6541	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	6542	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6543	(int)uio->uio_offset, xsize, retval, segflg, 0);
	6544
	6545	return retval;
	6546	}
	6547
	6548
	6549	int
	6550	cluster_copy_ubc_data(vnode_t vp, struct uio uio, int io_resid, int mark_dirty)
	6551	{
	6552	return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
	6553	}
	6554
	6555
	6556	static int
	6557	cluster_copy_ubc_data_internal(vnode_t vp, struct uio uio, int io_resid, int mark_dirty, int take_reference)
	6558	{
	6559	int segflg;
	6560	int io_size;
	6561	int xsize;
	6562	int start_offset;
	6563	int retval = 0;
	6564	memory_object_control_t control;
	6565
	6566	io_size = *io_resid;
	6567
	6568	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	6569	(int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
	6570
	6571	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	6572
	6573	if (control == MEMORY_OBJECT_CONTROL_NULL) {
	6574	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6575	(int)uio->uio_offset, io_size, retval, 3, 0);
	6576
	6577	return 0;
	6578	}
	6579	segflg = uio->uio_segflg;
	6580
	6581	switch (segflg) {
	6582	case UIO_USERSPACE32:
	6583	case UIO_USERISPACE32:
	6584	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	6585	break;
	6586
	6587	case UIO_USERSPACE64:
	6588	case UIO_USERISPACE64:
	6589	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	6590	break;
	6591
	6592	case UIO_USERSPACE:
	6593	case UIO_USERISPACE:
	6594	uio->uio_segflg = UIO_PHYS_USERSPACE;
	6595	break;
	6596
	6597	case UIO_SYSSPACE:
	6598	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	6599	break;
	6600	}
	6601
	6602	if ((io_size = *io_resid)) {
	6603	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	6604	xsize = (int)uio_resid(uio);
	6605
	6606	retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
	6607	start_offset, io_size, mark_dirty, take_reference);
	6608	xsize -= uio_resid(uio);
	6609	io_size -= xsize;
	6610	}
	6611	uio->uio_segflg = segflg;
	6612	*io_resid = io_size;
	6613
	6614	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6615	(int)uio->uio_offset, io_size, retval, 0x80000000 \| segflg, 0);
	6616
	6617	return retval;
	6618	}
	6619
	6620
	6621	int
	6622	is_file_clean(vnode_t vp, off_t filesize)
	6623	{
	6624	off_t f_offset;
	6625	int flags;
	6626	int total_dirty = 0;
	6627
	6628	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
	6629	if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
	6630	if (flags & UPL_POP_DIRTY) {
	6631	total_dirty++;
	6632	}
	6633	}
	6634	}
	6635	if (total_dirty) {
	6636	return EINVAL;
	6637	}
	6638
	6639	return 0;
	6640	}
	6641
	6642
	6643
	6644	/*
	6645	* Dirty region tracking/clustering mechanism.
	6646	*
	6647	* This code (vfs_drt_*) provides a mechanism for tracking and clustering
	6648	* dirty regions within a larger space (file). It is primarily intended to
	6649	* support clustering in large files with many dirty areas.
	6650	*
	6651	* The implementation assumes that the dirty regions are pages.
	6652	*
	6653	* To represent dirty pages within the file, we store bit vectors in a
	6654	* variable-size circular hash.
	6655	*/
	6656
	6657	/*
	6658	* Bitvector size. This determines the number of pages we group in a
	6659	* single hashtable entry. Each hashtable entry is aligned to this
	6660	* size within the file.
	6661	*/
	6662	#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
	6663
	6664	/*
	6665	* File offset handling.
	6666	*
	6667	* DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
	6668	* the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
	6669	*/
	6670	#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
	6671	#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
	6672
	6673	/*
	6674	* Hashtable address field handling.
	6675	*
	6676	* The low-order bits of the hashtable address are used to conserve
	6677	* space.
	6678	*
	6679	* DRT_HASH_COUNT_MASK must be large enough to store the range
	6680	* 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
	6681	* to indicate that the bucket is actually unoccupied.
	6682	*/
	6683	#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
	6684	#define DRT_HASH_SET_ADDRESS(scm, i, a) \
	6685	do { \
	6686	(scm)->scm_hashtable[(i)].dhe_control = \
	6687	((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) \| DRT_ALIGN_ADDRESS(a); \
	6688	} while (0)
	6689	#define DRT_HASH_COUNT_MASK 0x1ff
	6690	#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
	6691	#define DRT_HASH_SET_COUNT(scm, i, c) \
	6692	do { \
	6693	(scm)->scm_hashtable[(i)].dhe_control = \
	6694	((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) \| ((c) & DRT_HASH_COUNT_MASK); \
	6695	} while (0)
	6696	#define DRT_HASH_CLEAR(scm, i) \
	6697	do { \
	6698	(scm)->scm_hashtable[(i)].dhe_control = 0; \
	6699	} while (0)
	6700	#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
	6701	#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
	6702	#define DRT_HASH_COPY(oscm, oi, scm, i) \
	6703	do { \
	6704	(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
	6705	DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	6706	} while(0);
	6707
	6708
	6709	#if !defined(XNU_TARGET_OS_OSX)
	6710	/*
	6711	* Hash table moduli.
	6712	*
	6713	* Since the hashtable entry's size is dependent on the size of
	6714	* the bitvector, and since the hashtable size is constrained to
	6715	* both being prime and fitting within the desired allocation
	6716	* size, these values need to be manually determined.
	6717	*
	6718	* For a 64-page bitvector (DRT_BITVECTOR_PAGES at a 4K page size), the entry size is 16 bytes.
	6719	*
	6720	* The small hashtable allocation is 4096 bytes, so the modulus is 251.
	6721	* The large hashtable allocation is 32768 bytes, so the modulus is 2039.
	6722	* The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
	6723	*/
	6724
	6725	#define DRT_HASH_SMALL_MODULUS 251
	6726	#define DRT_HASH_LARGE_MODULUS 2039
	6727	#define DRT_HASH_XLARGE_MODULUS 8179
	6728
	6729	/*
	6730	* Physical memory required before the large and xlarge hash moduli are permitted.
	6731	*
	6732	* On small memory systems, the larger hash moduli can lead to physical
	6733	* memory starvation, so we avoid using them there.
	6734	*/
	6735	#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
	6736	#define DRT_HASH_XLARGE_MEMORY_REQUIRED (8 * 1024LL * 1024LL * 1024LL) /* 8GiB */
	6737
	6738	#define DRT_SMALL_ALLOCATION 4096 /* 4096 - 251 * 16 = 80 bytes spare */
	6739	#define DRT_LARGE_ALLOCATION 32768 /* 32768 - 2039 * 16 = 144 bytes spare */
	6740	#define DRT_XLARGE_ALLOCATION 131072 /* 131072 - 8179 * 16 = 208 bytes spare */
	6741
	6742	#else /* XNU_TARGET_OS_OSX */
	6743	/*
	6744	* Hash table moduli.
	6745	*
	6746	* Since the hashtable entry's size is dependent on the size of
	6747	* the bitvector, and since the hashtable size is constrained to
	6748	* both being prime and fitting within the desired allocation
	6749	* size, these values need to be manually determined.
	6750	*
	6751	* For a 64-page bitvector (DRT_BITVECTOR_PAGES at a 4K page size), the entry size is 16 bytes.
	6752	*
	6753	* The small hashtable allocation is 16384 bytes, so the modulus is 1019.
	6754	* The large hashtable allocation is 131072 bytes, so the modulus is 8179.
	6755	* The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
	6756	*/
	6757
	6758	#define DRT_HASH_SMALL_MODULUS 1019
	6759	#define DRT_HASH_LARGE_MODULUS 8179
	6760	#define DRT_HASH_XLARGE_MODULUS 32749
	6761
	6762	/*
	6763	* Physical memory required before the large and xlarge hash moduli are permitted.
	6764	*
	6765	* On small memory systems, the larger hash moduli can lead to physical
	6766	* memory starvation, so we avoid using them there.
	6767	*/
	6768	#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
	6769	#define DRT_HASH_XLARGE_MEMORY_REQUIRED (32 * 1024LL * 1024LL * 1024LL) /* 32GiB */
	6770
	6771	#define DRT_SMALL_ALLOCATION 16384 /* 16384 - 1019 * 16 = 80 bytes spare */
	6772	#define DRT_LARGE_ALLOCATION 131072 /* 131072 - 8179 * 16 = 208 bytes spare */
	6773	#define DRT_XLARGE_ALLOCATION 524288 /* 524288 - 32749 * 16 = 304 bytes spare */
	6774
	6775	#endif /* ! XNU_TARGET_OS_OSX */
	6776
	6777	/* * nothing below here has secret dependencies on DRT_BITVECTOR_PAGES * */
	6778
	6779	/*
	6780	* Hashtable entry.
	6781	*/
	6782	struct vfs_drt_hashentry {
	6783	u_int64_t dhe_control;
	6784	/*
	6785	* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
	6786	* DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
	6787	* Since PAGE_SIZE is only known at boot time,
	6788	* -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
	6789	* -declare dhe_bitvector array for largest possible length
	6790	*/
	6791	#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
	6792	u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
	6793	};
	6794
	6795	/*
	6796	* Hashtable bitvector handling.
	6797	*
	6798	* Bitvector fields are 32 bits long.
	6799	*/
	6800
	6801	#define DRT_HASH_SET_BIT(scm, i, bit) \
	6802	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] \|= (1 << ((bit) % 32))
	6803
	6804	#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
	6805	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
	6806
	6807	#define DRT_HASH_TEST_BIT(scm, i, bit) \
	6808	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
	6809
	6810	#define DRT_BITVECTOR_CLEAR(scm, i) \
	6811	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	6812
	6813	#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
	6814	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
	6815	&(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
	6816	(MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	6817
	6818	/*
	6819	* Dirty Region Tracking structure.
	6820	*
	6821	* The hashtable is allocated entirely inside the DRT structure.
	6822	*
	6823	* The hash is a simple circular prime modulus arrangement, the structure
	6824	* is resized from small to large if it overflows.
	6825	*/
	6826
	6827	struct vfs_drt_clustermap {
	6828	u_int32_t scm_magic; /* sanity/detection */
	6829	#define DRT_SCM_MAGIC 0x12020003
	6830	u_int32_t scm_modulus; /* current ring size */
	6831	u_int32_t scm_buckets; /* number of occupied buckets */
	6832	u_int32_t scm_lastclean; /* last entry we cleaned */
	6833	u_int32_t scm_iskips; /* number of slot skips */
	6834
	6835	struct vfs_drt_hashentry scm_hashtable[0];
	6836	};
	6837
	6838
	6839	#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus) /* bucket index for addr: addr modulo the current ring size */
	6840	#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus) /* index following addr, wrapping to 0 at scm_modulus */
	6841
	6842	/*
	6843	* Debugging codes and arguments.
	6844	*
	* Each code is an FSDBG_CODE in the DBG_FSRW class; the trailing comment on
	* a code lists the arguments traced with it.
	* NOTE(review): the four standalone comments after DRT_DEBUG_MARK appear to
	* list alternative argument sets for that code - confirm against
	* vfs_drt_do_mark_pages.
	*/
	6845	#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
	6846	#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
	6847	#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
	6848	#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
	6849	#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
	6850	* dirty */
	6851	/* 0, setcount */
	6852	/* 1 (clean, no map) */
	6853	/* 2 (map alloc fail) */
	6854	/* 3, resid (partial) */
	6855	#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
	6856	#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
	6857	* lastclean, iskips */
	6858
	6859
	6860	static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
	6861	static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
	6862	static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	6863	u_int64_t offset, int *indexp);
	6864	static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	6865	u_int64_t offset,
	6866	int *indexp,
	6867	int recursed);
	6868	static kern_return_t vfs_drt_do_mark_pages(
	6869	void **cmapp,
	6870	u_int64_t offset,
	6871	u_int length,
	6872	u_int *setcountp,
	6873	int dirty);
	6874	static void vfs_drt_trace(
	6875	struct vfs_drt_clustermap *cmap,
	6876	int code,
	6877	int arg1,
	6878	int arg2,
	6879	int arg3,
	6880	int arg4);
	6881
	6882
	6883	/*
	6884	* Allocate and initialise a sparse cluster map.
	6885	*
	6886	* Will allocate a new map, resize or compact an existing map.
	6887	*
	6888	* XXX we should probably have at least one intermediate map size,
	6889	* as the 1:16 ratio seems a bit drastic.
	6890	*/
	6891	static kern_return_t
	6892	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
	6893	{
	6894	struct vfs_drt_clustermap cmap = NULL, ocmap = NULL;
	6895	kern_return_t kret = KERN_SUCCESS;
	6896	u_int64_t offset = 0;
	6897	u_int32_t i = 0;
	6898	int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
	6899
	6900	ocmap = NULL;
	6901	if (cmapp != NULL) {
	6902	ocmap = *cmapp;
	6903	}
	6904
	6905	/*
	6906	* Decide on the size of the new map.
	6907	*/
	6908	if (ocmap == NULL) {
	6909	modulus_size = DRT_HASH_SMALL_MODULUS;
	6910	map_size = DRT_SMALL_ALLOCATION;
	6911	} else {
	6912	/* count the number of active buckets in the old map */
	6913	active_buckets = 0;
	6914	for (i = 0; i < ocmap->scm_modulus; i++) {
	6915	if (!DRT_HASH_VACANT(ocmap, i) &&
	6916	(DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
	6917	active_buckets++;
	6918	}
	6919	}
	6920	/*
	6921	* If we're currently using the small allocation, check to
	6922	* see whether we should grow to the large one.
	6923	*/
	6924	if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
	6925	/*
	6926	* If the ring is nearly full and we are allowed to
	6927	* use the large modulus, upgrade.
	6928	*/
	6929	if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
	6930	(max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
	6931	modulus_size = DRT_HASH_LARGE_MODULUS;
	6932	map_size = DRT_LARGE_ALLOCATION;
	6933	} else {
	6934	modulus_size = DRT_HASH_SMALL_MODULUS;
	6935	map_size = DRT_SMALL_ALLOCATION;
	6936	}
	6937	} else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
	6938	if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
	6939	(max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
	6940	modulus_size = DRT_HASH_XLARGE_MODULUS;
	6941	map_size = DRT_XLARGE_ALLOCATION;
	6942	} else {
	6943	/*
	6944	* If the ring is completely full and we can't
	6945	* expand, there's nothing useful for us to do.
	6946	* Behave as though we had compacted into the new
	6947	* array and return.
	6948	*/
	6949	return KERN_SUCCESS;
	6950	}
	6951	} else {
	6952	/* already using the xlarge modulus */
	6953	modulus_size = DRT_HASH_XLARGE_MODULUS;
	6954	map_size = DRT_XLARGE_ALLOCATION;
	6955
	6956	/*
	6957	* If the ring is completely full, there's
	6958	* nothing useful for us to do. Behave as
	6959	* though we had compacted into the new
	6960	* array and return.
	6961	*/
	6962	if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
	6963	return KERN_SUCCESS;
	6964	}
	6965	}
	6966	}
	6967
	6968	/*
	6969	* Allocate and initialise the new map.
	6970	*/
	6971
	6972	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size, VM_KERN_MEMORY_FILE);
	6973	if (kret != KERN_SUCCESS) {
	6974	return kret;
	6975	}
	6976	cmap->scm_magic = DRT_SCM_MAGIC;
	6977	cmap->scm_modulus = modulus_size;
	6978	cmap->scm_buckets = 0;
	6979	cmap->scm_lastclean = 0;
	6980	cmap->scm_iskips = 0;
	6981	for (i = 0; i < cmap->scm_modulus; i++) {
	6982	DRT_HASH_CLEAR(cmap, i);
	6983	DRT_HASH_VACATE(cmap, i);
	6984	DRT_BITVECTOR_CLEAR(cmap, i);
	6985	}
	6986
	6987	/*
	6988	* If there's an old map, re-hash entries from it into the new map.
	6989	*/
	6990	copycount = 0;
	6991	if (ocmap != NULL) {
	6992	for (i = 0; i < ocmap->scm_modulus; i++) {
	6993	/* skip empty buckets */
	6994	if (DRT_HASH_VACANT(ocmap, i) \|\|
	6995	(DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
	6996	continue;
	6997	}
	6998	/* get new index */
	6999	offset = DRT_HASH_GET_ADDRESS(ocmap, i);
	7000	kret = vfs_drt_get_index(&cmap, offset, &index, 1);
	7001	if (kret != KERN_SUCCESS) {
	7002	/* XXX need to bail out gracefully here */
	7003	panic("vfs_drt: new cluster map mysteriously too small");
	7004	index = 0;
	7005	}
	7006	/* copy */
	7007	DRT_HASH_COPY(ocmap, i, cmap, index);
	7008	copycount++;
	7009	}
	7010	}
	7011
	7012	/* log what we've done */
	7013	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
	7014
	7015	/*
	7016	* It's important to ensure that *cmapp always points to
	7017	* a valid map, so we must overwrite it before freeing
	7018	* the old map.
	7019	*/
	7020	*cmapp = cmap;
	7021	if (ocmap != NULL) {
	7022	/* emit stats into trace buffer */
	7023	vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
	7024	ocmap->scm_modulus,
	7025	ocmap->scm_buckets,
	7026	ocmap->scm_lastclean,
	7027	ocmap->scm_iskips);
	7028
	7029	vfs_drt_free_map(ocmap);
	7030	}
	7031	return KERN_SUCCESS;
	7032	}
	7033
	7034
	7035	/*
	7036	* Free a sparse cluster map.
	7037	*/
	7038	static kern_return_t
	7039	vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
	7040	{
	7041	vm_size_t map_size = 0;
	7042
	7043	if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
	7044	map_size = DRT_SMALL_ALLOCATION;
	7045	} else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
	7046	map_size = DRT_LARGE_ALLOCATION;
	7047	} else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
	7048	map_size = DRT_XLARGE_ALLOCATION;
	7049	} else {
	7050	panic("vfs_drt_free_map: Invalid modulus %d\n", cmap->scm_modulus);
	7051	}
	7052
	7053	kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
	7054	return KERN_SUCCESS;
	7055	}
	7056
	7057
	7058	/*
	7059	* Find the hashtable slot currently occupied by an entry for the supplied offset.
	7060	*/
	7061	static kern_return_t
	7062	vfs_drt_search_index(struct vfs_drt_clustermap cmap, u_int64_t offset, int indexp)
	7063	{
	7064	int index;
	7065	u_int32_t i;
	7066
	7067	offset = DRT_ALIGN_ADDRESS(offset);
	7068	index = DRT_HASH(cmap, offset);
	7069
	7070	/* traverse the hashtable */
	7071	for (i = 0; i < cmap->scm_modulus; i++) {
	7072	/*
	7073	* If the slot is vacant, we can stop.
	7074	*/
	7075	if (DRT_HASH_VACANT(cmap, index)) {
	7076	break;
	7077	}
	7078
	7079	/*
	7080	* If the address matches our offset, we have success.
	7081	*/
	7082	if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
	7083	*indexp = index;
	7084	return KERN_SUCCESS;
	7085	}
	7086
	7087	/*
	7088	* Move to the next slot, try again.
	7089	*/
	7090	index = DRT_HASH_NEXT(cmap, index);
	7091	}
	7092	/*
	7093	* It's not there.
	7094	*/
	7095	return KERN_FAILURE;
	7096	}
	7097
	7098	/*
	7099	* Find the hashtable slot for the supplied offset. If we haven't allocated
	7100	* one yet, allocate one and populate the address field. Note that it will
	7101	* not have a nonzero page count and thus will still technically be free, so
	7102	* in the case where we are called to clean pages, the slot will remain free.
	7103	*/
	7104	static kern_return_t
	7105	vfs_drt_get_index(struct vfs_drt_clustermap *cmapp, u_int64_t offset, int indexp, int recursed)
	7106	{
	7107	struct vfs_drt_clustermap *cmap;
	7108	kern_return_t kret;
	7109	u_int32_t index;
	7110	u_int32_t i;
	7111
	7112	cmap = *cmapp;
	7113
	7114	/* look for an existing entry */
	7115	kret = vfs_drt_search_index(cmap, offset, indexp);
	7116	if (kret == KERN_SUCCESS) {
	7117	return kret;
	7118	}
	7119
	7120	/* need to allocate an entry */
	7121	offset = DRT_ALIGN_ADDRESS(offset);
	7122	index = DRT_HASH(cmap, offset);
	7123
	7124	/* scan from the index forwards looking for a vacant slot */
	7125	for (i = 0; i < cmap->scm_modulus; i++) {
	7126	/* slot vacant? */
	7127	if (DRT_HASH_VACANT(cmap, index) \|\| DRT_HASH_GET_COUNT(cmap, index) == 0) {
	7128	cmap->scm_buckets++;
	7129	if (index < cmap->scm_lastclean) {
	7130	cmap->scm_lastclean = index;
	7131	}
	7132	DRT_HASH_SET_ADDRESS(cmap, index, offset);
	7133	DRT_HASH_SET_COUNT(cmap, index, 0);
	7134	DRT_BITVECTOR_CLEAR(cmap, index);
	7135	*indexp = index;
	7136	vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
	7137	return KERN_SUCCESS;
	7138	}
	7139	cmap->scm_iskips += i;
	7140	index = DRT_HASH_NEXT(cmap, index);
	7141	}
	7142
	7143	/*
	7144	* We haven't found a vacant slot, so the map is full. If we're not
	7145	* already recursed, try reallocating/compacting it.
	7146	*/
	7147	if (recursed) {
	7148	return KERN_FAILURE;
	7149	}
	7150	kret = vfs_drt_alloc_map(cmapp);
	7151	if (kret == KERN_SUCCESS) {
	7152	/* now try to insert again */
	7153	kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	7154	}
	7155	return kret;
	7156	}
	7157
	7158	/*
	7159	* Implementation of set dirty/clean.
	7160	*
	7161	* In the 'clean' case, not finding a map is OK.
	7162	*/
	7163	static kern_return_t
	7164	vfs_drt_do_mark_pages(
	7165	void **private,
	7166	u_int64_t offset,
	7167	u_int length,
	7168	u_int *setcountp,
	7169	int dirty)
	7170	{
	7171	struct vfs_drt_clustermap cmap, *cmapp;
	7172	kern_return_t kret;
	7173	int i, index, pgoff, pgcount, setcount, ecount;
	7174
	7175	cmapp = (struct vfs_drt_clustermap **)private;
	7176	cmap = *cmapp;
	7177
	7178	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
	7179
	7180	if (setcountp != NULL) {
	7181	*setcountp = 0;
	7182	}
	7183
	7184	/* allocate a cluster map if we don't already have one */
	7185	if (cmap == NULL) {
	7186	/* no cluster map, nothing to clean */
	7187	if (!dirty) {
	7188	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 1, 0, 0, 0);
	7189	return KERN_SUCCESS;
	7190	}
	7191	kret = vfs_drt_alloc_map(cmapp);
	7192	if (kret != KERN_SUCCESS) {
	7193	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 2, 0, 0, 0);
	7194	return kret;
	7195	}
	7196	}
	7197	setcount = 0;
	7198
	7199	/*
	7200	* Iterate over the length of the region.
	7201	*/
	7202	while (length > 0) {
	7203	/*
	7204	* Get the hashtable index for this offset.
	7205	*
	7206	* XXX this will add blank entries if we are clearing a range
	7207	* that hasn't been dirtied.
	7208	*/
	7209	kret = vfs_drt_get_index(cmapp, offset, &index, 0);
	7210	cmap = cmapp; / may have changed! */
	7211	/* this may be a partial-success return */
	7212	if (kret != KERN_SUCCESS) {
	7213	if (setcountp != NULL) {
	7214	*setcountp = setcount;
	7215	}
	7216	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 3, (int)length, 0, 0);
	7217
	7218	return kret;
	7219	}
	7220
	7221	/*
	7222	* Work out how many pages we're modifying in this
	7223	* hashtable entry.
	7224	*/
	7225	pgoff = (int)((offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE);
	7226	pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
	7227
	7228	/*
	7229	* Iterate over pages, dirty/clearing as we go.
	7230	*/
	7231	ecount = DRT_HASH_GET_COUNT(cmap, index);
	7232	for (i = 0; i < pgcount; i++) {
	7233	if (dirty) {
	7234	if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	7235	if (ecount >= DRT_BITVECTOR_PAGES) {
	7236	panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
	7237	}
	7238	DRT_HASH_SET_BIT(cmap, index, pgoff + i);
	7239	ecount++;
	7240	setcount++;
	7241	}
	7242	} else {
	7243	if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	7244	if (ecount <= 0) {
	7245	panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
	7246	}
	7247	assert(ecount > 0);
	7248	DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
	7249	ecount--;
	7250	setcount++;
	7251	}
	7252	}
	7253	}
	7254	DRT_HASH_SET_COUNT(cmap, index, ecount);
	7255
	7256	offset += pgcount * PAGE_SIZE;
	7257	length -= pgcount * PAGE_SIZE;
	7258	}
	7259	if (setcountp != NULL) {
	7260	*setcountp = setcount;
	7261	}
	7262
	7263	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 0, setcount, 0, 0);
	7264
	7265	return KERN_SUCCESS;
	7266	}
	7267
	7268	/*
	7269	* Mark a set of pages as dirty/clean.
	7270	*
	7271	* This is a public interface.
	7272	*
	7273	* cmapp
	7274	* Pointer to storage suitable for holding a pointer. Note that
	7275	* this must either be NULL or a value set by this function.
	7276	*
	7277	* size
	7278	* Current file size in bytes.
	7279	*
	7280	* offset
	7281	* Offset of the first page to be marked as dirty, in bytes. Must be
	7282	* page-aligned.
	7283	*
	7284	* length
	7285	* Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
	7286	*
	7287	* setcountp
	7288	* Number of pages newly marked dirty by this call (optional).
	7289	*
	7290	* Returns KERN_SUCCESS if all the pages were successfully marked.
	7291	*/
	7292	static kern_return_t
	7293	vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, u_int setcountp)
	7294	{
	7295	/* XXX size unused, drop from interface */
	7296	return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
	7297	}
	7298
	7299	#if 0
	7300	static kern_return_t
	7301	vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
	7302	{
	7303	return vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
	7304	}
	7305	#endif
	7306
	7307	/*
	7308	* Get a cluster of dirty pages.
	7309	*
	7310	* This is a public interface.
	7311	*
	7312	* cmapp
	7313	* Pointer to storage managed by drt_mark_pages. Note that this must
	7314	* be NULL or a value set by drt_mark_pages.
	7315	*
	7316	* offsetp
	7317	* Returns the byte offset into the file of the first page in the cluster.
	7318	*
	7319	* lengthp
	7320	* Returns the length in bytes of the cluster of dirty pages.
	7321	*
	7322	* Returns success if a cluster was found. If KERN_FAILURE is returned, there
	7323	* are no dirty pages meeting the minmum size criteria. Private storage will
	7324	* be released if there are no more dirty pages left in the map
	7325	*
	7326	*/
	7327	static kern_return_t
	7328	vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp)
	7329	{
	7330	struct vfs_drt_clustermap *cmap;
	7331	u_int64_t offset;
	7332	u_int length;
	7333	u_int32_t j;
	7334	int index, i, fs, ls;
	7335
	7336	/* sanity */
	7337	if ((cmapp == NULL) \|\| (*cmapp == NULL)) {
	7338	return KERN_FAILURE;
	7339	}
	7340	cmap = *cmapp;
	7341
	7342	/* walk the hashtable */
	7343	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
	7344	index = DRT_HASH(cmap, offset);
	7345
	7346	if (DRT_HASH_VACANT(cmap, index) \|\| (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
	7347	continue;
	7348	}
	7349
	7350	/* scan the bitfield for a string of bits */
	7351	fs = -1;
	7352
	7353	for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
	7354	if (DRT_HASH_TEST_BIT(cmap, index, i)) {
	7355	fs = i;
	7356	break;
	7357	}
	7358	}
	7359	if (fs == -1) {
	7360	/* didn't find any bits set */
	7361	panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
	7362	cmap, index, DRT_HASH_GET_COUNT(cmap, index));
	7363	}
	7364	for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
	7365	if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
	7366	break;
	7367	}
	7368	}
	7369
	7370	/* compute offset and length, mark pages clean */
	7371	offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
	7372	length = ls * PAGE_SIZE;
	7373	vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
	7374	cmap->scm_lastclean = index;
	7375
	7376	/* return successful */
	7377	*offsetp = (off_t)offset;
	7378	*lengthp = length;
	7379
	7380	vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
	7381	return KERN_SUCCESS;
	7382	}
	7383	/*
	7384	* We didn't find anything... hashtable is empty
	7385	* emit stats into trace buffer and
	7386	* then free it
	7387	*/
	7388	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	7389	cmap->scm_modulus,
	7390	cmap->scm_buckets,
	7391	cmap->scm_lastclean,
	7392	cmap->scm_iskips);
	7393
	7394	vfs_drt_free_map(cmap);
	7395	*cmapp = NULL;
	7396
	7397	return KERN_FAILURE;
	7398	}
	7399
	7400
	7401	static kern_return_t
	7402	vfs_drt_control(void **cmapp, int op_type)
	7403	{
	7404	struct vfs_drt_clustermap *cmap;
	7405
	7406	/* sanity */
	7407	if ((cmapp == NULL) \|\| (*cmapp == NULL)) {
	7408	return KERN_FAILURE;
	7409	}
	7410	cmap = *cmapp;
	7411
	7412	switch (op_type) {
	7413	case 0:
	7414	/* emit stats into trace buffer */
	7415	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	7416	cmap->scm_modulus,
	7417	cmap->scm_buckets,
	7418	cmap->scm_lastclean,
	7419	cmap->scm_iskips);
	7420
	7421	vfs_drt_free_map(cmap);
	7422	*cmapp = NULL;
	7423	break;
	7424
	7425	case 1:
	7426	cmap->scm_lastclean = 0;
	7427	break;
	7428	}
	7429	return KERN_SUCCESS;
	7430	}
	7431
	7432
	7433
	7434	/*
	7435	* Emit a summary of the state of the clustermap into the trace buffer
	7436	* along with some caller-provided data.
	7437	*/
	7438	#if KDEBUG
	7439	static void
	7440	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
	7441	{
	7442	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
	7443	}
	7444	#else
	7445	static void
	7446	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	7447	__unused int arg1, __unused int arg2, __unused int arg3,
	7448	__unused int arg4)
	7449	{
	7450	}
	7451	#endif
	7452
	7453	#if 0
	7454	/*
	7455	* Perform basic sanity check on the hash entry summary count
	7456	* vs. the actual bits set in the entry.
	7457	*/
	7458	static void
	7459	vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
	7460	{
	7461	int index, i;
	7462	int bits_on;
	7463
	7464	for (index = 0; index < cmap->scm_modulus; index++) {
	7465	if (DRT_HASH_VACANT(cmap, index)) {
	7466	continue;
	7467	}
	7468
	7469	for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
	7470	if (DRT_HASH_TEST_BIT(cmap, index, i)) {
	7471	bits_on++;
	7472	}
	7473	}
	7474	if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) {
	7475	panic("bits_on = %d, index = %d\n", bits_on, index);
	7476	}
	7477	}
	7478	}
	7479	#endif
	7480
	7481	/*
	7482	* Internal interface only.
	7483	*/
	7484	static kern_return_t
	7485	vfs_get_scmap_push_behavior_internal(void *cmapp, int push_flag)
	7486	{
	7487	struct vfs_drt_clustermap *cmap;
	7488
	7489	/* sanity */
	7490	if ((cmapp == NULL) \|\| (*cmapp == NULL) \|\| (push_flag == NULL)) {
	7491	return KERN_FAILURE;
	7492	}
	7493	cmap = *cmapp;
	7494
	7495	if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
	7496	/*
	7497	* If we have a full xlarge sparse cluster,
	7498	* we push it out all at once so the cluster
	7499	* map can be available to absorb more I/Os.
	7500	* This is done on large memory configs so
	7501	* the small I/Os don't interfere with the
	7502	* pro workloads.
	7503	*/
	7504	*push_flag = PUSH_ALL;
	7505	}
	7506	return KERN_SUCCESS;
	7507	}