git.saurik.com Git - apple/xnu.git/blame_incremental

0 / 7131 ( 0%)

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2014 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1993
	31	* The Regents of the University of California. All rights reserved.
	32	*
	33	* Redistribution and use in source and binary forms, with or without
	34	* modification, are permitted provided that the following conditions
	35	* are met:
	36	* 1. Redistributions of source code must retain the above copyright
	37	* notice, this list of conditions and the following disclaimer.
	38	* 2. Redistributions in binary form must reproduce the above copyright
	39	* notice, this list of conditions and the following disclaimer in the
	40	* documentation and/or other materials provided with the distribution.
	41	* 3. All advertising materials mentioning features or use of this software
	42	* must display the following acknowledgement:
	43	* This product includes software developed by the University of
	44	* California, Berkeley and its contributors.
	45	* 4. Neither the name of the University nor the names of its contributors
	46	* may be used to endorse or promote products derived from this software
	47	* without specific prior written permission.
	48	*
	49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	59	* SUCH DAMAGE.
	60	*
	61	* @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
	62	*/
	63
	64	#include <sys/param.h>
	65	#include <sys/proc_internal.h>
	66	#include <sys/buf_internal.h>
	67	#include <sys/mount_internal.h>
	68	#include <sys/vnode_internal.h>
	69	#include <sys/trace.h>
	70	#include <sys/malloc.h>
	71	#include <sys/time.h>
	72	#include <sys/kernel.h>
	73	#include <sys/resourcevar.h>
	74	#include <miscfs/specfs/specdev.h>
	75	#include <sys/uio_internal.h>
	76	#include <libkern/libkern.h>
	77	#include <machine/machine_routines.h>
	78
	79	#include <sys/ubc_internal.h>
	80	#include <vm/vnode_pager.h>
	81
	82	#include <mach/mach_types.h>
	83	#include <mach/memory_object_types.h>
	84	#include <mach/vm_map.h>
	85	#include <mach/upl.h>
	86	#include <kern/task.h>
	87	#include <kern/policy_internal.h>
	88
	89	#include <vm/vm_kern.h>
	90	#include <vm/vm_map.h>
	91	#include <vm/vm_pageout.h>
	92	#include <vm/vm_fault.h>
	93
	94	#include <sys/kdebug.h>
	95	#include <libkern/OSAtomic.h>
	96
	97	#include <sys/sdt.h>
	98
	99	#include <stdbool.h>
	100
	101	#include <vfs/vfs_disk_conditioner.h>
	102
	103	#if 0
	104	#undef KERNEL_DEBUG
	105	#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
	106	#endif
	107
	108
	109	#define CL_READ 0x01
	110	#define CL_WRITE 0x02
	111	#define CL_ASYNC 0x04
	112	#define CL_COMMIT 0x08
	113	#define CL_PAGEOUT 0x10
	114	#define CL_AGE 0x20
	115	#define CL_NOZERO 0x40
	116	#define CL_PAGEIN 0x80
	117	#define CL_DEV_MEMORY 0x100
	118	#define CL_PRESERVE 0x200
	119	#define CL_THROTTLE 0x400
	120	#define CL_KEEPCACHED 0x800
	121	#define CL_DIRECT_IO 0x1000
	122	#define CL_PASSIVE 0x2000
	123	#define CL_IOSTREAMING 0x4000
	124	#define CL_CLOSE 0x8000
	125	#define CL_ENCRYPTED 0x10000
	126	#define CL_RAW_ENCRYPTED 0x20000
	127	#define CL_NOCACHE 0x40000
	128
	129	#define MAX_VECTOR_UPL_ELEMENTS 8
	130	#define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
	131
	132	#define CLUSTER_IO_WAITING ((buf_t)1)
	133
	134	extern upl_t vector_upl_create(vm_offset_t);
	135	extern boolean_t vector_upl_is_valid(upl_t);
	136	extern boolean_t vector_upl_set_subupl(upl_t,upl_t, u_int32_t);
	137	extern void vector_upl_set_pagelist(upl_t);
	138	extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
	139
	140	struct clios {
	141	lck_mtx_t io_mtxp;
	142	u_int io_completed; /* amount of io that has currently completed */
	143	u_int io_issued; /* amount of io that was successfully issued */
	144	int io_error; /* error code of first error encountered */
	145	int io_wanted; /* someone is sleeping waiting for a change in state */
	146	};
	147
	148	struct cl_direct_read_lock {
	149	LIST_ENTRY(cl_direct_read_lock) chain;
	150	int32_t ref_count;
	151	vnode_t vp;
	152	lck_rw_t rw_lock;
	153	};
	154
	155	#define CL_DIRECT_READ_LOCK_BUCKETS 61
	156
	157	static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
	158	cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
	159
	160	static lck_spin_t cl_direct_read_spin_lock;
	161
	162	static lck_grp_t *cl_mtx_grp;
	163	static lck_attr_t *cl_mtx_attr;
	164	static lck_grp_attr_t *cl_mtx_grp_attr;
	165	static lck_mtx_t *cl_transaction_mtxp;
	166
	167	#define IO_UNKNOWN 0
	168	#define IO_DIRECT 1
	169	#define IO_CONTIG 2
	170	#define IO_COPY 3
	171
	172	#define PUSH_DELAY 0x01
	173	#define PUSH_ALL 0x02
	174	#define PUSH_SYNC 0x04
	175
	176
	177	static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
	178	static void cluster_wait_IO(buf_t cbp_head, int async);
	179	static void cluster_complete_transaction(buf_t cbp_head, void callback_arg, int *retval, int flags, int needwait);
	180
	181	static int cluster_io_type(struct uio uio, int io_type, u_int32_t *io_length, u_int32_t min_length);
	182
	183	static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	184	int flags, buf_t real_bp, struct clios iostate, int ()(buf_t, void ), void callback_arg);
	185	static int cluster_iodone(buf_t bp, void *callback_arg);
	186	static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
	187	static int cluster_is_throttled(vnode_t vp);
	188
	189	static void cluster_iostate_wait(struct clios iostate, u_int target, const char wait_name);
	190
	191	static void cluster_syncup(vnode_t vp, off_t newEOF, int ()(buf_t, void ), void *callback_arg, int flags);
	192
	193	static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
	194	static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio uio, int io_resid, int mark_dirty, int take_reference);
	195
	196	static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
	197	int ()(buf_t, void ), void *callback_arg);
	198	static int cluster_read_direct(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	199	int flags, int ()(buf_t, void ), void *callback_arg);
	200	static int cluster_read_contig(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	201	int ()(buf_t, void ), void *callback_arg, int flags);
	202
	203	static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
	204	off_t headOff, off_t tailOff, int flags, int ()(buf_t, void ), void *callback_arg);
	205	static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
	206	int write_type, u_int32_t write_length, int flags, int ()(buf_t, void ), void *callback_arg);
	207	static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
	208	int write_type, u_int32_t write_length, int ()(buf_t, void ), void *callback_arg, int bflag);
	209
	210	static void cluster_update_state_internal(vnode_t vp, struct cl_extent cl, int flags, boolean_t defer_writes, boolean_t first_pass,
	211	off_t write_off, int write_cnt, off_t newEOF, int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated);
	212
	213	static int cluster_align_phys_io(vnode_t vp, struct uio uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int ()(buf_t, void ), void callback_arg);
	214
	215	static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (callback)(buf_t, void ), void *callback_arg, int bflag);
	216	static void cluster_read_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead ra,
	217	int (callback)(buf_t, void ), void *callback_arg, int bflag);
	218
	219	static int cluster_push_now(vnode_t vp, struct cl_extent , off_t EOF, int flags, int ()(buf_t, void ), void callback_arg, boolean_t vm_ioitiated);
	220
	221	static int cluster_try_push(struct cl_writebehind , vnode_t vp, off_t EOF, int push_flag, int flags, int ()(buf_t, void *),
	222	void callback_arg, int err, boolean_t vm_initiated);
	223
	224	static int sparse_cluster_switch(struct cl_writebehind , vnode_t vp, off_t EOF, int ()(buf_t, void ), void callback_arg, boolean_t vm_initiated);
	225	static int sparse_cluster_push(struct cl_writebehind , void *cmapp, vnode_t vp, off_t EOF, int push_flag,
	226	int io_flags, int ()(buf_t, void ), void *callback_arg, boolean_t vm_initiated);
	227	static int sparse_cluster_add(struct cl_writebehind , void cmapp, vnode_t vp, struct cl_extent , off_t EOF,
	228	int ()(buf_t, void ), void *callback_arg, boolean_t vm_initiated);
	229
	230	static kern_return_t vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, u_int setcountp);
	231	static kern_return_t vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp);
	232	static kern_return_t vfs_drt_control(void **cmapp, int op_type);
	233
	234
	235	/*
	236	* For throttled IO to check whether
	237	* a block is cached by the boot cache
	238	* and thus it can avoid delaying the IO.
	239	*
	240	* bootcache_contains_block is initially
	241	* NULL. The BootCache will set it while
	242	* the cache is active and clear it when
	243	* the cache is jettisoned.
	244	*
	245	* Returns 0 if the block is not
	246	* contained in the cache, 1 if it is
	247	* contained.
	248	*
	249	* The function pointer remains valid
	250	* after the cache has been evicted even
	251	* if bootcache_contains_block has been
	252	* cleared.
	253	*
	254	* See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
	255	*/
	256	int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
	257
	258
	259	/*
	260	* limit the internal I/O size so that we
	261	* can represent it in a 32 bit int
	262	*/
	263	#define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
	264	#define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
	265	#define MAX_VECTS 16
	266	/*
	267	* The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
	268	* allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
	269	* we have not historically allowed the write to bypass the UBC.
	270	*/
	271	#define MIN_DIRECT_WRITE_SIZE (16384)
	272
	273	#define WRITE_THROTTLE 6
	274	#define WRITE_THROTTLE_SSD 2
	275	#define WRITE_BEHIND 1
	276	#define WRITE_BEHIND_SSD 1
	277
	278	#if CONFIG_EMBEDDED
	279	#define PREFETCH 1
	280	#define PREFETCH_SSD 1
	281	uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a speculative read-ahead */
	282	uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead */
	283	#else
	284	#define PREFETCH 3
	285	#define PREFETCH_SSD 2
	286	uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a speculative read-ahead */
	287	uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a speculative read-ahead on SSDs */
	288	#endif
	289
	290
	291	#define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * (base))
	292	#define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE))
	293	#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
	294
	295	int speculative_reads_disabled = 0;
	296
	297	/*
	298	* throttle the number of async writes that
	299	* can be outstanding on a single vnode
	300	* before we issue a synchronous write
	301	*/
	302	#define THROTTLE_MAXCNT 0
	303
	304	uint32_t throttle_max_iosize = (128 * 1024);
	305
	306	#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
	307
	308	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
	309
	310
	311	void
	312	cluster_init(void) {
	313	/*
	314	* allocate lock group attribute and group
	315	*/
	316	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	317	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
	318
	319	/*
	320	* allocate the lock attribute
	321	*/
	322	cl_mtx_attr = lck_attr_alloc_init();
	323
	324	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
	325
	326	if (cl_transaction_mtxp == NULL)
	327	panic("cluster_init: failed to allocate cl_transaction_mtxp");
	328
	329	lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);
	330
	331	for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i)
	332	LIST_INIT(&cl_direct_read_locks[i]);
	333	}
	334
	335
	336	uint32_t
	337	cluster_max_io_size(mount_t mp, int type)
	338	{
	339	uint32_t max_io_size;
	340	uint32_t segcnt;
	341	uint32_t maxcnt;
	342
	343	switch(type) {
	344
	345	case CL_READ:
	346	segcnt = mp->mnt_segreadcnt;
	347	maxcnt = mp->mnt_maxreadcnt;
	348	break;
	349	case CL_WRITE:
	350	segcnt = mp->mnt_segwritecnt;
	351	maxcnt = mp->mnt_maxwritecnt;
	352	break;
	353	default:
	354	segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
	355	maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
	356	break;
	357	}
	358	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
	359	/*
	360	* don't allow a size beyond the max UPL size we can create
	361	*/
	362	segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	363	}
	364	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
	365
	366	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
	367	/*
	368	* don't allow a size smaller than the old fixed limit
	369	*/
	370	max_io_size = MAX_UPL_TRANSFER_BYTES;
	371	} else {
	372	/*
	373	* make sure the size specified is a multiple of PAGE_SIZE
	374	*/
	375	max_io_size &= ~PAGE_MASK;
	376	}
	377	return (max_io_size);
	378	}
	379
	380
	381
	382
	383	#define CLW_ALLOCATE 0x01
	384	#define CLW_RETURNLOCKED 0x02
	385	#define CLW_IONOCACHE 0x04
	386	#define CLW_IOPASSIVE 0x08
	387
	388	/*
	389	* if the read ahead context doesn't yet exist,
	390	* allocate and initialize it...
	391	* the vnode lock serializes multiple callers
	392	* during the actual assignment... first one
	393	* to grab the lock wins... the other callers
	394	* will release the now unnecessary storage
	395	*
	396	* once the context is present, try to grab (but don't block on)
	397	* the lock associated with it... if someone
	398	* else currently owns it, than the read
	399	* will run without read-ahead. this allows
	400	* multiple readers to run in parallel and
	401	* since there's only 1 read ahead context,
	402	* there's no real loss in only allowing 1
	403	* reader to have read-ahead enabled.
	404	*/
	405	static struct cl_readahead *
	406	cluster_get_rap(vnode_t vp)
	407	{
	408	struct ubc_info *ubc;
	409	struct cl_readahead *rap;
	410
	411	ubc = vp->v_ubcinfo;
	412
	413	if ((rap = ubc->cl_rahead) == NULL) {
	414	MALLOC_ZONE(rap, struct cl_readahead , sizeof rap, M_CLRDAHEAD, M_WAITOK);
	415
	416	bzero(rap, sizeof *rap);
	417	rap->cl_lastr = -1;
	418	lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
	419
	420	vnode_lock(vp);
	421
	422	if (ubc->cl_rahead == NULL)
	423	ubc->cl_rahead = rap;
	424	else {
	425	lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
	426	FREE_ZONE((void )rap, sizeof rap, M_CLRDAHEAD);
	427	rap = ubc->cl_rahead;
	428	}
	429	vnode_unlock(vp);
	430	}
	431	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
	432	return(rap);
	433
	434	return ((struct cl_readahead *)NULL);
	435	}
	436
	437
	438	/*
	439	* if the write behind context doesn't yet exist,
	440	* and CLW_ALLOCATE is specified, allocate and initialize it...
	441	* the vnode lock serializes multiple callers
	442	* during the actual assignment... first one
	443	* to grab the lock wins... the other callers
	444	* will release the now unnecessary storage
	445	*
	446	* if CLW_RETURNLOCKED is set, grab (blocking if necessary)
	447	* the lock associated with the write behind context before
	448	* returning
	449	*/
	450
	451	static struct cl_writebehind *
	452	cluster_get_wbp(vnode_t vp, int flags)
	453	{
	454	struct ubc_info *ubc;
	455	struct cl_writebehind *wbp;
	456
	457	ubc = vp->v_ubcinfo;
	458
	459	if ((wbp = ubc->cl_wbehind) == NULL) {
	460
	461	if ( !(flags & CLW_ALLOCATE))
	462	return ((struct cl_writebehind *)NULL);
	463
	464	MALLOC_ZONE(wbp, struct cl_writebehind , sizeof wbp, M_CLWRBEHIND, M_WAITOK);
	465
	466	bzero(wbp, sizeof *wbp);
	467	lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
	468
	469	vnode_lock(vp);
	470
	471	if (ubc->cl_wbehind == NULL)
	472	ubc->cl_wbehind = wbp;
	473	else {
	474	lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
	475	FREE_ZONE((void )wbp, sizeof wbp, M_CLWRBEHIND);
	476	wbp = ubc->cl_wbehind;
	477	}
	478	vnode_unlock(vp);
	479	}
	480	if (flags & CLW_RETURNLOCKED)
	481	lck_mtx_lock(&wbp->cl_lockw);
	482
	483	return (wbp);
	484	}
	485
	486
	487	static void
	488	cluster_syncup(vnode_t vp, off_t newEOF, int (callback)(buf_t, void ), void *callback_arg, int flags)
	489	{
	490	struct cl_writebehind *wbp;
	491
	492	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
	493
	494	if (wbp->cl_number) {
	495	lck_mtx_lock(&wbp->cl_lockw);
	496
	497	cluster_try_push(wbp, vp, newEOF, PUSH_ALL \| flags, 0, callback, callback_arg, NULL, FALSE);
	498
	499	lck_mtx_unlock(&wbp->cl_lockw);
	500	}
	501	}
	502	}
	503
	504
	505	static int
	506	cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
	507	{
	508	daddr64_t blkno;
	509	size_t io_size;
	510	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
	511
	512	if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
	513	if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ \| VNODE_BLOCKMAP_NO_TRACK, NULL))
	514	return(0);
	515
	516	if (io_size == 0)
	517	return (0);
	518
	519	if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
	520	return(1);
	521	}
	522	return(0);
	523	}
	524
	525
	526	static int
	527	cluster_is_throttled(vnode_t vp)
	528	{
	529	return (throttle_io_will_be_throttled(-1, vp->v_mount));
	530	}
	531
	532
	533	static void
	534	cluster_iostate_wait(struct clios iostate, u_int target, const char wait_name)
	535	{
	536
	537	lck_mtx_lock(&iostate->io_mtxp);
	538
	539	while ((iostate->io_issued - iostate->io_completed) > target) {
	540
	541	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) \| DBG_FUNC_START,
	542	iostate->io_issued, iostate->io_completed, target, 0, 0);
	543
	544	iostate->io_wanted = 1;
	545	msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
	546
	547	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) \| DBG_FUNC_END,
	548	iostate->io_issued, iostate->io_completed, target, 0, 0);
	549	}
	550	lck_mtx_unlock(&iostate->io_mtxp);
	551	}
	552
	553	static void cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
	554	upl_offset_t upl_offset, upl_size_t size)
	555	{
	556	if (!size)
	557	return;
	558
	559	upl_t associated_upl = upl_associated_upl(upl);
	560
	561	if (!associated_upl)
	562	return;
	563
	564	#if 0
	565	printf("1: %d %d\n", upl_offset, upl_offset + size);
	566	#endif
	567
	568	/*
	569	* The associated UPL is page aligned to file offsets whereas the
	570	* UPL it's attached to has different alignment requirements. The
	571	* upl_offset that we have refers to @upl. The code that follows
	572	* has to deal with the first and last pages in this transaction
	573	* which might straddle pages in the associated UPL. To keep
	574	* track of these pages, we use the mark bits: if the mark bit is
	575	* set, we know another transaction has completed its part of that
	576	* page and so we can unlock that page here.
	577	*
	578	* The following illustrates what we have to deal with:
	579	*
	580	* MEM u <------------ 1 PAGE ------------> e
	581	* +-------------+----------------------+-----------------
	582	* \| \|######################\|#################
	583	* +-------------+----------------------+-----------------
	584	* FILE \| <--- a ---> o <------------ 1 PAGE ------------>
	585	*
	586	* So here we show a write to offset @o. The data that is to be
	587	* written is in a buffer that is not page aligned; it has offset
	588	* @a in the page. The upl that carries the data starts in memory
	589	* at @u. The associated upl starts in the file at offset @o. A
	590	* transaction will always end on a page boundary (like @e above)
	591	* except for the very last transaction in the group. We cannot
	592	* unlock the page at @o in the associated upl until both the
	593	* transaction ending at @e and the following transaction (that
	594	* starts at @e) has completed.
	595	*/
	596
	597	/*
	598	* We record whether or not the two UPLs are aligned as the mark
	599	* bit in the first page of @upl.
	600	*/
	601	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	602	bool is_unaligned = upl_page_get_mark(pl, 0);
	603
	604	if (is_unaligned) {
	605	upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
	606
	607	upl_offset_t upl_end = upl_offset + size;
	608	assert(upl_end >= PAGE_SIZE);
	609
	610	upl_size_t assoc_upl_size = upl_get_size(associated_upl);
	611
	612	/*
	613	* In the very first transaction in the group, upl_offset will
	614	* not be page aligned, but after that it will be and in that
	615	* case we want the preceding page in the associated UPL hence
	616	* the minus one.
	617	*/
	618	assert(upl_offset);
	619	if (upl_offset)
	620	upl_offset = trunc_page_32(upl_offset - 1);
	621
	622	lck_mtx_lock_spin(&iostate->io_mtxp);
	623
	624	// Look at the first page...
	625	if (upl_offset
	626	&& !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
	627	/*
	628	* The first page isn't marked so let another transaction
	629	* completion handle it.
	630	*/
	631	upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
	632	upl_offset += PAGE_SIZE;
	633	}
	634
	635	// And now the last page...
	636
	637	/*
	638	* This needs to be > rather than >= because if it's equal, it
	639	* means there's another transaction that is sharing the last
	640	* page.
	641	*/
	642	if (upl_end > assoc_upl_size)
	643	upl_end = assoc_upl_size;
	644	else {
	645	upl_end = trunc_page_32(upl_end);
	646	const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
	647
	648	if (!upl_page_get_mark(assoc_pl, last_pg)) {
	649	/*
	650	* The last page isn't marked so mark the page and let another
	651	* transaction completion handle it.
	652	*/
	653	upl_page_set_mark(assoc_pl, last_pg, true);
	654	upl_end -= PAGE_SIZE;
	655	}
	656	}
	657
	658	lck_mtx_unlock(&iostate->io_mtxp);
	659
	660	#if 0
	661	printf("2: %d %d\n", upl_offset, upl_end);
	662	#endif
	663
	664	if (upl_end <= upl_offset)
	665	return;
	666
	667	size = upl_end - upl_offset;
	668	} else {
	669	assert(!(upl_offset & PAGE_MASK));
	670	assert(!(size & PAGE_MASK));
	671	}
	672
	673	boolean_t empty;
	674
	675	/*
	676	* We can unlock these pages now and as this is for a
	677	* direct/uncached write, we want to dump the pages too.
	678	*/
	679	kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
	680	UPL_ABORT_DUMP_PAGES, &empty);
	681
	682	assert(!kr);
	683
	684	if (!kr && empty) {
	685	upl_set_associated_upl(upl, NULL);
	686	upl_deallocate(associated_upl);
	687	}
	688	}
	689
	690	static int
	691	cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
	692	{
	693	int upl_abort_code = 0;
	694	int page_in = 0;
	695	int page_out = 0;
	696
	697	if ((io_flags & (B_PHYS \| B_CACHE)) == (B_PHYS \| B_CACHE))
	698	/*
	699	* direct write of any flavor, or a direct read that wasn't aligned
	700	*/
	701	ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	702	else {
	703	if (io_flags & B_PAGEIO) {
	704	if (io_flags & B_READ)
	705	page_in = 1;
	706	else
	707	page_out = 1;
	708	}
	709	if (io_flags & B_CACHE)
	710	/*
	711	* leave pages in the cache unchanged on error
	712	*/
	713	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	714	else if (((io_flags & B_READ) == 0) && ((error != ENXIO) \|\| vnode_isswap(vp)))
	715	/*
	716	* transient error on pageout/write path... leave pages unchanged
	717	*/
	718	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
	719	else if (page_in)
	720	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR;
	721	else
	722	upl_abort_code = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	723
	724	ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	725	}
	726	return (upl_abort_code);
	727	}
	728
	729
	730	static int
	731	cluster_iodone(buf_t bp, void *callback_arg)
	732	{
	733	int b_flags;
	734	int error;
	735	int total_size;
	736	int total_resid;
	737	int upl_offset;
	738	int zero_offset;
	739	int pg_offset = 0;
	740	int commit_size = 0;
	741	int upl_flags = 0;
	742	int transaction_size = 0;
	743	upl_t upl;
	744	buf_t cbp;
	745	buf_t cbp_head;
	746	buf_t cbp_next;
	747	buf_t real_bp;
	748	vnode_t vp;
	749	struct clios *iostate;
	750	boolean_t transaction_complete = FALSE;
	751
	752	__IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
	753
	754	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_START,
	755	cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	756
	757	if (cbp_head->b_trans_next \|\| !(cbp_head->b_flags & B_EOT)) {
	758	lck_mtx_lock_spin(cl_transaction_mtxp);
	759
	760	bp->b_flags \|= B_TDONE;
	761
	762	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
	763	/*
	764	* all I/O requests that are part of this transaction
	765	* have to complete before we can process it
	766	*/
	767	if ( !(cbp->b_flags & B_TDONE)) {
	768
	769	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	770	cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
	771
	772	lck_mtx_unlock(cl_transaction_mtxp);
	773
	774	return 0;
	775	}
	776
	777	if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
	778	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	779	cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
	780
	781	lck_mtx_unlock(cl_transaction_mtxp);
	782	wakeup(cbp);
	783
	784	return 0;
	785	}
	786
	787	if (cbp->b_flags & B_EOT)
	788	transaction_complete = TRUE;
	789	}
	790	lck_mtx_unlock(cl_transaction_mtxp);
	791
	792	if (transaction_complete == FALSE) {
	793	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	794	cbp_head, 0, 0, 0, 0);
	795	return 0;
	796	}
	797	}
	798	error = 0;
	799	total_size = 0;
	800	total_resid = 0;
	801
	802	cbp = cbp_head;
	803	vp = cbp->b_vp;
	804	upl_offset = cbp->b_uploffset;
	805	upl = cbp->b_upl;
	806	b_flags = cbp->b_flags;
	807	real_bp = cbp->b_real_bp;
	808	zero_offset= cbp->b_validend;
	809	iostate = (struct clios *)cbp->b_iostate;
	810
	811	if (real_bp)
	812	real_bp->b_dev = cbp->b_dev;
	813
	814	while (cbp) {
	815	if ((cbp->b_flags & B_ERROR) && error == 0)
	816	error = cbp->b_error;
	817
	818	total_resid += cbp->b_resid;
	819	total_size += cbp->b_bcount;
	820
	821	cbp_next = cbp->b_trans_next;
	822
	823	if (cbp_next == NULL)
	824	/*
	825	* compute the overall size of the transaction
	826	* in case we created one that has 'holes' in it
	827	* 'total_size' represents the amount of I/O we
	828	* did, not the span of the transaction w/r to the UPL
	829	*/
	830	transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
	831
	832	if (cbp != cbp_head)
	833	free_io_buf(cbp);
	834
	835	cbp = cbp_next;
	836	}
	837
	838	if (ISSET(b_flags, B_COMMIT_UPL)) {
	839	cluster_handle_associated_upl(iostate,
	840	cbp_head->b_upl,
	841	upl_offset,
	842	transaction_size);
	843	}
	844
	845	if (error == 0 && total_resid)
	846	error = EIO;
	847
	848	if (error == 0) {
	849	int (cliodone_func)(buf_t, void ) = (int ()(buf_t, void ))(cbp_head->b_cliodone);
	850
	851	if (cliodone_func != NULL) {
	852	cbp_head->b_bcount = transaction_size;
	853
	854	error = (*cliodone_func)(cbp_head, callback_arg);
	855	}
	856	}
	857	if (zero_offset)
	858	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
	859
	860	free_io_buf(cbp_head);
	861
	862	if (iostate) {
	863	int need_wakeup = 0;
	864
	865	/*
	866	* someone has issued multiple I/Os asynchrounsly
	867	* and is waiting for them to complete (streaming)
	868	*/
	869	lck_mtx_lock_spin(&iostate->io_mtxp);
	870
	871	if (error && iostate->io_error == 0)
	872	iostate->io_error = error;
	873
	874	iostate->io_completed += total_size;
	875
	876	if (iostate->io_wanted) {
	877	/*
	878	* someone is waiting for the state of
	879	* this io stream to change
	880	*/
	881	iostate->io_wanted = 0;
	882	need_wakeup = 1;
	883	}
	884	lck_mtx_unlock(&iostate->io_mtxp);
	885
	886	if (need_wakeup)
	887	wakeup((caddr_t)&iostate->io_wanted);
	888	}
	889
	890	if (b_flags & B_COMMIT_UPL) {
	891
	892	pg_offset = upl_offset & PAGE_MASK;
	893	commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	894
	895	if (error) {
	896	upl_set_iodone_error(upl, error);
	897
	898	upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
	899	} else {
	900	upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
	901
	902	if ((b_flags & B_PHYS) && (b_flags & B_READ))
	903	upl_flags \|= UPL_COMMIT_SET_DIRTY;
	904
	905	if (b_flags & B_AGE)
	906	upl_flags \|= UPL_COMMIT_INACTIVATE;
	907
	908	ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
	909	}
	910	}
	911	if (real_bp) {
	912	if (error) {
	913	real_bp->b_flags \|= B_ERROR;
	914	real_bp->b_error = error;
	915	}
	916	real_bp->b_resid = total_resid;
	917
	918	buf_biodone(real_bp);
	919	}
	920	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) \| DBG_FUNC_END,
	921	upl, upl_offset - pg_offset, commit_size, (error << 24) \| upl_flags, 0);
	922
	923	return (error);
	924	}
	925
	926
	927	uint32_t
	928	cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
	929	{
	930	if (cluster_is_throttled(vp)) {
	931	*limit = THROTTLE_MAX_IOSIZE;
	932	return 1;
	933	}
	934	return 0;
	935	}
	936
	937
	938	void
	939	cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
	940	{
	941
	942	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_START,
	943	upl_offset, size, bp, 0, 0);
	944
	945	if (bp == NULL \|\| bp->b_datap == 0) {
	946	upl_page_info_t *pl;
	947	addr64_t zero_addr;
	948
	949	pl = ubc_upl_pageinfo(upl);
	950
	951	if (upl_device_page(pl) == TRUE) {
	952	zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
	953
	954	bzero_phys_nc(zero_addr, size);
	955	} else {
	956	while (size) {
	957	int page_offset;
	958	int page_index;
	959	int zero_cnt;
	960
	961	page_index = upl_offset / PAGE_SIZE;
	962	page_offset = upl_offset & PAGE_MASK;
	963
	964	zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
	965	zero_cnt = min(PAGE_SIZE - page_offset, size);
	966
	967	bzero_phys(zero_addr, zero_cnt);
	968
	969	size -= zero_cnt;
	970	upl_offset += zero_cnt;
	971	}
	972	}
	973	} else
	974	bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
	975
	976	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) \| DBG_FUNC_END,
	977	upl_offset, size, 0, 0, 0);
	978	}
	979
	980
	981	static void
	982	cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
	983	{
	984	cbp_head->b_validend = zero_offset;
	985	cbp_tail->b_flags \|= B_EOT;
	986	}
	987
	988	static void
	989	cluster_wait_IO(buf_t cbp_head, int async)
	990	{
	991	buf_t cbp;
	992
	993	if (async) {
	994	/*
	995	* Async callback completion will not normally generate a
	996	* wakeup upon I/O completion. To get woken up, we set
	997	* b_trans_next (which is safe for us to modify) on the last
	998	* buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
	999	* to wake us up when all buffers as part of this transaction
	1000	* are completed. This is done under the umbrella of
	1001	* cl_transaction_mtxp which is also taken in cluster_iodone.
	1002	*/
	1003	bool done = true;
	1004	buf_t last = NULL;
	1005
	1006	lck_mtx_lock_spin(cl_transaction_mtxp);
	1007
	1008	for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
	1009	if (!ISSET(cbp->b_flags, B_TDONE))
	1010	done = false;
	1011	}
	1012
	1013	if (!done) {
	1014	last->b_trans_next = CLUSTER_IO_WAITING;
	1015
	1016	DTRACE_IO1(wait__start, buf_t, last);
	1017	do {
	1018	msleep(last, cl_transaction_mtxp, PSPIN \| (PRIBIO+1), "cluster_wait_IO", NULL);
	1019
	1020	/*
	1021	* We should only have been woken up if all the
	1022	* buffers are completed, but just in case...
	1023	*/
	1024	done = true;
	1025	for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
	1026	if (!ISSET(cbp->b_flags, B_TDONE)) {
	1027	done = false;
	1028	break;
	1029	}
	1030	}
	1031	} while (!done);
	1032	DTRACE_IO1(wait__done, buf_t, last);
	1033
	1034	last->b_trans_next = NULL;
	1035	}
	1036
	1037	lck_mtx_unlock(cl_transaction_mtxp);
	1038	} else { // !async
	1039	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
	1040	buf_biowait(cbp);
	1041	}
	1042	}
	1043
	1044	static void
	1045	cluster_complete_transaction(buf_t cbp_head, void callback_arg, int *retval, int flags, int needwait)
	1046	{
	1047	buf_t cbp;
	1048	int error;
	1049	boolean_t isswapout = FALSE;
	1050
	1051	/*
	1052	* cluster_complete_transaction will
	1053	* only be called if we've issued a complete chain in synchronous mode
	1054	* or, we've already done a cluster_wait_IO on an incomplete chain
	1055	*/
	1056	if (needwait) {
	1057	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
	1058	buf_biowait(cbp);
	1059	}
	1060	/*
	1061	* we've already waited on all of the I/Os in this transaction,
	1062	* so mark all of the buf_t's in this transaction as B_TDONE
	1063	* so that cluster_iodone sees the transaction as completed
	1064	*/
	1065	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
	1066	cbp->b_flags \|= B_TDONE;
	1067	cbp = *cbp_head;
	1068
	1069	if ((flags & (CL_ASYNC \| CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
	1070	isswapout = TRUE;
	1071
	1072	error = cluster_iodone(cbp, callback_arg);
	1073
	1074	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
	1075	if (((flags & (CL_PAGEOUT \| CL_KEEPCACHED)) != CL_PAGEOUT) \|\| (error != ENXIO))
	1076	*retval = error;
	1077	else if (isswapout == TRUE)
	1078	*retval = error;
	1079	}
	1080	*cbp_head = (buf_t)NULL;
	1081	}
	1082
	1083
	1084	static int
	1085	cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	1086	int flags, buf_t real_bp, struct clios iostate, int (callback)(buf_t, void ), void callback_arg)
	1087	{
	1088	buf_t cbp;
	1089	u_int size;
	1090	u_int io_size;
	1091	int io_flags;
	1092	int bmap_flags;
	1093	int error = 0;
	1094	int retval = 0;
	1095	buf_t cbp_head = NULL;
	1096	buf_t cbp_tail = NULL;
	1097	int trans_count = 0;
	1098	int max_trans_count;
	1099	u_int pg_count;
	1100	int pg_offset;
	1101	u_int max_iosize;
	1102	u_int max_vectors;
	1103	int priv;
	1104	int zero_offset = 0;
	1105	int async_throttle = 0;
	1106	mount_t mp;
	1107	vm_offset_t upl_end_offset;
	1108	boolean_t need_EOT = FALSE;
	1109
	1110	/*
	1111	* we currently don't support buffers larger than a page
	1112	*/
	1113	if (real_bp && non_rounded_size > PAGE_SIZE)
	1114	panic("%s(): Called with real buffer of size %d bytes which "
	1115	"is greater than the maximum allowed size of "
	1116	"%d bytes (the system PAGE_SIZE).\n",
	1117	__FUNCTION__, non_rounded_size, PAGE_SIZE);
	1118
	1119	mp = vp->v_mount;
	1120
	1121	/*
	1122	* we don't want to do any funny rounding of the size for IO requests
	1123	* coming through the DIRECT or CONTIGUOUS paths... those pages don't
	1124	* belong to us... we can't extend (nor do we need to) the I/O to fill
	1125	* out a page
	1126	*/
	1127	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY \| CL_DIRECT_IO))) {
	1128	/*
	1129	* round the requested size up so that this I/O ends on a
	1130	* page boundary in case this is a 'write'... if the filesystem
	1131	* has blocks allocated to back the page beyond the EOF, we want to
	1132	* make sure to write out the zero's that are sitting beyond the EOF
	1133	* so that in case the filesystem doesn't explicitly zero this area
	1134	* if a hole is created via a lseek/write beyond the current EOF,
	1135	* it will return zeros when it's read back from the disk. If the
	1136	* physical allocation doesn't extend for the whole page, we'll
	1137	* only write/read from the disk up to the end of this allocation
	1138	* via the extent info returned from the VNOP_BLOCKMAP call.
	1139	*/
	1140	pg_offset = upl_offset & PAGE_MASK;
	1141
	1142	size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	1143	} else {
	1144	/*
	1145	* anyone advertising a blocksize of 1 byte probably
	1146	* can't deal with us rounding up the request size
	1147	* AFP is one such filesystem/device
	1148	*/
	1149	size = non_rounded_size;
	1150	}
	1151	upl_end_offset = upl_offset + size;
	1152
	1153	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);
	1154
	1155	/*
	1156	* Set the maximum transaction size to the maximum desired number of
	1157	* buffers.
	1158	*/
	1159	max_trans_count = 8;
	1160	if (flags & CL_DEV_MEMORY)
	1161	max_trans_count = 16;
	1162
	1163	if (flags & CL_READ) {
	1164	io_flags = B_READ;
	1165	bmap_flags = VNODE_READ;
	1166
	1167	max_iosize = mp->mnt_maxreadcnt;
	1168	max_vectors = mp->mnt_segreadcnt;
	1169	} else {
	1170	io_flags = B_WRITE;
	1171	bmap_flags = VNODE_WRITE;
	1172
	1173	max_iosize = mp->mnt_maxwritecnt;
	1174	max_vectors = mp->mnt_segwritecnt;
	1175	}
	1176	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
	1177
	1178	/*
	1179	* make sure the maximum iosize is a
	1180	* multiple of the page size
	1181	*/
	1182	max_iosize &= ~PAGE_MASK;
	1183
	1184	/*
	1185	* Ensure the maximum iosize is sensible.
	1186	*/
	1187	if (!max_iosize)
	1188	max_iosize = PAGE_SIZE;
	1189
	1190	if (flags & CL_THROTTLE) {
	1191	if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
	1192	if (max_iosize > THROTTLE_MAX_IOSIZE)
	1193	max_iosize = THROTTLE_MAX_IOSIZE;
	1194	async_throttle = THROTTLE_MAXCNT;
	1195	} else {
	1196	if ( (flags & CL_DEV_MEMORY) )
	1197	async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
	1198	else {
	1199	u_int max_cluster;
	1200	u_int max_cluster_size;
	1201	u_int scale;
	1202
	1203	if (vp->v_mount->mnt_minsaturationbytecount) {
	1204	max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;
	1205
	1206	scale = 1;
	1207	} else {
	1208	max_cluster_size = MAX_CLUSTER_SIZE(vp);
	1209
	1210	if (disk_conditioner_mount_is_ssd(vp->v_mount))
	1211	scale = WRITE_THROTTLE_SSD;
	1212	else
	1213	scale = WRITE_THROTTLE;
	1214	}
	1215	if (max_iosize > max_cluster_size)
	1216	max_cluster = max_cluster_size;
	1217	else
	1218	max_cluster = max_iosize;
	1219
	1220	if (size < max_cluster)
	1221	max_cluster = size;
	1222
	1223	if (flags & CL_CLOSE)
	1224	scale += MAX_CLUSTERS;
	1225
	1226	async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
	1227	}
	1228	}
	1229	}
	1230	if (flags & CL_AGE)
	1231	io_flags \|= B_AGE;
	1232	if (flags & (CL_PAGEIN \| CL_PAGEOUT))
	1233	io_flags \|= B_PAGEIO;
	1234	if (flags & (CL_IOSTREAMING))
	1235	io_flags \|= B_IOSTREAMING;
	1236	if (flags & CL_COMMIT)
	1237	io_flags \|= B_COMMIT_UPL;
	1238	if (flags & CL_DIRECT_IO)
	1239	io_flags \|= B_PHYS;
	1240	if (flags & (CL_PRESERVE \| CL_KEEPCACHED))
	1241	io_flags \|= B_CACHE;
	1242	if (flags & CL_PASSIVE)
	1243	io_flags \|= B_PASSIVE;
	1244	if (flags & CL_ENCRYPTED)
	1245	io_flags \|= B_ENCRYPTED_IO;
	1246
	1247	if (vp->v_flag & VSYSTEM)
	1248	io_flags \|= B_META;
	1249
	1250	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
	1251	/*
	1252	* then we are going to end up
	1253	* with a page that we can't complete (the file size wasn't a multiple
	1254	* of PAGE_SIZE and we're trying to read to the end of the file
	1255	* so we'll go ahead and zero out the portion of the page we can't
	1256	* read in from the file
	1257	*/
	1258	zero_offset = upl_offset + non_rounded_size;
	1259	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
	1260	assert(ISSET(flags, CL_COMMIT));
	1261
	1262	// For a direct/uncached write, we need to lock pages...
	1263
	1264	upl_t cached_upl;
	1265
	1266	/*
	1267	* Create a UPL to lock the pages in the cache whilst the
	1268	* write is in progress.
	1269	*/
	1270	ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
	1271	NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);
	1272
	1273	/*
	1274	* Attach this UPL to the other UPL so that we can find it
	1275	* later.
	1276	*/
	1277	upl_set_associated_upl(upl, cached_upl);
	1278
	1279	if (upl_offset & PAGE_MASK) {
	1280	/*
	1281	* The two UPLs are not aligned, so mark the first page in
	1282	* @upl so that cluster_handle_associated_upl can handle
	1283	* it accordingly.
	1284	*/
	1285	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	1286	upl_page_set_mark(pl, 0, true);
	1287	}
	1288	}
	1289
	1290	while (size) {
	1291	daddr64_t blkno;
	1292	daddr64_t lblkno;
	1293	u_int io_size_wanted;
	1294	size_t io_size_tmp;
	1295
	1296	if (size > max_iosize)
	1297	io_size = max_iosize;
	1298	else
	1299	io_size = size;
	1300
	1301	io_size_wanted = io_size;
	1302	io_size_tmp = (size_t)io_size;
	1303
	1304	if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
	1305	break;
	1306
	1307	if (io_size_tmp > io_size_wanted)
	1308	io_size = io_size_wanted;
	1309	else
	1310	io_size = (u_int)io_size_tmp;
	1311
	1312	if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
	1313	real_bp->b_blkno = blkno;
	1314
	1315	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) \| DBG_FUNC_NONE,
	1316	(int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);
	1317
	1318	if (io_size == 0) {
	1319	/*
	1320	* vnop_blockmap didn't return an error... however, it did
	1321	* return an extent size of 0 which means we can't
	1322	* make forward progress on this I/O... a hole in the
	1323	* file would be returned as a blkno of -1 with a non-zero io_size
	1324	* a real extent is returned with a blkno != -1 and a non-zero io_size
	1325	*/
	1326	error = EINVAL;
	1327	break;
	1328	}
	1329	if ( !(flags & CL_READ) && blkno == -1) {
	1330	off_t e_offset;
	1331	int pageout_flags;
	1332
	1333	if (upl_get_internal_vectorupl(upl))
	1334	panic("Vector UPLs should not take this code-path\n");
	1335	/*
	1336	* we're writing into a 'hole'
	1337	*/
	1338	if (flags & CL_PAGEOUT) {
	1339	/*
	1340	* if we got here via cluster_pageout
	1341	* then just error the request and return
	1342	* the 'hole' should already have been covered
	1343	*/
	1344	error = EINVAL;
	1345	break;
	1346	}
	1347	/*
	1348	* we can get here if the cluster code happens to
	1349	* pick up a page that was dirtied via mmap vs
	1350	* a 'write' and the page targets a 'hole'...
	1351	* i.e. the writes to the cluster were sparse
	1352	* and the file was being written for the first time
	1353	*
	1354	* we can also get here if the filesystem supports
	1355	* 'holes' that are less than PAGE_SIZE.... because
	1356	* we can't know if the range in the page that covers
	1357	* the 'hole' has been dirtied via an mmap or not,
	1358	* we have to assume the worst and try to push the
	1359	* entire page to storage.
	1360	*
	1361	* Try paging out the page individually before
	1362	* giving up entirely and dumping it (the pageout
	1363	* path will insure that the zero extent accounting
	1364	* has been taken care of before we get back into cluster_io)
	1365	*
	1366	* go direct to vnode_pageout so that we don't have to
	1367	* unbusy the page from the UPL... we used to do this
	1368	* so that we could call ubc_msync, but that results
	1369	* in a potential deadlock if someone else races us to acquire
	1370	* that page and wins and in addition needs one of the pages
	1371	* we're continuing to hold in the UPL
	1372	*/
	1373	pageout_flags = UPL_MSYNC \| UPL_VNODE_PAGER \| UPL_NESTED_PAGEOUT;
	1374
	1375	if ( !(flags & CL_ASYNC))
	1376	pageout_flags \|= UPL_IOSYNC;
	1377	if ( !(flags & CL_COMMIT))
	1378	pageout_flags \|= UPL_NOCOMMIT;
	1379
	1380	if (cbp_head) {
	1381	buf_t prev_cbp;
	1382	int bytes_in_last_page;
	1383
	1384	/*
	1385	* first we have to wait for the the current outstanding I/Os
	1386	* to complete... EOT hasn't been set yet on this transaction
	1387	* so the pages won't be released
	1388	*/
	1389	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1390
	1391	bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
	1392	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
	1393	bytes_in_last_page += cbp->b_bcount;
	1394	bytes_in_last_page &= PAGE_MASK;
	1395
	1396	while (bytes_in_last_page) {
	1397	/*
	1398	* we've got a transcation that
	1399	* includes the page we're about to push out through vnode_pageout...
	1400	* find the bp's in the list which intersect this page and either
	1401	* remove them entirely from the transaction (there could be multiple bp's), or
	1402	* round it's iosize down to the page boundary (there can only be one)...
	1403	*
	1404	* find the last bp in the list and act on it
	1405	*/
	1406	for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
	1407	prev_cbp = cbp;
	1408
	1409	if (bytes_in_last_page >= cbp->b_bcount) {
	1410	/*
	1411	* this buf no longer has any I/O associated with it
	1412	*/
	1413	bytes_in_last_page -= cbp->b_bcount;
	1414	cbp->b_bcount = 0;
	1415
	1416	free_io_buf(cbp);
	1417
	1418	if (cbp == cbp_head) {
	1419	assert(bytes_in_last_page == 0);
	1420	/*
	1421	* the buf we just freed was the only buf in
	1422	* this transaction... so there's no I/O to do
	1423	*/
	1424	cbp_head = NULL;
	1425	cbp_tail = NULL;
	1426	} else {
	1427	/*
	1428	* remove the buf we just freed from
	1429	* the transaction list
	1430	*/
	1431	prev_cbp->b_trans_next = NULL;
	1432	cbp_tail = prev_cbp;
	1433	}
	1434	} else {
	1435	/*
	1436	* this is the last bp that has I/O
	1437	* intersecting the page of interest
	1438	* only some of the I/O is in the intersection
	1439	* so clip the size but keep it in the transaction list
	1440	*/
	1441	cbp->b_bcount -= bytes_in_last_page;
	1442	cbp_tail = cbp;
	1443	bytes_in_last_page = 0;
	1444	}
	1445	}
	1446	if (cbp_head) {
	1447	/*
	1448	* there was more to the current transaction
	1449	* than just the page we are pushing out via vnode_pageout...
	1450	* mark it as finished and complete it... we've already
	1451	* waited for the I/Os to complete above in the call to cluster_wait_IO
	1452	*/
	1453	cluster_EOT(cbp_head, cbp_tail, 0);
	1454
	1455	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
	1456
	1457	trans_count = 0;
	1458	}
	1459	}
	1460	if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
	1461	error = EINVAL;
	1462	}
	1463	e_offset = round_page_64(f_offset + 1);
	1464	io_size = e_offset - f_offset;
	1465
	1466	f_offset += io_size;
	1467	upl_offset += io_size;
	1468
	1469	if (size >= io_size)
	1470	size -= io_size;
	1471	else
	1472	size = 0;
	1473	/*
	1474	* keep track of how much of the original request
	1475	* that we've actually completed... non_rounded_size
	1476	* may go negative due to us rounding the request
	1477	* to a page size multiple (i.e. size > non_rounded_size)
	1478	*/
	1479	non_rounded_size -= io_size;
	1480
	1481	if (non_rounded_size <= 0) {
	1482	/*
	1483	* we've transferred all of the data in the original
	1484	* request, but we were unable to complete the tail
	1485	* of the last page because the file didn't have
	1486	* an allocation to back that portion... this is ok.
	1487	*/
	1488	size = 0;
	1489	}
	1490	if (error) {
	1491	if (size == 0)
	1492	flags &= ~CL_COMMIT;
	1493	break;
	1494	}
	1495	continue;
	1496	}
	1497	lblkno = (daddr64_t)(f_offset / 0x1000);
	1498	/*
	1499	* we have now figured out how much I/O we can do - this is in 'io_size'
	1500	* pg_offset is the starting point in the first page for the I/O
	1501	* pg_count is the number of full and partial pages that 'io_size' encompasses
	1502	*/
	1503	pg_offset = upl_offset & PAGE_MASK;
	1504
	1505	if (flags & CL_DEV_MEMORY) {
	1506	/*
	1507	* treat physical requests as one 'giant' page
	1508	*/
	1509	pg_count = 1;
	1510	} else
	1511	pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1512
	1513	if ((flags & CL_READ) && blkno == -1) {
	1514	vm_offset_t commit_offset;
	1515	int bytes_to_zero;
	1516	int complete_transaction_now = 0;
	1517
	1518	/*
	1519	* if we're reading and blkno == -1, then we've got a
	1520	* 'hole' in the file that we need to deal with by zeroing
	1521	* out the affected area in the upl
	1522	*/
	1523	if (io_size >= (u_int)non_rounded_size) {
	1524	/*
	1525	* if this upl contains the EOF and it is not a multiple of PAGE_SIZE
	1526	* than 'zero_offset' will be non-zero
	1527	* if the 'hole' returned by vnop_blockmap extends all the way to the eof
	1528	* (indicated by the io_size finishing off the I/O request for this UPL)
	1529	* than we're not going to issue an I/O for the
	1530	* last page in this upl... we need to zero both the hole and the tail
	1531	* of the page beyond the EOF, since the delayed zero-fill won't kick in
	1532	*/
	1533	bytes_to_zero = non_rounded_size;
	1534	if (!(flags & CL_NOZERO))
	1535	bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
	1536
	1537	zero_offset = 0;
	1538	} else
	1539	bytes_to_zero = io_size;
	1540
	1541	pg_count = 0;
	1542
	1543	cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
	1544
	1545	if (cbp_head) {
	1546	int pg_resid;
	1547
	1548	/*
	1549	* if there is a current I/O chain pending
	1550	* then the first page of the group we just zero'd
	1551	* will be handled by the I/O completion if the zero
	1552	* fill started in the middle of the page
	1553	*/
	1554	commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	1555
	1556	pg_resid = commit_offset - upl_offset;
	1557
	1558	if (bytes_to_zero >= pg_resid) {
	1559	/*
	1560	* the last page of the current I/O
	1561	* has been completed...
	1562	* compute the number of fully zero'd
	1563	* pages that are beyond it
	1564	* plus the last page if its partial
	1565	* and we have no more I/O to issue...
	1566	* otherwise a partial page is left
	1567	* to begin the next I/O
	1568	*/
	1569	if ((int)io_size >= non_rounded_size)
	1570	pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1571	else
	1572	pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
	1573
	1574	complete_transaction_now = 1;
	1575	}
	1576	} else {
	1577	/*
	1578	* no pending I/O to deal with
	1579	* so, commit all of the fully zero'd pages
	1580	* plus the last page if its partial
	1581	* and we have no more I/O to issue...
	1582	* otherwise a partial page is left
	1583	* to begin the next I/O
	1584	*/
	1585	if ((int)io_size >= non_rounded_size)
	1586	pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1587	else
	1588	pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
	1589
	1590	commit_offset = upl_offset & ~PAGE_MASK;
	1591	}
	1592
	1593	// Associated UPL is currently only used in the direct write path
	1594	assert(!upl_associated_upl(upl));
	1595
	1596	if ( (flags & CL_COMMIT) && pg_count) {
	1597	ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
	1598	UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY);
	1599	}
	1600	upl_offset += io_size;
	1601	f_offset += io_size;
	1602	size -= io_size;
	1603
	1604	/*
	1605	* keep track of how much of the original request
	1606	* that we've actually completed... non_rounded_size
	1607	* may go negative due to us rounding the request
	1608	* to a page size multiple (i.e. size > non_rounded_size)
	1609	*/
	1610	non_rounded_size -= io_size;
	1611
	1612	if (non_rounded_size <= 0) {
	1613	/*
	1614	* we've transferred all of the data in the original
	1615	* request, but we were unable to complete the tail
	1616	* of the last page because the file didn't have
	1617	* an allocation to back that portion... this is ok.
	1618	*/
	1619	size = 0;
	1620	}
	1621	if (cbp_head && (complete_transaction_now \|\| size == 0)) {
	1622	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1623
	1624	cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
	1625
	1626	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);
	1627
	1628	trans_count = 0;
	1629	}
	1630	continue;
	1631	}
	1632	if (pg_count > max_vectors) {
	1633	if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
	1634	io_size = PAGE_SIZE - pg_offset;
	1635	pg_count = 1;
	1636	} else {
	1637	io_size -= (pg_count - max_vectors) * PAGE_SIZE;
	1638	pg_count = max_vectors;
	1639	}
	1640	}
	1641	/*
	1642	* If the transaction is going to reach the maximum number of
	1643	* desired elements, truncate the i/o to the nearest page so
	1644	* that the actual i/o is initiated after this buffer is
	1645	* created and added to the i/o chain.
	1646	*
	1647	* I/O directed to physically contiguous memory
	1648	* doesn't have a requirement to make sure we 'fill' a page
	1649	*/
	1650	if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
	1651	((upl_offset + io_size) & PAGE_MASK)) {
	1652	vm_offset_t aligned_ofs;
	1653
	1654	aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
	1655	/*
	1656	* If the io_size does not actually finish off even a
	1657	* single page we have to keep adding buffers to the
	1658	* transaction despite having reached the desired limit.
	1659	*
	1660	* Eventually we get here with the page being finished
	1661	* off (and exceeded) and then we truncate the size of
	1662	* this i/o request so that it is page aligned so that
	1663	* we can finally issue the i/o on the transaction.
	1664	*/
	1665	if (aligned_ofs > upl_offset) {
	1666	io_size = aligned_ofs - upl_offset;
	1667	pg_count--;
	1668	}
	1669	}
	1670
	1671	if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
	1672	/*
	1673	* if we're not targeting a virtual device i.e. a disk image
	1674	* it's safe to dip into the reserve pool since real devices
	1675	* can complete this I/O request without requiring additional
	1676	* bufs from the alloc_io_buf pool
	1677	*/
	1678	priv = 1;
	1679	else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
	1680	/*
	1681	* Throttle the speculative IO
	1682	*/
	1683	priv = 0;
	1684	else
	1685	priv = 1;
	1686
	1687	cbp = alloc_io_buf(vp, priv);
	1688
	1689	if (flags & CL_PAGEOUT) {
	1690	u_int i;
	1691
	1692	/*
	1693	* since blocks are in offsets of 0x1000, scale
	1694	* iteration to (PAGE_SIZE * pg_count) of blks.
	1695	*/
	1696	for (i = 0; i < (PAGE_SIZE * pg_count)/0x1000; i++) {
	1697	if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
	1698	panic("BUSY bp found in cluster_io");
	1699	}
	1700	}
	1701	if (flags & CL_ASYNC) {
	1702	if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg))
	1703	panic("buf_setcallback failed\n");
	1704	}
	1705	cbp->b_cliodone = (void *)callback;
	1706	cbp->b_flags \|= io_flags;
	1707	if (flags & CL_NOCACHE)
	1708	cbp->b_attr.ba_flags \|= BA_NOCACHE;
	1709
	1710	cbp->b_lblkno = lblkno;
	1711	cbp->b_blkno = blkno;
	1712	cbp->b_bcount = io_size;
	1713
	1714	if (buf_setupl(cbp, upl, upl_offset))
	1715	panic("buf_setupl failed\n");
	1716	#if CONFIG_IOSCHED
	1717	upl_set_blkno(upl, upl_offset, io_size, blkno);
	1718	#endif
	1719	cbp->b_trans_next = (buf_t)NULL;
	1720
	1721	if ((cbp->b_iostate = (void *)iostate))
	1722	/*
	1723	* caller wants to track the state of this
	1724	* io... bump the amount issued against this stream
	1725	*/
	1726	iostate->io_issued += io_size;
	1727
	1728	if (flags & CL_READ) {
	1729	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) \| DBG_FUNC_NONE,
	1730	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	1731	}
	1732	else {
	1733	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) \| DBG_FUNC_NONE,
	1734	(int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
	1735	}
	1736
	1737	if (cbp_head) {
	1738	cbp_tail->b_trans_next = cbp;
	1739	cbp_tail = cbp;
	1740	} else {
	1741	cbp_head = cbp;
	1742	cbp_tail = cbp;
	1743
	1744	if ( (cbp_head->b_real_bp = real_bp) )
	1745	real_bp = (buf_t)NULL;
	1746	}
	1747	(buf_t )(&cbp->b_trans_head) = cbp_head;
	1748
	1749	trans_count++;
	1750
	1751	upl_offset += io_size;
	1752	f_offset += io_size;
	1753	size -= io_size;
	1754	/*
	1755	* keep track of how much of the original request
	1756	* that we've actually completed... non_rounded_size
	1757	* may go negative due to us rounding the request
	1758	* to a page size multiple (i.e. size > non_rounded_size)
	1759	*/
	1760	non_rounded_size -= io_size;
	1761
	1762	if (non_rounded_size <= 0) {
	1763	/*
	1764	* we've transferred all of the data in the original
	1765	* request, but we were unable to complete the tail
	1766	* of the last page because the file didn't have
	1767	* an allocation to back that portion... this is ok.
	1768	*/
	1769	size = 0;
	1770	}
	1771	if (size == 0) {
	1772	/*
	1773	* we have no more I/O to issue, so go
	1774	* finish the final transaction
	1775	*/
	1776	need_EOT = TRUE;
	1777	} else if ( ((flags & CL_DEV_MEMORY) \|\| (upl_offset & PAGE_MASK) == 0) &&
	1778	((flags & CL_ASYNC) \|\| trans_count > max_trans_count) ) {
	1779	/*
	1780	* I/O directed to physically contiguous memory...
	1781	* which doesn't have a requirement to make sure we 'fill' a page
	1782	* or...
	1783	* the current I/O we've prepared fully
	1784	* completes the last page in this request
	1785	* and ...
	1786	* it's either an ASYNC request or
	1787	* we've already accumulated more than 8 I/O's into
	1788	* this transaction so mark it as complete so that
	1789	* it can finish asynchronously or via the cluster_complete_transaction
	1790	* below if the request is synchronous
	1791	*/
	1792	need_EOT = TRUE;
	1793	}
	1794	if (need_EOT == TRUE)
	1795	cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
	1796
	1797	if (flags & CL_THROTTLE)
	1798	(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
	1799
	1800	if ( !(io_flags & B_READ))
	1801	vnode_startwrite(vp);
	1802
	1803	if (flags & CL_RAW_ENCRYPTED) {
	1804	/*
	1805	* User requested raw encrypted bytes.
	1806	* Twiddle the bit in the ba_flags for the buffer
	1807	*/
	1808	cbp->b_attr.ba_flags \|= BA_RAW_ENCRYPTED_IO;
	1809	}
	1810
	1811	(void) VNOP_STRATEGY(cbp);
	1812
	1813	if (need_EOT == TRUE) {
	1814	if ( !(flags & CL_ASYNC))
	1815	cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
	1816
	1817	need_EOT = FALSE;
	1818	trans_count = 0;
	1819	cbp_head = NULL;
	1820	}
	1821	}
	1822	if (error) {
	1823	int abort_size;
	1824
	1825	io_size = 0;
	1826
	1827	if (cbp_head) {
	1828	/*
	1829	* Wait until all of the outstanding I/O
	1830	* for this partial transaction has completed
	1831	*/
	1832	cluster_wait_IO(cbp_head, (flags & CL_ASYNC));
	1833
	1834	/*
	1835	* Rewind the upl offset to the beginning of the
	1836	* transaction.
	1837	*/
	1838	upl_offset = cbp_head->b_uploffset;
	1839	}
	1840
	1841	if (ISSET(flags, CL_COMMIT)) {
	1842	cluster_handle_associated_upl(iostate, upl, upl_offset,
	1843	upl_end_offset - upl_offset);
	1844	}
	1845
	1846	// Free all the IO buffers in this transaction
	1847	for (cbp = cbp_head; cbp;) {
	1848	buf_t cbp_next;
	1849
	1850	size += cbp->b_bcount;
	1851	io_size += cbp->b_bcount;
	1852
	1853	cbp_next = cbp->b_trans_next;
	1854	free_io_buf(cbp);
	1855	cbp = cbp_next;
	1856	}
	1857
	1858	if (iostate) {
	1859	int need_wakeup = 0;
	1860
	1861	/*
	1862	* update the error condition for this stream
	1863	* since we never really issued the io
	1864	* just go ahead and adjust it back
	1865	*/
	1866	lck_mtx_lock_spin(&iostate->io_mtxp);
	1867
	1868	if (iostate->io_error == 0)
	1869	iostate->io_error = error;
	1870	iostate->io_issued -= io_size;
	1871
	1872	if (iostate->io_wanted) {
	1873	/*
	1874	* someone is waiting for the state of
	1875	* this io stream to change
	1876	*/
	1877	iostate->io_wanted = 0;
	1878	need_wakeup = 1;
	1879	}
	1880	lck_mtx_unlock(&iostate->io_mtxp);
	1881
	1882	if (need_wakeup)
	1883	wakeup((caddr_t)&iostate->io_wanted);
	1884	}
	1885
	1886	if (flags & CL_COMMIT) {
	1887	int upl_flags;
	1888
	1889	pg_offset = upl_offset & PAGE_MASK;
	1890	abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;
	1891
	1892	upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);
	1893
	1894	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) \| DBG_FUNC_NONE,
	1895	upl, upl_offset - pg_offset, abort_size, (error << 24) \| upl_flags, 0);
	1896	}
	1897	if (retval == 0)
	1898	retval = error;
	1899	} else if (cbp_head)
	1900	panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
	1901
	1902	if (real_bp) {
	1903	/*
	1904	* can get here if we either encountered an error
	1905	* or we completely zero-filled the request and
	1906	* no I/O was issued
	1907	*/
	1908	if (error) {
	1909	real_bp->b_flags \|= B_ERROR;
	1910	real_bp->b_error = error;
	1911	}
	1912	buf_biodone(real_bp);
	1913	}
	1914	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) \| DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);
	1915
	1916	return (retval);
	1917	}
	1918
	/*
	 * reset_vector_run_state
	 *
	 * Zero all five vector-run bookkeeping variables in one statement.
	 * The macro expands to a bare assignment chain (with its own trailing
	 * ';') and relies on these names being in scope at the point of use.
	 */
	#define reset_vector_run_state()	\
		issueVectorUPL =		\
		vector_upl_offset =		\
		vector_upl_index =		\
		vector_upl_iosize =		\
		vector_upl_size = 0;
	1921
	1922	static int
	1923	vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
	1924	int io_flag, buf_t real_bp, struct clios iostate, int (callback)(buf_t, void ), void callback_arg)
	1925	{
	1926	vector_upl_set_pagelist(vector_upl);
	1927
	1928	if(io_flag & CL_READ) {
	1929	if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0))
	1930	io_flag &= ~CL_PRESERVE; /don't zero fill/
	1931	else
	1932	io_flag \|= CL_PRESERVE; /zero fill/
	1933	}
	1934	return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg));
	1935
	1936	}
	1937
	1938	static int
	1939	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (callback)(buf_t, void ), void *callback_arg, int bflag)
	1940	{
	1941	int pages_in_prefetch;
	1942
	1943	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_START,
	1944	(int)f_offset, size, (int)filesize, 0, 0);
	1945
	1946	if (f_offset >= filesize) {
	1947	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	1948	(int)f_offset, 0, 0, 0, 0);
	1949	return(0);
	1950	}
	1951	if ((off_t)size > (filesize - f_offset))
	1952	size = filesize - f_offset;
	1953	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
	1954
	1955	advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
	1956
	1957	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) \| DBG_FUNC_END,
	1958	(int)f_offset + size, pages_in_prefetch, 0, 1, 0);
	1959
	1960	return (pages_in_prefetch);
	1961	}
	1962
	1963
	1964
	1965	static void
	1966	cluster_read_ahead(vnode_t vp, struct cl_extent extent, off_t filesize, struct cl_readahead rap, int (callback)(buf_t, void ), void *callback_arg,
	1967	int bflag)
	1968	{
	1969	daddr64_t r_addr;
	1970	off_t f_offset;
	1971	int size_of_prefetch;
	1972	u_int max_prefetch;
	1973
	1974
	1975	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_START,
	1976	(int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
	1977
	1978	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
	1979	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1980	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
	1981	return;
	1982	}
	1983	if (rap->cl_lastr == -1 \|\| (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
	1984	rap->cl_ralen = 0;
	1985	rap->cl_maxra = 0;
	1986
	1987	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1988	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
	1989
	1990	return;
	1991	}
	1992	max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
	1993
	1994	if (max_prefetch > speculative_prefetch_max)
	1995	max_prefetch = speculative_prefetch_max;
	1996
	1997	if (max_prefetch <= PAGE_SIZE) {
	1998	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	1999	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
	2000	return;
	2001	}
	2002	if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
	2003	if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
	2004
	2005	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2006	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
	2007	return;
	2008	}
	2009	}
	2010	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	2011	f_offset = (off_t)(r_addr * PAGE_SIZE_64);
	2012
	2013	size_of_prefetch = 0;
	2014
	2015	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
	2016
	2017	if (size_of_prefetch) {
	2018	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2019	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
	2020	return;
	2021	}
	2022	if (f_offset < filesize) {
	2023	daddr64_t read_size;
	2024
	2025	rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
	2026
	2027	read_size = (extent->e_addr + 1) - extent->b_addr;
	2028
	2029	if (read_size > rap->cl_ralen) {
	2030	if (read_size > max_prefetch / PAGE_SIZE)
	2031	rap->cl_ralen = max_prefetch / PAGE_SIZE;
	2032	else
	2033	rap->cl_ralen = read_size;
	2034	}
	2035	size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
	2036
	2037	if (size_of_prefetch)
	2038	rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	2039	}
	2040	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) \| DBG_FUNC_END,
	2041	rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
	2042	}
	2043
	2044
	2045	int
	2046	cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2047	int size, off_t filesize, int flags)
	2048	{
	2049	return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
	2050
	2051	}
	2052
	2053
	2054	int
	2055	cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2056	int size, off_t filesize, int flags, int (callback)(buf_t, void ), void *callback_arg)
	2057	{
	2058	int io_size;
	2059	int rounded_size;
	2060	off_t max_size;
	2061	int local_flags;
	2062
	2063	local_flags = CL_PAGEOUT \| CL_THROTTLE;
	2064
	2065	if ((flags & UPL_IOSYNC) == 0)
	2066	local_flags \|= CL_ASYNC;
	2067	if ((flags & UPL_NOCOMMIT) == 0)
	2068	local_flags \|= CL_COMMIT;
	2069	if ((flags & UPL_KEEPCACHED))
	2070	local_flags \|= CL_KEEPCACHED;
	2071	if (flags & UPL_PAGING_ENCRYPTED)
	2072	local_flags \|= CL_ENCRYPTED;
	2073
	2074
	2075	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) \| DBG_FUNC_NONE,
	2076	(int)f_offset, size, (int)filesize, local_flags, 0);
	2077
	2078	/*
	2079	* If they didn't specify any I/O, then we are done...
	2080	* we can't issue an abort because we don't know how
	2081	* big the upl really is
	2082	*/
	2083	if (size <= 0)
	2084	return (EINVAL);
	2085
	2086	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
	2087	if (local_flags & CL_COMMIT)
	2088	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	2089	return (EROFS);
	2090	}
	2091	/*
	2092	* can't page-in from a negative offset
	2093	* or if we're starting beyond the EOF
	2094	* or if the file offset isn't page aligned
	2095	* or the size requested isn't a multiple of PAGE_SIZE
	2096	*/
	2097	if (f_offset < 0 \|\| f_offset >= filesize \|\|
	2098	(f_offset & PAGE_MASK_64) \|\| (size & PAGE_MASK)) {
	2099	if (local_flags & CL_COMMIT)
	2100	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
	2101	return (EINVAL);
	2102	}
	2103	max_size = filesize - f_offset;
	2104
	2105	if (size < max_size)
	2106	io_size = size;
	2107	else
	2108	io_size = max_size;
	2109
	2110	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2111
	2112	if (size > rounded_size) {
	2113	if (local_flags & CL_COMMIT)
	2114	ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
	2115	UPL_ABORT_FREE_ON_EMPTY);
	2116	}
	2117	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
	2118	local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg));
	2119	}
	2120
	2121
	2122	int
	2123	cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2124	int size, off_t filesize, int flags)
	2125	{
	2126	return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
	2127	}
	2128
	2129
	2130	int
	2131	cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
	2132	int size, off_t filesize, int flags, int (callback)(buf_t, void ), void *callback_arg)
	2133	{
	2134	u_int io_size;
	2135	int rounded_size;
	2136	off_t max_size;
	2137	int retval;
	2138	int local_flags = 0;
	2139
	2140	if (upl == NULL \|\| size < 0)
	2141	panic("cluster_pagein: NULL upl passed in");
	2142
	2143	if ((flags & UPL_IOSYNC) == 0)
	2144	local_flags \|= CL_ASYNC;
	2145	if ((flags & UPL_NOCOMMIT) == 0)
	2146	local_flags \|= CL_COMMIT;
	2147	if (flags & UPL_IOSTREAMING)
	2148	local_flags \|= CL_IOSTREAMING;
	2149	if (flags & UPL_PAGING_ENCRYPTED)
	2150	local_flags \|= CL_ENCRYPTED;
	2151
	2152
	2153	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) \| DBG_FUNC_NONE,
	2154	(int)f_offset, size, (int)filesize, local_flags, 0);
	2155
	2156	/*
	2157	* can't page-in from a negative offset
	2158	* or if we're starting beyond the EOF
	2159	* or if the file offset isn't page aligned
	2160	* or the size requested isn't a multiple of PAGE_SIZE
	2161	*/
	2162	if (f_offset < 0 \|\| f_offset >= filesize \|\|
	2163	(f_offset & PAGE_MASK_64) \|\| (size & PAGE_MASK) \|\| (upl_offset & PAGE_MASK)) {
	2164	if (local_flags & CL_COMMIT)
	2165	ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR);
	2166	return (EINVAL);
	2167	}
	2168	max_size = filesize - f_offset;
	2169
	2170	if (size < max_size)
	2171	io_size = size;
	2172	else
	2173	io_size = max_size;
	2174
	2175	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	2176
	2177	if (size > rounded_size && (local_flags & CL_COMMIT))
	2178	ubc_upl_abort_range(upl, upl_offset + rounded_size,
	2179	size - rounded_size, UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_ERROR);
	2180
	2181	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
	2182	local_flags \| CL_READ \| CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	2183
	2184	return (retval);
	2185	}
	2186
	2187
	2188	int
	2189	cluster_bp(buf_t bp)
	2190	{
	2191	return cluster_bp_ext(bp, NULL, NULL);
	2192	}
	2193
	2194
	2195	int
	2196	cluster_bp_ext(buf_t bp, int (callback)(buf_t, void ), void *callback_arg)
	2197	{
	2198	off_t f_offset;
	2199	int flags;
	2200
	2201	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) \| DBG_FUNC_START,
	2202	bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
	2203
	2204	if (bp->b_flags & B_READ)
	2205	flags = CL_ASYNC \| CL_READ;
	2206	else
	2207	flags = CL_ASYNC;
	2208	if (bp->b_flags & B_PASSIVE)
	2209	flags \|= CL_PASSIVE;
	2210
	2211	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
	2212
	2213	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg));
	2214	}
	2215
	2216
	2217
	2218	int
	2219	cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
	2220	{
	2221	return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
	2222	}
	2223
	2224
		/*
		 * cluster_write_ext
		 *
		 * Top-level write routine.  Walks the uio and, for each piece of it,
		 * dispatches to one of the write variants according to 'write_type':
		 *   IO_COPY    -> cluster_write_copy()   (write through the cache)
		 *   IO_CONTIG  -> cluster_write_contig()
		 *   IO_DIRECT  -> cluster_write_direct()
		 *   IO_UNKNOWN -> cluster_io_type() is asked to classify the next vector
		 *
		 * A NULL uio means "no user data": the call only zero-fills via
		 * cluster_write_copy().
		 *
		 * Returns 0 or the first non-zero value returned by one of the helpers;
		 * the loop stops as soon as retval becomes non-zero, the uio is drained,
		 * or uio_offset reaches newEOF.
		 *
		 * NOTE(review): in this listing the callback parameter reads
		 * "int (callback)(buf_t, void )"; the '*' declarators look like they were
		 * lost in extraction -- confirm against the original source.
		 */
	2225	int
	2226	cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
	2227	int xflags, int (callback)(buf_t, void ), void *callback_arg)
	2228	{
	2229	user_ssize_t cur_resid;
	2230	int retval = 0;
	2231	int flags;
	2232	int zflags;
	2233	int bflag;
	2234	int write_type = IO_COPY;
	2235	u_int32_t write_length;
	2236
		/* work on a private copy: 'flags' is modified as zero-fill work is consumed */
	2237	flags = xflags;
	2238
	2239	if (flags & IO_PASSIVE)
	2240	bflag = CL_PASSIVE;
	2241	else
	2242	bflag = 0;
	2243
		/* a vnode marked VNOCACHE_DATA forces the no-cache behavior for this write */
	2244	if (vp->v_flag & VNOCACHE_DATA){
	2245	flags \|= IO_NOCACHE;
	2246	bflag \|= CL_NOCACHE;
	2247	}
	2248	if (uio == NULL) {
	2249	/*
	2250	* no user data...
	2251	* this call is being made to zero-fill some range in the file
	2252	*/
	2253	retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
	2254
	2255	return(retval);
	2256	}
	2257	/*
	2258	* do a write through the cache if one of the following is true....
	2259	* NOCACHE is not true or NODIRECT is true
	2260	* the uio request doesn't target USERSPACE
	2261	* otherwise, find out if we want the direct or contig variant for
	2262	* the first vector in the uio request
	2263	*/
	2264	if ( ((flags & (IO_NOCACHE \| IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) )
	2265	retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	2266
	2267	if ( (flags & (IO_TAILZEROFILL \| IO_HEADZEROFILL)) && write_type == IO_DIRECT)
	2268	/*
	2269	* must go through the cached variant in this case
	2270	*/
	2271	write_type = IO_COPY;
	2272
	2273	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
	2274
	2275	switch (write_type) {
	2276
	2277	case IO_COPY:
	2278	/*
	2279	* make sure the uio_resid isn't too big...
	2280	* internally, we want to handle all of the I/O in
	2281	* chunk sizes that fit in a 32 bit int
	2282	*/
	2283	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
	2284	/*
	2285	* we're going to have to call cluster_write_copy
	2286	* more than once...
	2287	*
	2288	* only want the last call to cluster_write_copy to
	2289	* have the IO_TAILZEROFILL flag set and only the
	2290	* first call should have IO_HEADZEROFILL
	2291	*/
	2292	zflags = flags & ~IO_TAILZEROFILL;
	2293	flags &= ~IO_HEADZEROFILL;
	2294
	2295	write_length = MAX_IO_REQUEST_SIZE;
	2296	} else {
	2297	/*
	2298	* last call to cluster_write_copy
	2299	*/
	2300	zflags = flags;
	2301
	2302	write_length = (u_int32_t)cur_resid;
	2303	}
	2304	retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
	2305	break;
	2306
	2307	case IO_CONTIG:
		/* the contig path itself never zero-fills; head/tail fill is done separately below */
	2308	zflags = flags & ~(IO_TAILZEROFILL \| IO_HEADZEROFILL);
	2309
	2310	if (flags & IO_HEADZEROFILL) {
	2311	/*
	2312	* only do this once per request
	2313	*/
	2314	flags &= ~IO_HEADZEROFILL;
	2315
	2316	retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
	2317	headOff, (off_t)0, zflags \| IO_HEADZEROFILL \| IO_SYNC, callback, callback_arg);
	2318	if (retval)
	2319	break;
	2320	}
		/* may update write_type / write_length for the next pass of the loop */
	2321	retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
	2322
	2323	if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
	2324	/*
	2325	* we're done with the data from the user specified buffer(s)
	2326	* and we've been requested to zero fill at the tail
	2327	* treat this as an IO_HEADZEROFILL which doesn't require a uio
	2328	* by rearranging the args and passing in IO_HEADZEROFILL
	2329	*/
	2330	retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
	2331	(off_t)0, zflags \| IO_HEADZEROFILL \| IO_SYNC, callback, callback_arg);
	2332	}
	2333	break;
	2334
	2335	case IO_DIRECT:
	2336	/*
	2337	* cluster_write_direct is never called with IO_TAILZEROFILL \|\| IO_HEADZEROFILL
	2338	*/
	2339	retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
	2340	break;
	2341
	2342	case IO_UNKNOWN:
		/* no I/O on this pass: just classify the next vector */
	2343	retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
	2344	break;
	2345	}
	2346	/*
	2347	* in case we end up calling cluster_write_copy (from cluster_write_direct)
	2348	* multiple times to service a multi-vector request that is not aligned properly
	2349	* we need to update the oldEOF so that we
	2350	* don't zero-fill the head of a page if we've successfully written
	2351	* data to that area... 'cluster_write_copy' will zero-fill the head of a
	2352	* page that is beyond the oldEOF if the write is unaligned... we only
	2353	* want that to happen for the very first page of the cluster_write,
	2354	* NOT the first page of each vector making up a multi-vector write.
	2355	*/
	2356	if (uio->uio_offset > oldEOF)
	2357	oldEOF = uio->uio_offset;
	2358	}
	2359	return (retval);
	2360	}
	2361
	2362
		/*
		 * cluster_write_direct
		 *
		 * Write variant that issues I/O straight from the caller's buffer: for each
		 * chunk a UPL is created over the buffer with vm_map_get_upl() and handed
		 * to cluster_io() asynchronously.  When the uio has more than one iovec
		 * (useVectorUPL), the per-chunk UPLs are instead collected into a vector
		 * UPL and issued together through vector_cluster_io().
		 *
		 * Whatever cannot be handled here (misaligned file offset or buffer
		 * address, a tail smaller than a page, failure to obtain the pages) is left
		 * in io_req_size and written through cluster_write_copy() at the end,
		 * after all outstanding async writes have completed.
		 *
		 * On return *write_type / *write_length describe what the caller should do
		 * next: they are refreshed by cluster_io_type() when a vector was fully
		 * consumed, and *write_type is set to IO_UNKNOWN after the copy fallback.
		 *
		 * Returns 0, an error from cluster_io()/cluster_io_type()/
		 * cluster_write_copy(), the error recorded in iostate.io_error, or EAGAIN
		 * when IO_RETURN_ON_THROTTLE was requested and the throttle is in effect.
		 *
		 * NOTE(review): in this listing the signature reads "struct uio uio" and
		 * "int write_type" while the body uses uio-> and *write_type; the '*'
		 * declarators (also on the callback) look like they were lost in
		 * extraction -- confirm against the original source.
		 */
	2363	static int
	2364	cluster_write_direct(vnode_t vp, struct uio uio, off_t oldEOF, off_t newEOF, int write_type, u_int32_t *write_length,
	2365	int flags, int (callback)(buf_t, void ), void *callback_arg)
	2366	{
	2367	upl_t upl;
	2368	upl_page_info_t *pl;
	2369	vm_offset_t upl_offset;
	2370	vm_offset_t vector_upl_offset = 0;
	2371	u_int32_t io_req_size;
	2372	u_int32_t offset_in_file;
	2373	u_int32_t offset_in_iovbase;
	2374	u_int32_t io_size;
	2375	int io_flag = 0;
	2376	upl_size_t upl_size, vector_upl_size = 0;
	2377	vm_size_t upl_needed_size;
	2378	mach_msg_type_number_t pages_in_pl;
	2379	upl_control_flags_t upl_flags;
	2380	kern_return_t kret;
	2381	mach_msg_type_number_t i;
	2382	int force_data_sync;
	2383	int retval = 0;
	2384	int first_IO = 1;
	2385	struct clios iostate;
	2386	user_addr_t iov_base;
	2387	u_int32_t mem_alignment_mask;
	2388	u_int32_t devblocksize;
	2389	u_int32_t max_io_size;
	2390	u_int32_t max_upl_size;
	2391	u_int32_t max_vector_size;
	2392	u_int32_t bytes_outstanding_limit;
	2393	boolean_t io_throttled = FALSE;
	2394
		/* state of the vector UPL being assembled; cleared by reset_vector_run_state() */
	2395	u_int32_t vector_upl_iosize = 0;
	2396	int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
	2397	off_t v_upl_uio_offset = 0;
	2398	int vector_upl_index=0;
	2399	upl_t vector_upl = NULL;
	2400
	2401
	2402	/*
	2403	* When we enter this routine, we know
	2404	* -- the resid will not exceed iov_len
	2405	*/
	2406	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_START,
	2407	(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
	2408
	2409	max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
	2410
	2411	io_flag = CL_ASYNC \| CL_PRESERVE \| CL_COMMIT \| CL_THROTTLE \| CL_DIRECT_IO;
	2412
	2413	if (flags & IO_PASSIVE)
	2414	io_flag \|= CL_PASSIVE;
	2415
	2416	if (flags & IO_NOCACHE)
	2417	io_flag \|= CL_NOCACHE;
	2418
	2419	if (flags & IO_SKIP_ENCRYPTION)
	2420	io_flag \|= CL_ENCRYPTED;
	2421
		/* iostate tracks the async writes issued by this call so we can wait on them */
	2422	iostate.io_completed = 0;
	2423	iostate.io_issued = 0;
	2424	iostate.io_error = 0;
	2425	iostate.io_wanted = 0;
	2426
	2427	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	2428
	2429	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	2430	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	2431
	2432	if (devblocksize == 1) {
	2433	/*
	2434	* the AFP client advertises a devblocksize of 1
	2435	* however, its BLOCKMAP routine maps to physical
	2436	* blocks that are PAGE_SIZE in size...
	2437	* therefore we can't ask for I/Os that aren't page aligned
	2438	* or aren't multiples of PAGE_SIZE in size
	2439	* by setting devblocksize to PAGE_SIZE, we re-instate
	2440	* the old behavior we had before the mem_alignment_mask
	2441	* changes went in...
	2442	*/
	2443	devblocksize = PAGE_SIZE;
	2444	}
	2445
		/* re-entered (via goto) for each further vector that cluster_io_type() classifies as IO_DIRECT */
	2446	next_dwrite:
	2447	io_req_size = *write_length;
	2448	iov_base = uio_curriovbase(uio);
	2449
	2450	offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
	2451	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	2452
	2453	if (offset_in_file \|\| offset_in_iovbase) {
	2454	/*
	2455	* one of the 2 important offsets is misaligned
	2456	* so fire an I/O through the cache for this entire vector
	2457	*/
	2458	goto wait_for_dwrites;
	2459	}
	2460	if (iov_base & (devblocksize - 1)) {
	2461	/*
	2462	* the offset in memory must be on a device block boundary
	2463	* so that we can guarantee that we can generate an
	2464	* I/O that ends on a page boundary in cluster_io
	2465	*/
	2466	goto wait_for_dwrites;
	2467	}
	2468
	2469	task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
		/* only whole pages are written directly; any sub-page remainder is left for the copy path */
	2470	while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
	2471	int throttle_type;
	2472
	2473	if ( (throttle_type = cluster_is_throttled(vp)) ) {
	2474	/*
	2475	* we're in the throttle window, at the very least
	2476	* we want to limit the size of the I/O we're about
	2477	* to issue
	2478	*/
	2479	if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
	2480	/*
	2481	* we're in the throttle window and at least 1 I/O
	2482	* has already been issued by a throttleable thread
	2483	* in this window, so return with EAGAIN to indicate
	2484	* to the FS issuing the cluster_write call that it
	2485	* should now throttle after dropping any locks
	2486	*/
	2487	throttle_info_update_by_mount(vp->v_mount);
	2488
	2489	io_throttled = TRUE;
	2490	goto wait_for_dwrites;
	2491	}
	2492	max_vector_size = THROTTLE_MAX_IOSIZE;
	2493	max_io_size = THROTTLE_MAX_IOSIZE;
	2494	} else {
	2495	max_vector_size = MAX_VECTOR_UPL_SIZE;
	2496	max_io_size = max_upl_size;
	2497	}
	2498
		/* done once, just before the first direct I/O of this call */
	2499	if (first_IO) {
	2500	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
	2501	first_IO = 0;
	2502	}
	2503	io_size = io_req_size & ~PAGE_MASK;
	2504	iov_base = uio_curriovbase(uio);
	2505
	2506	if (io_size > max_io_size)
	2507	io_size = max_io_size;
	2508
	2509	if(useVectorUPL && (iov_base & PAGE_MASK)) {
	2510	/*
	2511	* We have an iov_base that's not page-aligned.
	2512	* Issue all I/O's that have been collected within
	2513	* this Vectored UPL.
	2514	*/
	2515	if(vector_upl_index) {
	2516	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2517	reset_vector_run_state();
	2518	}
	2519
	2520	/*
	2521	* After this point, if we are using the Vector UPL path and the base is
	2522	* not page-aligned then the UPL with that base will be the first in the vector UPL.
	2523	*/
	2524	}
	2525
	2526	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	2527	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
	2528
	2529	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_START,
	2530	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	2531
	2532	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
		/* up to 3 attempts; the attempt number is passed to vm_map_get_upl() as its last argument */
	2533	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	2534	pages_in_pl = 0;
	2535	upl_size = upl_needed_size;
	2536	upl_flags = UPL_FILE_IO \| UPL_COPYOUT_FROM \| UPL_NO_SYNC \|
	2537	UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	2538
	2539	kret = vm_map_get_upl(map,
	2540	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	2541	&upl_size,
	2542	&upl,
	2543	NULL,
	2544	&pages_in_pl,
	2545	&upl_flags,
	2546	VM_KERN_MEMORY_FILE,
	2547	force_data_sync);
	2548
	2549	if (kret != KERN_SUCCESS) {
	2550	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	2551	0, 0, 0, kret, 0);
	2552	/*
	2553	* failed to get pagelist
	2554	*
	2555	* we may have already spun some portion of this request
	2556	* off as async requests... we need to wait for the I/O
	2557	* to complete before returning
	2558	*/
	2559	goto wait_for_dwrites;
	2560	}
	2561	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	2562	pages_in_pl = upl_size / PAGE_SIZE;
	2563
		/* the UPL is usable only if every page in it is valid */
	2564	for (i = 0; i < pages_in_pl; i++) {
	2565	if (!upl_valid_page(pl, i))
	2566	break;
	2567	}
	2568	if (i == pages_in_pl)
	2569	break;
	2570
	2571	/*
	2572	* didn't get all the pages back that we
	2573	* needed... release this upl and try again
	2574	*/
	2575	ubc_upl_abort(upl, 0);
	2576	}
	2577	if (force_data_sync >= 3) {
	2578	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	2579	i, pages_in_pl, upl_size, kret, 0);
	2580	/*
	2581	* for some reason, we couldn't acquire a hold on all
	2582	* the pages needed in the user's address space
	2583	*
	2584	* we may have already spun some portion of this request
	2585	* off as async requests... we need to wait for the I/O
	2586	* to complete before returning
	2587	*/
	2588	goto wait_for_dwrites;
	2589	}
	2590
	2591	/*
	2592	* Consider the possibility that upl_size wasn't satisfied.
	2593	*/
	2594	if (upl_size < upl_needed_size) {
	2595	if (upl_size && upl_offset == 0)
	2596	io_size = upl_size;
	2597	else
	2598	io_size = 0;
	2599	}
	2600	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) \| DBG_FUNC_END,
	2601	(int)upl_offset, upl_size, (int)iov_base, io_size, 0);
	2602
	2603	if (io_size == 0) {
	2604	ubc_upl_abort(upl, 0);
	2605	/*
	2606	* we may have already spun some portion of this request
	2607	* off as async requests... we need to wait for the I/O
	2608	* to complete before returning
	2609	*/
	2610	goto wait_for_dwrites;
	2611	}
	2612
	2613	if(useVectorUPL) {
	2614	vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
	2615	if(end_off)
	2616	issueVectorUPL = 1;
	2617	/*
	2618	* After this point, if we are using a vector UPL, then
	2619	* either all the UPL elements end on a page boundary OR
	2620	* this UPL is the last element because it does not end
	2621	* on a page boundary.
	2622	*/
	2623	}
	2624
	2625	/*
	2626	* we want push out these writes asynchronously so that we can overlap
	2627	* the preparation of the next I/O
	2628	* if there are already too many outstanding writes
	2629	* wait until some complete before issuing the next
	2630	*/
	2631	if (vp->v_mount->mnt_minsaturationbytecount)
	2632	bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
	2633	else
	2634	bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
	2635
	2636	cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
	2637
	2638	if (iostate.io_error) {
	2639	/*
	2640	* one of the earlier writes we issued ran into a hard error
	2641	* don't issue any more writes, cleanup the UPL
	2642	* that was just created but not used, then
	2643	* go wait for all writes that are part of this stream
	2644	* to complete before returning the error to the caller
	2645	*/
	2646	ubc_upl_abort(upl, 0);
	2647
	2648	goto wait_for_dwrites;
	2649	}
	2650
	2651	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) \| DBG_FUNC_START,
	2652	(int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
	2653
		/* single-iovec request: issue this UPL on its own */
	2654	if(!useVectorUPL)
	2655	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
	2656	io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2657
	2658	else {
		/* first element of a new run: create the vector UPL and remember where the run starts */
	2659	if(!vector_upl_index) {
	2660	vector_upl = vector_upl_create(upl_offset);
	2661	v_upl_uio_offset = uio->uio_offset;
	2662	vector_upl_offset = upl_offset;
	2663	}
	2664
	2665	vector_upl_set_subupl(vector_upl,upl,upl_size);
	2666	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
	2667	vector_upl_index++;
	2668	vector_upl_iosize += io_size;
	2669	vector_upl_size += upl_size;
	2670
		/* issue the run when it must end here, is full, or has reached the size limit */
	2671	if(issueVectorUPL \|\| vector_upl_index == MAX_VECTOR_UPL_ELEMENTS \|\| vector_upl_size >= max_vector_size) {
	2672	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2673	reset_vector_run_state();
	2674	}
	2675	}
	2676
	2677	/*
	2678	* update the uio structure to
	2679	* reflect the I/O that we just issued
	2680	*/
	2681	uio_update(uio, (user_size_t)io_size);
	2682
	2683	/*
	2684	* in case we end up calling through to cluster_write_copy to finish
	2685	* the tail of this request, we need to update the oldEOF so that we
	2686	* don't zero-fill the head of a page if we've successfully written
	2687	* data to that area... 'cluster_write_copy' will zero-fill the head of a
	2688	* page that is beyond the oldEOF if the write is unaligned... we only
	2689	* want that to happen for the very first page of the cluster_write,
	2690	* NOT the first page of each vector making up a multi-vector write.
	2691	*/
	2692	if (uio->uio_offset > oldEOF)
	2693	oldEOF = uio->uio_offset;
	2694
	2695	io_req_size -= io_size;
	2696
	2697	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) \| DBG_FUNC_END,
	2698	(int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
	2699
	2700	} /* end while */
	2701
		/* this vector was fully consumed: classify the next one and keep going if it is also direct */
	2702	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
	2703
	2704	retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
	2705
	2706	if (retval == 0 && *write_type == IO_DIRECT) {
	2707
	2708	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_NONE,
	2709	(int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
	2710
	2711	goto next_dwrite;
	2712	}
	2713	}
	2714
	2715	wait_for_dwrites:
	2716
		/* flush a partially assembled vector UPL, unless an error has already been seen */
	2717	if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
	2718	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	2719	reset_vector_run_state();
	2720	}
	2721	/*
	2722	* make sure all async writes issued as part of this stream
	2723	* have completed before we return
	2724	*/
	2725	cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
	2726
		/* an error recorded in iostate overrides retval */
	2727	if (iostate.io_error)
	2728	retval = iostate.io_error;
	2729
	2730	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	2731
	2732	if (io_throttled == TRUE && retval == 0)
	2733	retval = EAGAIN;
	2734
	2735	if (io_req_size && retval == 0) {
	2736	/*
	2737	* we couldn't handle the tail of this request in DIRECT mode
	2738	* so fire it through the copy path
	2739	*
	2740	* note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
	2741	* so we can just pass 0 in for the headOff and tailOff
	2742	*/
	2743	if (uio->uio_offset > oldEOF)
	2744	oldEOF = uio->uio_offset;
	2745
	2746	retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
	2747
		/* tell the caller to re-classify whatever is left of the uio */
	2748	*write_type = IO_UNKNOWN;
	2749	}
	2750	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) \| DBG_FUNC_END,
	2751	(int)uio->uio_offset, io_req_size, retval, 4, 0);
	2752
	2753	return (retval);
	2754	}
	2755
	2756
		/*
		 * cluster_write_contig
		 *
		 * Write variant for source buffers that are physically contiguous (see the
		 * entry conditions in the comment below).  For each vector:
		 *   - a UPL is created over the buffer with vm_map_get_upl();
		 *   - a head that is not aligned to devblocksize (or a request smaller than
		 *     one device block) is written with cluster_align_phys_io();
		 *   - the device-block aligned middle is written with asynchronous
		 *     cluster_io() calls of at most MAX_IO_CONTIG_SIZE bytes each;
		 *   - a tail smaller than a device block is written with
		 *     cluster_align_phys_io() after all async writes have completed.
		 * Up to MAX_VECTS UPLs are held at once; all of them are released with
		 * ubc_upl_abort(upl, 0) before returning.
		 *
		 * On return *write_type is either what cluster_io_type() reported for the
		 * next vector or IO_UNKNOWN.
		 *
		 * Returns 0, EINVAL (UPL could not be obtained or was short, or the buffer
		 * does not meet the mount's memory alignment mask), an error from
		 * cluster_io()/cluster_align_phys_io()/cluster_io_type(), or the error
		 * recorded in iostate.io_error.
		 *
		 * NOTE(review): in this listing the signature reads "struct uio uio" and
		 * "int write_type" while the body uses uio-> and *write_type; the '*'
		 * declarators (also on the callback) look like they were lost in
		 * extraction -- confirm against the original source.
		 */
	2757	static int
	2758	cluster_write_contig(vnode_t vp, struct uio uio, off_t newEOF, int write_type, u_int32_t *write_length,
	2759	int (callback)(buf_t, void ), void *callback_arg, int bflag)
	2760	{
	2761	upl_page_info_t *pl;
	2762	addr64_t src_paddr = 0;
	2763	upl_t upl[MAX_VECTS];
	2764	vm_offset_t upl_offset;
	2765	u_int32_t tail_size = 0;
	2766	u_int32_t io_size;
	2767	u_int32_t xsize;
	2768	upl_size_t upl_size;
	2769	vm_size_t upl_needed_size;
	2770	mach_msg_type_number_t pages_in_pl;
	2771	upl_control_flags_t upl_flags;
	2772	kern_return_t kret;
	2773	struct clios iostate;
	2774	int error = 0;
	2775	int cur_upl = 0;
	2776	int num_upl = 0;
	2777	int n;
	2778	user_addr_t iov_base;
	2779	u_int32_t devblocksize;
	2780	u_int32_t mem_alignment_mask;
	2781
	2782	/*
	2783	* When we enter this routine, we know
	2784	* -- the io_req_size will not exceed iov_len
	2785	* -- the target address is physically contiguous
	2786	*/
	2787	cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
	2788
	2789	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	2790	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	2791
		/* iostate tracks the async writes issued by this call so we can wait on them */
	2792	iostate.io_completed = 0;
	2793	iostate.io_issued = 0;
	2794	iostate.io_error = 0;
	2795	iostate.io_wanted = 0;
	2796
	2797	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	2798
		/* re-entered (via goto) for each further vector that cluster_io_type() classifies as IO_CONTIG */
	2799	next_cwrite:
	2800	io_size = *write_length;
	2801
	2802	iov_base = uio_curriovbase(uio);
	2803
	2804	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	2805	upl_needed_size = upl_offset + io_size;
	2806
	2807	pages_in_pl = 0;
	2808	upl_size = upl_needed_size;
	2809	upl_flags = UPL_FILE_IO \| UPL_COPYOUT_FROM \| UPL_NO_SYNC \|
	2810	UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	2811
	2812	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	2813	kret = vm_map_get_upl(map,
	2814	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	2815	&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
	2816
	2817	if (kret != KERN_SUCCESS) {
	2818	/*
	2819	* failed to get pagelist
	2820	*/
	2821	error = EINVAL;
	2822	goto wait_for_cwrites;
	2823	}
		/* counted only after success, so the release loop at the end touches valid UPLs only */
	2824	num_upl++;
	2825
	2826	/*
	2827	* Consider the possibility that upl_size wasn't satisfied.
	2828	*/
	2829	if (upl_size < upl_needed_size) {
	2830	/*
	2831	* This is a failure in the physical memory case.
	2832	*/
	2833	error = EINVAL;
	2834	goto wait_for_cwrites;
	2835	}
	2836	pl = ubc_upl_pageinfo(upl[cur_upl]);
	2837
		/* physical address of the first source byte: first page of the UPL plus the in-page offset */
	2838	src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
	2839
		/* handle a head that is not device-block aligned, or a request shorter than one block */
	2840	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	2841	u_int32_t head_size;
	2842
	2843	head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
	2844
	2845	if (head_size > io_size)
	2846	head_size = io_size;
	2847
	2848	error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
	2849
	2850	if (error)
	2851	goto wait_for_cwrites;
	2852
	2853	upl_offset += head_size;
	2854	src_paddr += head_size;
	2855	io_size -= head_size;
	2856
	2857	iov_base += head_size;
	2858	}
	2859	if ((u_int32_t)iov_base & mem_alignment_mask) {
	2860	/*
	2861	* request doesn't set up on a memory boundary
	2862	* the underlying DMA engine can handle...
	2863	* return an error instead of going through
	2864	* the slow copy path since the intent of this
	2865	* path is direct I/O from device memory
	2866	*/
	2867	error = EINVAL;
	2868	goto wait_for_cwrites;
	2869	}
	2870
		/* split off the partial device block at the end; it is written after the async I/O drains */
	2871	tail_size = io_size & (devblocksize - 1);
	2872	io_size -= tail_size;
	2873
	2874	while (io_size && error == 0) {
	2875
	2876	if (io_size > MAX_IO_CONTIG_SIZE)
	2877	xsize = MAX_IO_CONTIG_SIZE;
	2878	else
	2879	xsize = io_size;
	2880	/*
	2881	* request asynchronously so that we can overlap
	2882	* the preparation of the next I/O... we'll do
	2883	* the commit after all the I/O has completed
	2884	* since its all issued against the same UPL
	2885	* if there are already too many outstanding writes
	2886	* wait until some have completed before issuing the next
	2887	*/
	2888	cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
	2889
	2890	if (iostate.io_error) {
	2891	/*
	2892	* one of the earlier writes we issued ran into a hard error
	2893	* don't issue any more writes...
	2894	* go wait for all writes that are part of this stream
	2895	* to complete before returning the error to the caller
	2896	*/
	2897	goto wait_for_cwrites;
	2898	}
	2899	/*
	2900	* issue an asynchronous write to cluster_io
	2901	*/
	2902	error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
	2903	xsize, CL_DEV_MEMORY \| CL_ASYNC \| bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
	2904
	2905	if (error == 0) {
	2906	/*
	2907	* The cluster_io write completed successfully,
	2908	* update the uio structure
	2909	*/
	2910	uio_update(uio, (user_size_t)xsize);
	2911
	2912	upl_offset += xsize;
	2913	src_paddr += xsize;
	2914	io_size -= xsize;
	2915	}
	2916	}
		/* move on to the next vector only if nothing failed, no tail is pending and a UPL slot is free */
	2917	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
	2918
	2919	error = cluster_io_type(uio, write_type, write_length, 0);
	2920
	2921	if (error == 0 && *write_type == IO_CONTIG) {
	2922	cur_upl++;
	2923	goto next_cwrite;
	2924	}
	2925	} else
	2926	*write_type = IO_UNKNOWN;
	2927
	2928	wait_for_cwrites:
	2929	/*
	2930	* make sure all async writes that are part of this stream
	2931	* have completed before we proceed
	2932	*/
	2933	cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
	2934
		/* an error recorded in iostate overrides the local error */
	2935	if (iostate.io_error)
	2936	error = iostate.io_error;
	2937
	2938	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	2939
	2940	if (error == 0 && tail_size)
	2941	error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
	2942
	2943	for (n = 0; n < num_upl; n++)
	2944	/*
	2945	* just release our hold on each physically contiguous
	2946	* region without changing any state
	2947	*/
	2948	ubc_upl_abort(upl[n], 0);
	2949
	2950	return (error);
	2951	}
	2952
	2953
	2954	/*
	2955	* need to avoid a race between an msync of a range of pages dirtied via mmap
	2956	* vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
	2957	* zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
	2958	*
	2959	* we should never force-zero-fill pages that are already valid in the cache...
	2960	* the entire page contains valid data (either from disk, zero-filled or dirtied
	2961	* via an mmap) so we can only do damage by trying to zero-fill
	2962	*
	2963	*/
	2964	static int
	2965	cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
	2966	{
	2967	int zero_pg_index;
	2968	boolean_t need_cluster_zero = TRUE;
	2969
	2970	if ((flags & (IO_NOZEROVALID \| IO_NOZERODIRTY))) {
	2971
	2972	bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
	2973	zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
	2974
	2975	if (upl_valid_page(pl, zero_pg_index)) {
	2976	/*
	2977	* never force zero valid pages - dirty or clean
	2978	* we'll leave these in the UPL for cluster_write_copy to deal with
	2979	*/
	2980	need_cluster_zero = FALSE;
	2981	}
	2982	}
	2983	if (need_cluster_zero == TRUE)
	2984	cluster_zero(upl, io_offset, bytes_to_zero, NULL);
	2985
	2986	return (bytes_to_zero);
	2987	}
	2988
	2989
	2990	void
	2991	cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
	2992	{
	2993	struct cl_extent cl;
	2994	boolean_t first_pass = TRUE;
	2995
	2996	assert(s_offset < e_offset);
	2997	assert((s_offset & PAGE_MASK_64) == 0);
	2998	assert((e_offset & PAGE_MASK_64) == 0);
	2999
	3000	cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
	3001	cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
	3002
	3003	cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
	3004	vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
	3005	}
	3006
	3007
	3008	static void
	3009	cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
	3010	boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
	3011	int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	3012	{
	3013	struct cl_writebehind *wbp;
	3014	int cl_index;
	3015	int ret_cluster_try_push;
	3016	u_int max_cluster_pgcount;
	3017
	3018
	3019	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	3020
	3021	/*
	3022	* take the lock to protect our accesses
	3023	* of the writebehind and sparse cluster state
	3024	*/
	3025	wbp = cluster_get_wbp(vp, CLW_ALLOCATE \| CLW_RETURNLOCKED);
	3026
	3027	if (wbp->cl_scmap) {
	3028
	3029	if ( !(flags & IO_NOCACHE)) {
	3030	/*
	3031	* we've fallen into the sparse
	3032	* cluster method of delaying dirty pages
	3033	*/
	3034	sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
	3035
	3036	lck_mtx_unlock(&wbp->cl_lockw);
	3037	return;
	3038	}
	3039	/*
	3040	* must have done cached writes that fell into
	3041	* the sparse cluster mechanism... we've switched
	3042	* to uncached writes on the file, so go ahead
	3043	* and push whatever's in the sparse map
	3044	* and switch back to normal clustering
	3045	*/
	3046	wbp->cl_number = 0;
	3047
	3048	sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
	3049	/*
	3050	* no clusters of either type present at this point
	3051	* so just go directly to start_new_cluster since
	3052	* we know we need to delay this I/O since we've
	3053	* already released the pages back into the cache
	3054	* to avoid the deadlock with sparse_cluster_push
	3055	*/
	3056	goto start_new_cluster;
	3057	}
	3058	if (*first_pass == TRUE) {
	3059	if (write_off == wbp->cl_last_write)
	3060	wbp->cl_seq_written += write_cnt;
	3061	else
	3062	wbp->cl_seq_written = write_cnt;
	3063
	3064	wbp->cl_last_write = write_off + write_cnt;
	3065
	3066	*first_pass = FALSE;
	3067	}
	3068	if (wbp->cl_number == 0)
	3069	/*
	3070	* no clusters currently present
	3071	*/
	3072	goto start_new_cluster;
	3073
	3074	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	3075	/*
	3076	* check each cluster that we currently hold
	3077	* try to merge some or all of this write into
	3078	* one or more of the existing clusters... if
	3079	* any portion of the write remains, start a
	3080	* new cluster
	3081	*/
	3082	if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
	3083	/*
	3084	* the current write starts at or after the current cluster
	3085	*/
	3086	if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
	3087	/*
	3088	* we have a write that fits entirely
	3089	* within the existing cluster limits
	3090	*/
	3091	if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr)
	3092	/*
	3093	* update our idea of where the cluster ends
	3094	*/
	3095	wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
	3096	break;
	3097	}
	3098	if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
	3099	/*
	3100	* we have a write that starts in the middle of the current cluster
	3101	* but extends beyond the cluster's limit... we know this because
	3102	* of the previous checks
	3103	* we'll extend the current cluster to the max
	3104	* and update the b_addr for the current write to reflect that
	3105	* the head of it was absorbed into this cluster...
	3106	* note that we'll always have a leftover tail in this case since
	3107	* full absorbtion would have occurred in the clause above
	3108	*/
	3109	wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
	3110
	3111	cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
	3112	}
	3113	/*
	3114	* we come here for the case where the current write starts
	3115	* beyond the limit of the existing cluster or we have a leftover
	3116	* tail after a partial absorbtion
	3117	*
	3118	* in either case, we'll check the remaining clusters before
	3119	* starting a new one
	3120	*/
	3121	} else {
	3122	/*
	3123	* the current write starts in front of the cluster we're currently considering
	3124	*/
	3125	if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
	3126	/*
	3127	* we can just merge the new request into
	3128	* this cluster and leave it in the cache
	3129	* since the resulting cluster is still
	3130	* less than the maximum allowable size
	3131	*/
	3132	wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
	3133
	3134	if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
	3135	/*
	3136	* the current write completely
	3137	* envelops the existing cluster and since
	3138	* each write is limited to at most max_cluster_pgcount pages
	3139	* we can just use the start and last blocknos of the write
	3140	* to generate the cluster limits
	3141	*/
	3142	wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
	3143	}
	3144	break;
	3145	}
	3146	/*
	3147	* if we were to combine this write with the current cluster
	3148	* we would exceed the cluster size limit.... so,
	3149	* let's see if there's any overlap of the new I/O with
	3150	* the cluster we're currently considering... in fact, we'll
	3151	* stretch the cluster out to it's full limit and see if we
	3152	* get an intersection with the current write
	3153	*
	3154	*/
	3155	if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
	3156	/*
	3157	* the current write extends into the proposed cluster
	3158	* clip the length of the current write after first combining it's
	3159	* tail with the newly shaped cluster
	3160	*/
	3161	wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
	3162
	3163	cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
	3164	}
	3165	/*
	3166	* if we get here, there was no way to merge
	3167	* any portion of this write with this cluster
	3168	* or we could only merge part of it which
	3169	* will leave a tail...
	3170	* we'll check the remaining clusters before starting a new one
	3171	*/
	3172	}
	3173	}
	3174	if (cl_index < wbp->cl_number)
	3175	/*
	3176	* we found an existing cluster(s) that we
	3177	* could entirely merge this I/O into
	3178	*/
	3179	goto delay_io;
	3180
	3181	if (defer_writes == FALSE &&
	3182	wbp->cl_number == MAX_CLUSTERS &&
	3183	wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
	3184	uint32_t n;
	3185
	3186	if (vp->v_mount->mnt_minsaturationbytecount) {
	3187	n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
	3188
	3189	if (n > MAX_CLUSTERS)
	3190	n = MAX_CLUSTERS;
	3191	} else
	3192	n = 0;
	3193
	3194	if (n == 0) {
	3195	if (disk_conditioner_mount_is_ssd(vp->v_mount))
	3196	n = WRITE_BEHIND_SSD;
	3197	else
	3198	n = WRITE_BEHIND;
	3199	}
	3200	while (n--)
	3201	cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
	3202	}
	3203	if (wbp->cl_number < MAX_CLUSTERS) {
	3204	/*
	3205	* we didn't find an existing cluster to
	3206	* merge into, but there's room to start
	3207	* a new one
	3208	*/
	3209	goto start_new_cluster;
	3210	}
	3211	/*
	3212	* no exisitng cluster to merge with and no
	3213	* room to start a new one... we'll try
	3214	* pushing one of the existing ones... if none of
	3215	* them are able to be pushed, we'll switch
	3216	* to the sparse cluster mechanism
	3217	* cluster_try_push updates cl_number to the
	3218	* number of remaining clusters... and
	3219	* returns the number of currently unused clusters
	3220	*/
	3221	ret_cluster_try_push = 0;
	3222
	3223	/*
	3224	* if writes are not deferred, call cluster push immediately
	3225	*/
	3226	if (defer_writes == FALSE) {
	3227
	3228	ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
	3229	}
	3230	/*
	3231	* execute following regardless of writes being deferred or not
	3232	*/
	3233	if (ret_cluster_try_push == 0) {
	3234	/*
	3235	* no more room in the normal cluster mechanism
	3236	* so let's switch to the more expansive but expensive
	3237	* sparse mechanism....
	3238	*/
	3239	sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
	3240	sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
	3241
	3242	lck_mtx_unlock(&wbp->cl_lockw);
	3243	return;
	3244	}
	3245	start_new_cluster:
	3246	wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
	3247	wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
	3248
	3249	wbp->cl_clusters[wbp->cl_number].io_flags = 0;
	3250
	3251	if (flags & IO_NOCACHE)
	3252	wbp->cl_clusters[wbp->cl_number].io_flags \|= CLW_IONOCACHE;
	3253
	3254	if (flags & IO_PASSIVE)
	3255	wbp->cl_clusters[wbp->cl_number].io_flags \|= CLW_IOPASSIVE;
	3256
	3257	wbp->cl_number++;
	3258	delay_io:
	3259	lck_mtx_unlock(&wbp->cl_lockw);
	3260	return;
	3261	}
	3262
	3263
	3264	static int
	3265	cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
	3266	off_t tailOff, int flags, int (callback)(buf_t, void ), void *callback_arg)
	3267	{
	3268	upl_page_info_t *pl;
	3269	upl_t upl;
	3270	vm_offset_t upl_offset = 0;
	3271	vm_size_t upl_size;
	3272	off_t upl_f_offset;
	3273	int pages_in_upl;
	3274	int start_offset;
	3275	int xfer_resid;
	3276	int io_size;
	3277	int io_offset;
	3278	int bytes_to_zero;
	3279	int bytes_to_move;
	3280	kern_return_t kret;
	3281	int retval = 0;
	3282	int io_resid;
	3283	long long total_size;
	3284	long long zero_cnt;
	3285	off_t zero_off;
	3286	long long zero_cnt1;
	3287	off_t zero_off1;
	3288	off_t write_off = 0;
	3289	int write_cnt = 0;
	3290	boolean_t first_pass = FALSE;
	3291	struct cl_extent cl;
	3292	int bflag;
	3293	u_int max_io_size;
	3294
	3295	if (uio) {
	3296	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	3297	(int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
	3298
	3299	io_resid = io_req_size;
	3300	} else {
	3301	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_START,
	3302	0, 0, (int)oldEOF, (int)newEOF, 0);
	3303
	3304	io_resid = 0;
	3305	}
	3306	if (flags & IO_PASSIVE)
	3307	bflag = CL_PASSIVE;
	3308	else
	3309	bflag = 0;
	3310	if (flags & IO_NOCACHE)
	3311	bflag \|= CL_NOCACHE;
	3312
	3313	if (flags & IO_SKIP_ENCRYPTION)
	3314	bflag \|= CL_ENCRYPTED;
	3315
	3316	zero_cnt = 0;
	3317	zero_cnt1 = 0;
	3318	zero_off = 0;
	3319	zero_off1 = 0;
	3320
	3321	max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
	3322
	3323	if (flags & IO_HEADZEROFILL) {
	3324	/*
	3325	* some filesystems (HFS is one) don't support unallocated holes within a file...
	3326	* so we zero fill the intervening space between the old EOF and the offset
	3327	* where the next chunk of real data begins.... ftruncate will also use this
	3328	* routine to zero fill to the new EOF when growing a file... in this case, the
	3329	* uio structure will not be provided
	3330	*/
	3331	if (uio) {
	3332	if (headOff < uio->uio_offset) {
	3333	zero_cnt = uio->uio_offset - headOff;
	3334	zero_off = headOff;
	3335	}
	3336	} else if (headOff < newEOF) {
	3337	zero_cnt = newEOF - headOff;
	3338	zero_off = headOff;
	3339	}
	3340	} else {
	3341	if (uio && uio->uio_offset > oldEOF) {
	3342	zero_off = uio->uio_offset & ~PAGE_MASK_64;
	3343
	3344	if (zero_off >= oldEOF) {
	3345	zero_cnt = uio->uio_offset - zero_off;
	3346
	3347	flags \|= IO_HEADZEROFILL;
	3348	}
	3349	}
	3350	}
	3351	if (flags & IO_TAILZEROFILL) {
	3352	if (uio) {
	3353	zero_off1 = uio->uio_offset + io_req_size;
	3354
	3355	if (zero_off1 < tailOff)
	3356	zero_cnt1 = tailOff - zero_off1;
	3357	}
	3358	} else {
	3359	if (uio && newEOF > oldEOF) {
	3360	zero_off1 = uio->uio_offset + io_req_size;
	3361
	3362	if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
	3363	zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
	3364
	3365	flags \|= IO_TAILZEROFILL;
	3366	}
	3367	}
	3368	}
	3369	if (zero_cnt == 0 && uio == (struct uio *) 0) {
	3370	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END,
	3371	retval, 0, 0, 0, 0);
	3372	return (0);
	3373	}
	3374	if (uio) {
	3375	write_off = uio->uio_offset;
	3376	write_cnt = uio_resid(uio);
	3377	/*
	3378	* delay updating the sequential write info
	3379	* in the control block until we've obtained
	3380	* the lock for it
	3381	*/
	3382	first_pass = TRUE;
	3383	}
	3384	while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
	3385	/*
	3386	* for this iteration of the loop, figure out where our starting point is
	3387	*/
	3388	if (zero_cnt) {
	3389	start_offset = (int)(zero_off & PAGE_MASK_64);
	3390	upl_f_offset = zero_off - start_offset;
	3391	} else if (io_resid) {
	3392	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3393	upl_f_offset = uio->uio_offset - start_offset;
	3394	} else {
	3395	start_offset = (int)(zero_off1 & PAGE_MASK_64);
	3396	upl_f_offset = zero_off1 - start_offset;
	3397	}
	3398	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) \| DBG_FUNC_NONE,
	3399	(int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
	3400
	3401	if (total_size > max_io_size)
	3402	total_size = max_io_size;
	3403
	3404	cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
	3405
	3406	if (uio && ((flags & (IO_SYNC \| IO_HEADZEROFILL \| IO_TAILZEROFILL)) == 0)) {
	3407	/*
	3408	* assumption... total_size <= io_resid
	3409	* because IO_HEADZEROFILL and IO_TAILZEROFILL not set
	3410	*/
	3411	if ((start_offset + total_size) > max_io_size)
	3412	total_size = max_io_size - start_offset;
	3413	xfer_resid = total_size;
	3414
	3415	retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
	3416
	3417	if (retval)
	3418	break;
	3419
	3420	io_resid -= (total_size - xfer_resid);
	3421	total_size = xfer_resid;
	3422	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	3423	upl_f_offset = uio->uio_offset - start_offset;
	3424
	3425	if (total_size == 0) {
	3426	if (start_offset) {
	3427	/*
	3428	* the write did not finish on a page boundary
	3429	* which will leave upl_f_offset pointing to the
	3430	* beginning of the last page written instead of
	3431	* the page beyond it... bump it in this case
	3432	* so that the cluster code records the last page
	3433	* written as dirty
	3434	*/
	3435	upl_f_offset += PAGE_SIZE_64;
	3436	}
	3437	upl_size = 0;
	3438
	3439	goto check_cluster;
	3440	}
	3441	}
	3442	/*
	3443	* compute the size of the upl needed to encompass
	3444	* the requested write... limit each call to cluster_io
	3445	* to the maximum UPL size... cluster_io will clip if
	3446	* this exceeds the maximum io_size for the device,
	3447	* make sure to account for
	3448	* a starting offset that's not page aligned
	3449	*/
	3450	upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	3451
	3452	if (upl_size > max_io_size)
	3453	upl_size = max_io_size;
	3454
	3455	pages_in_upl = upl_size / PAGE_SIZE;
	3456	io_size = upl_size - start_offset;
	3457
	3458	if ((long long)io_size > total_size)
	3459	io_size = total_size;
	3460
	3461	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
	3462
	3463
	3464	/*
	3465	* Gather the pages from the buffer cache.
	3466	* The UPL_WILL_MODIFY flag lets the UPL subsystem know
	3467	* that we intend to modify these pages.
	3468	*/
	3469	kret = ubc_create_upl_kernel(vp,
	3470	upl_f_offset,
	3471	upl_size,
	3472	&upl,
	3473	&pl,
	3474	UPL_SET_LITE \| (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
	3475	VM_KERN_MEMORY_FILE);
	3476	if (kret != KERN_SUCCESS)
	3477	panic("cluster_write_copy: failed to get pagelist");
	3478
	3479	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END,
	3480	upl, (int)upl_f_offset, start_offset, 0, 0);
	3481
	3482	if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
	3483	int read_size;
	3484
	3485	/*
	3486	* we're starting in the middle of the first page of the upl
	3487	* and the page isn't currently valid, so we're going to have
	3488	* to read it in first... this is a synchronous operation
	3489	*/
	3490	read_size = PAGE_SIZE;
	3491
	3492	if ((upl_f_offset + read_size) > oldEOF)
	3493	read_size = oldEOF - upl_f_offset;
	3494
	3495	retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
	3496	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	3497	if (retval) {
	3498	/*
	3499	* we had an error during the read which causes us to abort
	3500	* the current cluster_write request... before we do, we need
	3501	* to release the rest of the pages in the upl without modifying
	3502	* there state and mark the failed page in error
	3503	*/
	3504	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES\|UPL_ABORT_FREE_ON_EMPTY);
	3505
	3506	if (upl_size > PAGE_SIZE)
	3507	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3508
	3509	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3510	upl, 0, 0, retval, 0);
	3511	break;
	3512	}
	3513	}
	3514	if ((start_offset == 0 \|\| upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
	3515	/*
	3516	* the last offset we're writing to in this upl does not end on a page
	3517	* boundary... if it's not beyond the old EOF, then we'll also need to
	3518	* pre-read this page in if it isn't already valid
	3519	*/
	3520	upl_offset = upl_size - PAGE_SIZE;
	3521
	3522	if ((upl_f_offset + start_offset + io_size) < oldEOF &&
	3523	!upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
	3524	int read_size;
	3525
	3526	read_size = PAGE_SIZE;
	3527
	3528	if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF)
	3529	read_size = oldEOF - (upl_f_offset + upl_offset);
	3530
	3531	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
	3532	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	3533	if (retval) {
	3534	/*
	3535	* we had an error during the read which causes us to abort
	3536	* the current cluster_write request... before we do, we
	3537	* need to release the rest of the pages in the upl without
	3538	* modifying there state and mark the failed page in error
	3539	*/
	3540	ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES\|UPL_ABORT_FREE_ON_EMPTY);
	3541
	3542	if (upl_size > PAGE_SIZE)
	3543	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3544
	3545	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3546	upl, 0, 0, retval, 0);
	3547	break;
	3548	}
	3549	}
	3550	}
	3551	xfer_resid = io_size;
	3552	io_offset = start_offset;
	3553
	3554	while (zero_cnt && xfer_resid) {
	3555
	3556	if (zero_cnt < (long long)xfer_resid)
	3557	bytes_to_zero = zero_cnt;
	3558	else
	3559	bytes_to_zero = xfer_resid;
	3560
	3561	bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
	3562
	3563	xfer_resid -= bytes_to_zero;
	3564	zero_cnt -= bytes_to_zero;
	3565	zero_off += bytes_to_zero;
	3566	io_offset += bytes_to_zero;
	3567	}
	3568	if (xfer_resid && io_resid) {
	3569	u_int32_t io_requested;
	3570
	3571	bytes_to_move = min(io_resid, xfer_resid);
	3572	io_requested = bytes_to_move;
	3573
	3574	retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
	3575
	3576	if (retval) {
	3577	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	3578
	3579	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) \| DBG_FUNC_NONE,
	3580	upl, 0, 0, retval, 0);
	3581	} else {
	3582	io_resid -= bytes_to_move;
	3583	xfer_resid -= bytes_to_move;
	3584	io_offset += bytes_to_move;
	3585	}
	3586	}
	3587	while (xfer_resid && zero_cnt1 && retval == 0) {
	3588
	3589	if (zero_cnt1 < (long long)xfer_resid)
	3590	bytes_to_zero = zero_cnt1;
	3591	else
	3592	bytes_to_zero = xfer_resid;
	3593
	3594	bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
	3595
	3596	xfer_resid -= bytes_to_zero;
	3597	zero_cnt1 -= bytes_to_zero;
	3598	zero_off1 += bytes_to_zero;
	3599	io_offset += bytes_to_zero;
	3600	}
	3601	if (retval == 0) {
	3602	int do_zeroing = 1;
	3603
	3604	io_size += start_offset;
	3605
	3606	/* Force more restrictive zeroing behavior only on APFS */
	3607	if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
	3608	do_zeroing = 0;
	3609	}
	3610
	3611	if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
	3612
	3613	/*
	3614	* if we're extending the file with this write
	3615	* we'll zero fill the rest of the page so that
	3616	* if the file gets extended again in such a way as to leave a
	3617	* hole starting at this EOF, we'll have zero's in the correct spot
	3618	*/
	3619	cluster_zero(upl, io_size, upl_size - io_size, NULL);
	3620	}
	3621	/*
	3622	* release the upl now if we hold one since...
	3623	* 1) pages in it may be present in the sparse cluster map
	3624	* and may span 2 separate buckets there... if they do and
	3625	* we happen to have to flush a bucket to make room and it intersects
	3626	* this upl, a deadlock may result on page BUSY
	3627	* 2) we're delaying the I/O... from this point forward we're just updating
	3628	* the cluster state... no need to hold the pages, so commit them
	3629	* 3) IO_SYNC is set...
	3630	* because we had to ask for a UPL that provides currenty non-present pages, the
	3631	* UPL has been automatically set to clear the dirty flags (both software and hardware)
	3632	* upon committing it... this is not the behavior we want since it's possible for
	3633	* pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
	3634	* we'll pick these pages back up later with the correct behavior specified.
	3635	* 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
	3636	* of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
	3637	* we hold since the flushing context is holding the cluster lock.
	3638	*/
	3639	ubc_upl_commit_range(upl, 0, upl_size,
	3640	UPL_COMMIT_SET_DIRTY \| UPL_COMMIT_INACTIVATE \| UPL_COMMIT_FREE_ON_EMPTY);
	3641	check_cluster:
	3642	/*
	3643	* calculate the last logical block number
	3644	* that this delayed I/O encompassed
	3645	*/
	3646	cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
	3647
	3648	if (flags & IO_SYNC) {
	3649	/*
	3650	* if the IO_SYNC flag is set than we need to bypass
	3651	* any clustering and immediately issue the I/O
	3652	*
	3653	* we don't hold the lock at this point
	3654	*
	3655	* we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
	3656	* so that we correctly deal with a change in state of the hardware modify bit...
	3657	* we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
	3658	* cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
	3659	* responsible for generating the correct sized I/O(s)
	3660	*/
	3661	retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
	3662	} else {
	3663	boolean_t defer_writes = FALSE;
	3664
	3665	if (vfs_flags(vp->v_mount) & MNT_DEFWRITE)
	3666	defer_writes = TRUE;
	3667
	3668	cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
	3669	write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
	3670	}
	3671	}
	3672	}
	3673	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) \| DBG_FUNC_END, retval, 0, io_resid, 0, 0);
	3674
	3675	return (retval);
	3676	}
	3677
	3678
	3679
	3680	int
	3681	cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
	3682	{
	3683	return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
	3684	}
	3685
	3686
	3687	int
	3688	cluster_read_ext(vnode_t vp, struct uio uio, off_t filesize, int xflags, int (callback)(buf_t, void ), void callback_arg)
	3689	{
	3690	int retval = 0;
	3691	int flags;
	3692	user_ssize_t cur_resid;
	3693	u_int32_t io_size;
	3694	u_int32_t read_length = 0;
	3695	int read_type = IO_COPY;
	3696
	3697	flags = xflags;
	3698
	3699	if (vp->v_flag & VNOCACHE_DATA)
	3700	flags \|= IO_NOCACHE;
	3701	if ((vp->v_flag & VRAOFF) \|\| speculative_reads_disabled)
	3702	flags \|= IO_RAOFF;
	3703
	3704	if (flags & IO_SKIP_ENCRYPTION)
	3705	flags \|= IO_ENCRYPTED;
	3706
	3707	/*
	3708	* do a read through the cache if one of the following is true....
	3709	* NOCACHE is not true
	3710	* the uio request doesn't target USERSPACE
	3711	* Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
	3712	* Reading encrypted data from a CP filesystem should never result in the data touching
	3713	* the UBC.
	3714	*
	3715	* otherwise, find out if we want the direct or contig variant for
	3716	* the first vector in the uio request
	3717	*/
	3718	if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) \|\| (flags & IO_ENCRYPTED) ) {
	3719
	3720	retval = cluster_io_type(uio, &read_type, &read_length, 0);
	3721	}
	3722
	3723	while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
	3724
	3725	switch (read_type) {
	3726
	3727	case IO_COPY:
	3728	/*
	3729	* make sure the uio_resid isn't too big...
	3730	* internally, we want to handle all of the I/O in
	3731	* chunk sizes that fit in a 32 bit int
	3732	*/
	3733	if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE))
	3734	io_size = MAX_IO_REQUEST_SIZE;
	3735	else
	3736	io_size = (u_int32_t)cur_resid;
	3737
	3738	retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
	3739	break;
	3740
	3741	case IO_DIRECT:
	3742	retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
	3743	break;
	3744
	3745	case IO_CONTIG:
	3746	retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
	3747	break;
	3748
	3749	case IO_UNKNOWN:
	3750	retval = cluster_io_type(uio, &read_type, &read_length, 0);
	3751	break;
	3752	}
	3753	}
	3754	return (retval);
	3755	}
	3756
	3757
	3758
	3759	static void
	3760	cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
	3761	{
	3762	int range;
	3763	int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	3764
	3765	if ((range = last_pg - start_pg)) {
	3766	if (take_reference)
	3767	abort_flags \|= UPL_ABORT_REFERENCE;
	3768
	3769	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
	3770	}
	3771	}
	3772
	3773
	3774	static int
	3775	cluster_read_copy(vnode_t vp, struct uio uio, u_int32_t io_req_size, off_t filesize, int flags, int (callback)(buf_t, void ), void callback_arg)
	3776	{
	3777	upl_page_info_t *pl;
	3778	upl_t upl;
	3779	vm_offset_t upl_offset;
	3780	u_int32_t upl_size;
	3781	off_t upl_f_offset;
	3782	int start_offset;
	3783	int start_pg;
	3784	int last_pg;
	3785	int uio_last = 0;
	3786	int pages_in_upl;
	3787	off_t max_size;
	3788	off_t last_ioread_offset;
	3789	off_t last_request_offset;
	3790	kern_return_t kret;
	3791	int error = 0;
	3792	int retval = 0;
	3793	u_int32_t size_of_prefetch;
	3794	u_int32_t xsize;
	3795	u_int32_t io_size;
	3796	u_int32_t max_rd_size;
	3797	u_int32_t max_io_size;
	3798	u_int32_t max_prefetch;
	3799	u_int rd_ahead_enabled = 1;
	3800	u_int prefetch_enabled = 1;
	3801	struct cl_readahead * rap;
	3802	struct clios iostate;
	3803	struct cl_extent extent;
	3804	int bflag;
	3805	int take_reference = 1;
	3806	int policy = IOPOL_DEFAULT;
	3807	boolean_t iolock_inited = FALSE;
	3808
	3809	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_START,
	3810	(int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
	3811
	3812	if (flags & IO_ENCRYPTED) {
	3813	panic ("encrypted blocks will hit UBC!");
	3814	}
	3815
	3816	policy = throttle_get_io_policy(NULL);
	3817
	3818	if (policy == THROTTLE_LEVEL_TIER3 \|\| policy == THROTTLE_LEVEL_TIER2 \|\| (flags & IO_NOCACHE))
	3819	take_reference = 0;
	3820
	3821	if (flags & IO_PASSIVE)
	3822	bflag = CL_PASSIVE;
	3823	else
	3824	bflag = 0;
	3825
	3826	if (flags & IO_NOCACHE)
	3827	bflag \|= CL_NOCACHE;
	3828
	3829	if (flags & IO_SKIP_ENCRYPTION)
	3830	bflag \|= CL_ENCRYPTED;
	3831
	3832	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	3833	max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
	3834	max_rd_size = max_prefetch;
	3835
	3836	last_request_offset = uio->uio_offset + io_req_size;
	3837
	3838	if (last_request_offset > filesize)
	3839	last_request_offset = filesize;
	3840
	3841	if ((flags & (IO_RAOFF\|IO_NOCACHE)) \|\| ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
	3842	rd_ahead_enabled = 0;
	3843	rap = NULL;
	3844	} else {
	3845	if (cluster_is_throttled(vp)) {
	3846	/*
	3847	* we're in the throttle window, at the very least
	3848	* we want to limit the size of the I/O we're about
	3849	* to issue
	3850	*/
	3851	rd_ahead_enabled = 0;
	3852	prefetch_enabled = 0;
	3853
	3854	max_rd_size = THROTTLE_MAX_IOSIZE;
	3855	}
	3856	if ((rap = cluster_get_rap(vp)) == NULL)
	3857	rd_ahead_enabled = 0;
	3858	else {
	3859	extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
	3860	extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
	3861	}
	3862	}
	3863	if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr \|\| (rap->cl_lastr + 1) == extent.b_addr)) {
	3864	/*
	3865	* determine if we already have a read-ahead in the pipe courtesy of the
	3866	* last read systemcall that was issued...
	3867	* if so, pick up its extent to determine where we should start
	3868	* with respect to any read-ahead that might be necessary to
	3869	* garner all the data needed to complete this read systemcall
	3870	*/
	3871	last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
	3872
	3873	if (last_ioread_offset < uio->uio_offset)
	3874	last_ioread_offset = (off_t)0;
	3875	else if (last_ioread_offset > last_request_offset)
	3876	last_ioread_offset = last_request_offset;
	3877	} else
	3878	last_ioread_offset = (off_t)0;
	3879
	3880	while (io_req_size && uio->uio_offset < filesize && retval == 0) {
	3881
	3882	max_size = filesize - uio->uio_offset;
	3883
	3884	if ((off_t)(io_req_size) < max_size)
	3885	io_size = io_req_size;
	3886	else
	3887	io_size = max_size;
	3888
	3889	if (!(flags & IO_NOCACHE)) {
	3890
	3891	while (io_size) {
	3892	u_int32_t io_resid;
	3893	u_int32_t io_requested;
	3894
	3895	/*
	3896	* if we keep finding the pages we need already in the cache, then
	3897	* don't bother to call cluster_read_prefetch since it costs CPU cycles
	3898	* to determine that we have all the pages we need... once we miss in
	3899	* the cache and have issued an I/O, then we'll assume that we're likely
	3900	* to continue to miss in the cache and it's to our advantage to try and prefetch
	3901	*/
	3902	if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
	3903	if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
	3904	/*
	3905	* we've already issued I/O for this request and
	3906	* there's still work to do and
	3907	* our prefetch stream is running dry, so issue a
	3908	* pre-fetch I/O... the I/O latency will overlap
	3909	* with the copying of the data
	3910	*/
	3911	if (size_of_prefetch > max_rd_size)
	3912	size_of_prefetch = max_rd_size;
	3913
	3914	size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
	3915
	3916	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	3917
	3918	if (last_ioread_offset > last_request_offset)
	3919	last_ioread_offset = last_request_offset;
	3920	}
	3921	}
	3922	/*
	3923	* limit the size of the copy we're about to do so that
	3924	* we can notice that our I/O pipe is running dry and
	3925	* get the next I/O issued before it does go dry
	3926	*/
	3927	if (last_ioread_offset && io_size > (max_io_size / 4))
	3928	io_resid = (max_io_size / 4);
	3929	else
	3930	io_resid = io_size;
	3931
	3932	io_requested = io_resid;
	3933
	3934	retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
	3935
	3936	xsize = io_requested - io_resid;
	3937
	3938	io_size -= xsize;
	3939	io_req_size -= xsize;
	3940
	3941	if (retval \|\| io_resid)
	3942	/*
	3943	* if we run into a real error or
	3944	* a page that is not in the cache
	3945	* we need to leave streaming mode
	3946	*/
	3947	break;
	3948
	3949	if (rd_ahead_enabled && (io_size == 0 \|\| last_ioread_offset == last_request_offset)) {
	3950	/*
	3951	* we're already finished the I/O for this read request
	3952	* let's see if we should do a read-ahead
	3953	*/
	3954	cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
	3955	}
	3956	}
	3957	if (retval)
	3958	break;
	3959	if (io_size == 0) {
	3960	if (rap != NULL) {
	3961	if (extent.e_addr < rap->cl_lastr)
	3962	rap->cl_maxra = 0;
	3963	rap->cl_lastr = extent.e_addr;
	3964	}
	3965	break;
	3966	}
	3967	/*
	3968	* recompute max_size since cluster_copy_ubc_data_internal
	3969	* may have advanced uio->uio_offset
	3970	*/
	3971	max_size = filesize - uio->uio_offset;
	3972	}
	3973
	3974	iostate.io_completed = 0;
	3975	iostate.io_issued = 0;
	3976	iostate.io_error = 0;
	3977	iostate.io_wanted = 0;
	3978
	3979	if ( (flags & IO_RETURN_ON_THROTTLE) ) {
	3980	if (cluster_is_throttled(vp) == THROTTLE_NOW) {
	3981	if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
	3982	/*
	3983	* we're in the throttle window and at least 1 I/O
	3984	* has already been issued by a throttleable thread
	3985	* in this window, so return with EAGAIN to indicate
	3986	* to the FS issuing the cluster_read call that it
	3987	* should now throttle after dropping any locks
	3988	*/
	3989	throttle_info_update_by_mount(vp->v_mount);
	3990
	3991	retval = EAGAIN;
	3992	break;
	3993	}
	3994	}
	3995	}
	3996
	3997	/*
	3998	* compute the size of the upl needed to encompass
	3999	* the requested read... limit each call to cluster_io
	4000	* to the maximum UPL size... cluster_io will clip if
	4001	* this exceeds the maximum io_size for the device,
	4002	* make sure to account for
	4003	* a starting offset that's not page aligned
	4004	*/
	4005	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	4006	upl_f_offset = uio->uio_offset - (off_t)start_offset;
	4007
	4008	if (io_size > max_rd_size)
	4009	io_size = max_rd_size;
	4010
	4011	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	4012
	4013	if (flags & IO_NOCACHE) {
	4014	if (upl_size > max_io_size)
	4015	upl_size = max_io_size;
	4016	} else {
	4017	if (upl_size > max_io_size / 4) {
	4018	upl_size = max_io_size / 4;
	4019	upl_size &= ~PAGE_MASK;
	4020
	4021	if (upl_size == 0)
	4022	upl_size = PAGE_SIZE;
	4023	}
	4024	}
	4025	pages_in_upl = upl_size / PAGE_SIZE;
	4026
	4027	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_START,
	4028	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	4029
	4030	kret = ubc_create_upl_kernel(vp,
	4031	upl_f_offset,
	4032	upl_size,
	4033	&upl,
	4034	&pl,
	4035	UPL_FILE_IO \| UPL_SET_LITE,
	4036	VM_KERN_MEMORY_FILE);
	4037	if (kret != KERN_SUCCESS)
	4038	panic("cluster_read_copy: failed to get pagelist");
	4039
	4040	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) \| DBG_FUNC_END,
	4041	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	4042
	4043	/*
	4044	* scan from the beginning of the upl looking for the first
	4045	* non-valid page.... this will become the first page in
	4046	* the request we're going to make to 'cluster_io'... if all
	4047	* of the pages are valid, we won't call through to 'cluster_io'
	4048	*/
	4049	for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
	4050	if (!upl_valid_page(pl, start_pg))
	4051	break;
	4052	}
	4053
	4054	/*
	4055	* scan from the starting invalid page looking for a valid
	4056	* page before the end of the upl is reached, if we
	4057	* find one, then it will be the last page of the request to
	4058	* 'cluster_io'
	4059	*/
	4060	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	4061	if (upl_valid_page(pl, last_pg))
	4062	break;
	4063	}
	4064
	4065	if (start_pg < last_pg) {
	4066	/*
	4067	* we found a range of 'invalid' pages that must be filled
	4068	* if the last page in this range is the last page of the file
	4069	* we may have to clip the size of it to keep from reading past
	4070	* the end of the last physical block associated with the file
	4071	*/
	4072	if (iolock_inited == FALSE) {
	4073	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	4074
	4075	iolock_inited = TRUE;
	4076	}
	4077	upl_offset = start_pg * PAGE_SIZE;
	4078	io_size = (last_pg - start_pg) * PAGE_SIZE;
	4079
	4080	if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
	4081	io_size = filesize - (upl_f_offset + upl_offset);
	4082
	4083	/*
	4084	* issue an asynchronous read to cluster_io
	4085	*/
	4086
	4087	error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
	4088	io_size, CL_READ \| CL_ASYNC \| bflag, (buf_t)NULL, &iostate, callback, callback_arg);
	4089
	4090	if (rap) {
	4091	if (extent.e_addr < rap->cl_maxra) {
	4092	/*
	4093	* we've just issued a read for a block that should have been
	4094	* in the cache courtesy of the read-ahead engine... something
	4095	* has gone wrong with the pipeline, so reset the read-ahead
	4096	* logic which will cause us to restart from scratch
	4097	*/
	4098	rap->cl_maxra = 0;
	4099	}
	4100	}
	4101	}
	4102	if (error == 0) {
	4103	/*
	4104	* if the read completed successfully, or there was no I/O request
	4105	* issued, then copy the data into user land via 'cluster_upl_copy_data'
	4106	* we'll first add on any 'valid'
	4107	* pages that were present in the upl when we acquired it.
	4108	*/
	4109	u_int val_size;
	4110
	4111	for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
	4112	if (!upl_valid_page(pl, uio_last))
	4113	break;
	4114	}
	4115	if (uio_last < pages_in_upl) {
	4116	/*
	4117	* there were some invalid pages beyond the valid pages
	4118	* that we didn't issue an I/O for, just release them
	4119	* unchanged now, so that any prefetch/readahead can
	4120	* include them
	4121	*/
	4122	ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
	4123	(pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	4124	}
	4125
	4126	/*
	4127	* compute size to transfer this round, if io_req_size is
	4128	* still non-zero after this attempt, we'll loop around and
	4129	* set up for another I/O.
	4130	*/
	4131	val_size = (uio_last * PAGE_SIZE) - start_offset;
	4132
	4133	if (val_size > max_size)
	4134	val_size = max_size;
	4135
	4136	if (val_size > io_req_size)
	4137	val_size = io_req_size;
	4138
	4139	if ((uio->uio_offset + val_size) > last_ioread_offset)
	4140	last_ioread_offset = uio->uio_offset + val_size;
	4141
	4142	if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
	4143
	4144	if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
	4145	/*
	4146	* if there's still I/O left to do for this request, and...
	4147	* we're not in hard throttle mode, and...
	4148	* we're close to using up the previous prefetch, then issue a
	4149	* new pre-fetch I/O... the I/O latency will overlap
	4150	* with the copying of the data
	4151	*/
	4152	if (size_of_prefetch > max_rd_size)
	4153	size_of_prefetch = max_rd_size;
	4154
	4155	size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
	4156
	4157	last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
	4158
	4159	if (last_ioread_offset > last_request_offset)
	4160	last_ioread_offset = last_request_offset;
	4161	}
	4162
	4163	} else if ((uio->uio_offset + val_size) == last_request_offset) {
	4164	/*
	4165	* this transfer will finish this request, so...
	4166	* let's try to read ahead if we're in
	4167	* a sequential access pattern and we haven't
	4168	* explicitly disabled it
	4169	*/
	4170	if (rd_ahead_enabled)
	4171	cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
	4172
	4173	if (rap != NULL) {
	4174	if (extent.e_addr < rap->cl_lastr)
	4175	rap->cl_maxra = 0;
	4176	rap->cl_lastr = extent.e_addr;
	4177	}
	4178	}
	4179	if (iolock_inited == TRUE)
	4180	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4181
	4182	if (iostate.io_error)
	4183	error = iostate.io_error;
	4184	else {
	4185	u_int32_t io_requested;
	4186
	4187	io_requested = val_size;
	4188
	4189	retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
	4190
	4191	io_req_size -= (val_size - io_requested);
	4192	}
	4193	} else {
	4194	if (iolock_inited == TRUE)
	4195	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4196	}
	4197	if (start_pg < last_pg) {
	4198	/*
	4199	* compute the range of pages that we actually issued an I/O for
	4200	* and either commit them as valid if the I/O succeeded
	4201	* or abort them if the I/O failed or we're not supposed to
	4202	* keep them in the cache
	4203	*/
	4204	io_size = (last_pg - start_pg) * PAGE_SIZE;
	4205
	4206	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
	4207
	4208	if (error \|\| (flags & IO_NOCACHE))
	4209	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
	4210	UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	4211	else {
	4212	int commit_flags = UPL_COMMIT_CLEAR_DIRTY \| UPL_COMMIT_FREE_ON_EMPTY;
	4213
	4214	if (take_reference)
	4215	commit_flags \|= UPL_COMMIT_INACTIVATE;
	4216	else
	4217	commit_flags \|= UPL_COMMIT_SPECULATE;
	4218
	4219	ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
	4220	}
	4221	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
	4222	}
	4223	if ((last_pg - start_pg) < pages_in_upl) {
	4224	/*
	4225	* the set of pages that we issued an I/O for did not encompass
	4226	* the entire upl... so just release these without modifying
	4227	* their state
	4228	*/
	4229	if (error)
	4230	ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
	4231	else {
	4232
	4233	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_START,
	4234	upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
	4235
	4236	/*
	4237	* handle any valid pages at the beginning of
	4238	* the upl... release these appropriately
	4239	*/
	4240	cluster_read_upl_release(upl, 0, start_pg, take_reference);
	4241
	4242	/*
	4243	* handle any valid pages immediately after the
	4244	* pages we issued I/O for... ... release these appropriately
	4245	*/
	4246	cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
	4247
	4248	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) \| DBG_FUNC_END, upl, -1, -1, 0, 0);
	4249	}
	4250	}
	4251	if (retval == 0)
	4252	retval = error;
	4253
	4254	if (io_req_size) {
	4255	if (cluster_is_throttled(vp)) {
	4256	/*
	4257	* we're in the throttle window, at the very least
	4258	* we want to limit the size of the I/O we're about
	4259	* to issue
	4260	*/
	4261	rd_ahead_enabled = 0;
	4262	prefetch_enabled = 0;
	4263	max_rd_size = THROTTLE_MAX_IOSIZE;
	4264	} else {
	4265	if (max_rd_size == THROTTLE_MAX_IOSIZE) {
	4266	/*
	4267	* coming out of throttled state
	4268	*/
	4269	if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
	4270	if (rap != NULL)
	4271	rd_ahead_enabled = 1;
	4272	prefetch_enabled = 1;
	4273	}
	4274	max_rd_size = max_prefetch;
	4275	last_ioread_offset = 0;
	4276	}
	4277	}
	4278	}
	4279	}
	4280	if (iolock_inited == TRUE) {
	4281	/*
	4282	* cluster_io returned an error after it
	4283	* had already issued some I/O. we need
	4284	* to wait for that I/O to complete before
	4285	* we can destroy the iostate mutex...
	4286	* 'retval' already contains the early error
	4287	* so no need to pick it up from iostate.io_error
	4288	*/
	4289	cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
	4290
	4291	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	4292	}
	4293	if (rap != NULL) {
	4294	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	4295	(int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
	4296
	4297	lck_mtx_unlock(&rap->cl_lockr);
	4298	} else {
	4299	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) \| DBG_FUNC_END,
	4300	(int)uio->uio_offset, io_req_size, 0, retval, 0);
	4301	}
	4302
	4303	return (retval);
	4304	}
	4305
	4306	/*
	4307	* We don't want another read/write lock for every vnode in the system
	4308	* so we keep a hash of them here. There should never be very many of
	4309	* these around at any point in time.
	4310	*/
	4311	cl_direct_read_lock_t *cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
	4312	{
	4313	struct cl_direct_read_locks *head
	4314	= &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
	4315	% CL_DIRECT_READ_LOCK_BUCKETS];
	4316
	4317	struct cl_direct_read_lock lck, new_lck = NULL;
	4318
	4319	for (;;) {
	4320	lck_spin_lock(&cl_direct_read_spin_lock);
	4321
	4322	LIST_FOREACH(lck, head, chain) {
	4323	if (lck->vp == vp) {
	4324	++lck->ref_count;
	4325	lck_spin_unlock(&cl_direct_read_spin_lock);
	4326	if (new_lck) {
	4327	// Someone beat us to it, ditch the allocation
	4328	lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
	4329	FREE(new_lck, M_TEMP);
	4330	}
	4331	lck_rw_lock(&lck->rw_lock, type);
	4332	return lck;
	4333	}
	4334	}
	4335
	4336	if (new_lck) {
	4337	// Use the lock we allocated
	4338	LIST_INSERT_HEAD(head, new_lck, chain);
	4339	lck_spin_unlock(&cl_direct_read_spin_lock);
	4340	lck_rw_lock(&new_lck->rw_lock, type);
	4341	return new_lck;
	4342	}
	4343
	4344	lck_spin_unlock(&cl_direct_read_spin_lock);
	4345
	4346	// Allocate a new lock
	4347	MALLOC(new_lck, cl_direct_read_lock_t , sizeof(new_lck),
	4348	M_TEMP, M_WAITOK);
	4349	lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
	4350	new_lck->vp = vp;
	4351	new_lck->ref_count = 1;
	4352
	4353	// Got to go round again
	4354	}
	4355	}
	4356
	4357	void cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
	4358	{
	4359	lck_rw_done(&lck->rw_lock);
	4360
	4361	lck_spin_lock(&cl_direct_read_spin_lock);
	4362	if (lck->ref_count == 1) {
	4363	LIST_REMOVE(lck, chain);
	4364	lck_spin_unlock(&cl_direct_read_spin_lock);
	4365	lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
	4366	FREE(lck, M_TEMP);
	4367	} else {
	4368	--lck->ref_count;
	4369	lck_spin_unlock(&cl_direct_read_spin_lock);
	4370	}
	4371	}
	4372
	4373	static int
	4374	cluster_read_direct(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	4375	int flags, int (callback)(buf_t, void ), void *callback_arg)
	4376	{
	4377	upl_t upl;
	4378	upl_page_info_t *pl;
	4379	off_t max_io_size;
	4380	vm_offset_t upl_offset, vector_upl_offset = 0;
	4381	upl_size_t upl_size, vector_upl_size = 0;
	4382	vm_size_t upl_needed_size;
	4383	unsigned int pages_in_pl;
	4384	upl_control_flags_t upl_flags;
	4385	kern_return_t kret;
	4386	unsigned int i;
	4387	int force_data_sync;
	4388	int retval = 0;
	4389	int no_zero_fill = 0;
	4390	int io_flag = 0;
	4391	int misaligned = 0;
	4392	struct clios iostate;
	4393	user_addr_t iov_base;
	4394	u_int32_t io_req_size;
	4395	u_int32_t offset_in_file;
	4396	u_int32_t offset_in_iovbase;
	4397	u_int32_t io_size;
	4398	u_int32_t io_min;
	4399	u_int32_t xsize;
	4400	u_int32_t devblocksize;
	4401	u_int32_t mem_alignment_mask;
	4402	u_int32_t max_upl_size;
	4403	u_int32_t max_rd_size;
	4404	u_int32_t max_rd_ahead;
	4405	u_int32_t max_vector_size;
	4406	boolean_t io_throttled = FALSE;
	4407
	4408	u_int32_t vector_upl_iosize = 0;
	4409	int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1);
	4410	off_t v_upl_uio_offset = 0;
	4411	int vector_upl_index=0;
	4412	upl_t vector_upl = NULL;
	4413	cl_direct_read_lock_t *lock = NULL;
	4414
	4415	user_addr_t orig_iov_base = 0;
	4416	user_addr_t last_iov_base = 0;
	4417	user_addr_t next_iov_base = 0;
	4418
	4419	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_START,
	4420	(int)uio->uio_offset, (int)filesize, read_type, read_length, 0);
	4421
	4422	max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
	4423
	4424	max_rd_size = max_upl_size;
	4425	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
	4426
	4427	io_flag = CL_COMMIT \| CL_READ \| CL_ASYNC \| CL_NOZERO \| CL_DIRECT_IO;
	4428
	4429	if (flags & IO_PASSIVE)
	4430	io_flag \|= CL_PASSIVE;
	4431
	4432	if (flags & IO_ENCRYPTED) {
	4433	io_flag \|= CL_RAW_ENCRYPTED;
	4434	}
	4435
	4436	if (flags & IO_NOCACHE) {
	4437	io_flag \|= CL_NOCACHE;
	4438	}
	4439
	4440	if (flags & IO_SKIP_ENCRYPTION)
	4441	io_flag \|= CL_ENCRYPTED;
	4442
	4443	iostate.io_completed = 0;
	4444	iostate.io_issued = 0;
	4445	iostate.io_error = 0;
	4446	iostate.io_wanted = 0;
	4447
	4448	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	4449
	4450	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	4451	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	4452
	4453	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_NONE,
	4454	(int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
	4455
	4456	if (devblocksize == 1) {
	4457	/*
	4458	* the AFP client advertises a devblocksize of 1
	4459	* however, its BLOCKMAP routine maps to physical
	4460	* blocks that are PAGE_SIZE in size...
	4461	* therefore we can't ask for I/Os that aren't page aligned
	4462	* or aren't multiples of PAGE_SIZE in size
	4463	* by setting devblocksize to PAGE_SIZE, we re-instate
	4464	* the old behavior we had before the mem_alignment_mask
	4465	* changes went in...
	4466	*/
	4467	devblocksize = PAGE_SIZE;
	4468	}
	4469
	4470	orig_iov_base = uio_curriovbase(uio);
	4471	last_iov_base = orig_iov_base;
	4472
	4473	next_dread:
	4474	io_req_size = *read_length;
	4475	iov_base = uio_curriovbase(uio);
	4476
	4477	offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
	4478	offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
	4479
	4480	if (offset_in_file \|\| offset_in_iovbase) {
	4481	/*
	4482	* one of the 2 important offsets is misaligned
	4483	* so fire an I/O through the cache for this entire vector
	4484	*/
	4485	misaligned = 1;
	4486	}
	4487	if (iov_base & (devblocksize - 1)) {
	4488	/*
	4489	* the offset in memory must be on a device block boundary
	4490	* so that we can guarantee that we can generate an
	4491	* I/O that ends on a page boundary in cluster_io
	4492	*/
	4493	misaligned = 1;
	4494	}
	4495
	4496	max_io_size = filesize - uio->uio_offset;
	4497
	4498	/*
	4499	* The user must request IO in aligned chunks. If the
	4500	* offset into the file is bad, or the userland pointer
	4501	* is non-aligned, then we cannot service the encrypted IO request.
	4502	*/
	4503	if (flags & IO_ENCRYPTED) {
	4504	if (misaligned \|\| (io_req_size & (devblocksize - 1)))
	4505	retval = EINVAL;
	4506
	4507	max_io_size = roundup(max_io_size, devblocksize);
	4508	}
	4509
	4510	if ((off_t)io_req_size > max_io_size)
	4511	io_req_size = max_io_size;
	4512
	4513	/*
	4514	* When we get to this point, we know...
	4515	* -- the offset into the file is on a devblocksize boundary
	4516	*/
	4517
	4518	while (io_req_size && retval == 0) {
	4519	u_int32_t io_start;
	4520
	4521	if (cluster_is_throttled(vp)) {
	4522	/*
	4523	* we're in the throttle window, at the very least
	4524	* we want to limit the size of the I/O we're about
	4525	* to issue
	4526	*/
	4527	max_rd_size = THROTTLE_MAX_IOSIZE;
	4528	max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
	4529	max_vector_size = THROTTLE_MAX_IOSIZE;
	4530	} else {
	4531	max_rd_size = max_upl_size;
	4532	max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
	4533	max_vector_size = MAX_VECTOR_UPL_SIZE;
	4534	}
	4535	io_start = io_size = io_req_size;
	4536
	4537	/*
	4538	* First look for pages already in the cache
	4539	* and move them to user space. But only do this
	4540	* check if we are not retrieving encrypted data directly
	4541	* from the filesystem; those blocks should never
	4542	* be in the UBC.
	4543	*
	4544	* cluster_copy_ubc_data returns the resid
	4545	* in io_size
	4546	*/
	4547	if ((flags & IO_ENCRYPTED) == 0) {
	4548	retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
	4549	}
	4550	/*
	4551	* calculate the number of bytes actually copied
	4552	* starting size - residual
	4553	*/
	4554	xsize = io_start - io_size;
	4555
	4556	io_req_size -= xsize;
	4557
	4558	if(useVectorUPL && (xsize \|\| (iov_base & PAGE_MASK))) {
	4559	/*
	4560	* We found something in the cache or we have an iov_base that's not
	4561	* page-aligned.
	4562	*
	4563	* Issue all I/O's that have been collected within this Vectored UPL.
	4564	*/
	4565	if(vector_upl_index) {
	4566	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4567	reset_vector_run_state();
	4568	}
	4569
	4570	if(xsize)
	4571	useVectorUPL = 0;
	4572
	4573	/*
	4574	* After this point, if we are using the Vector UPL path and the base is
	4575	* not page-aligned then the UPL with that base will be the first in the vector UPL.
	4576	*/
	4577	}
	4578
	4579	/*
	4580	* check to see if we are finished with this request.
	4581	*
	4582	* If we satisfied this IO already, then io_req_size will be 0.
	4583	* Otherwise, see if the IO was mis-aligned and needs to go through
	4584	* the UBC to deal with the 'tail'.
	4585	*
	4586	*/
	4587	if (io_req_size == 0 \|\| (misaligned)) {
	4588	/*
	4589	* see if there's another uio vector to
	4590	* process that's of type IO_DIRECT
	4591	*
	4592	* break out of while loop to get there
	4593	*/
	4594	break;
	4595	}
	4596	/*
	4597	* assume the request ends on a device block boundary
	4598	*/
	4599	io_min = devblocksize;
	4600
	4601	/*
	4602	* we can handle I/O's in multiples of the device block size
	4603	* however, if io_size isn't a multiple of devblocksize we
	4604	* want to clip it back to the nearest page boundary since
	4605	* we are going to have to go through cluster_read_copy to
	4606	* deal with the 'overhang'... by clipping it to a PAGE_SIZE
	4607	* multiple, we avoid asking the drive for the same physical
	4608	* blocks twice.. once for the partial page at the end of the
	4609	* request and a 2nd time for the page we read into the cache
	4610	* (which overlaps the end of the direct read) in order to
	4611	* get at the overhang bytes
	4612	*/
	4613	if (io_size & (devblocksize - 1)) {
	4614	assert(!(flags & IO_ENCRYPTED));
	4615	/*
	4616	* Clip the request to the previous page size boundary
	4617	* since request does NOT end on a device block boundary
	4618	*/
	4619	io_size &= ~PAGE_MASK;
	4620	io_min = PAGE_SIZE;
	4621	}
	4622	if (retval \|\| io_size < io_min) {
	4623	/*
	4624	* either an error or we only have the tail left to
	4625	* complete via the copy path...
	4626	* we may have already spun some portion of this request
	4627	* off as async requests... we need to wait for the I/O
	4628	* to complete before returning
	4629	*/
	4630	goto wait_for_dreads;
	4631	}
	4632
	4633	/*
	4634	* Don't re-check the UBC data if we are looking for uncached IO
	4635	* or asking for encrypted blocks.
	4636	*/
	4637	if ((flags & IO_ENCRYPTED) == 0) {
	4638
	4639	if ((xsize = io_size) > max_rd_size)
	4640	xsize = max_rd_size;
	4641
	4642	io_size = 0;
	4643
	4644	if (!lock) {
	4645	/*
	4646	* We hold a lock here between the time we check the
	4647	* cache and the time we issue I/O. This saves us
	4648	* from having to lock the pages in the cache. Not
	4649	* all clients will care about this lock but some
	4650	* clients may want to guarantee stability between
	4651	* here and when the I/O is issued in which case they
	4652	* will take the lock exclusively.
	4653	*/
	4654	lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
	4655	}
	4656
	4657	ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
	4658
	4659	if (io_size == 0) {
	4660	/*
	4661	* a page must have just come into the cache
	4662	* since the first page in this range is no
	4663	* longer absent, go back and re-evaluate
	4664	*/
	4665	continue;
	4666	}
	4667	}
	4668	if ( (flags & IO_RETURN_ON_THROTTLE) ) {
	4669	if (cluster_is_throttled(vp) == THROTTLE_NOW) {
	4670	if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) {
	4671	/*
	4672	* we're in the throttle window and at least 1 I/O
	4673	* has already been issued by a throttleable thread
	4674	* in this window, so return with EAGAIN to indicate
	4675	* to the FS issuing the cluster_read call that it
	4676	* should now throttle after dropping any locks
	4677	*/
	4678	throttle_info_update_by_mount(vp->v_mount);
	4679
	4680	io_throttled = TRUE;
	4681	goto wait_for_dreads;
	4682	}
	4683	}
	4684	}
	4685	if (io_size > max_rd_size)
	4686	io_size = max_rd_size;
	4687
	4688	iov_base = uio_curriovbase(uio);
	4689
	4690	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	4691	upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
	4692
	4693	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_START,
	4694	(int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
	4695
	4696	if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0))
	4697	no_zero_fill = 1;
	4698	else
	4699	no_zero_fill = 0;
	4700
	4701	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	4702	for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
	4703	pages_in_pl = 0;
	4704	upl_size = upl_needed_size;
	4705	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	4706	if (no_zero_fill)
	4707	upl_flags \|= UPL_NOZEROFILL;
	4708	if (force_data_sync)
	4709	upl_flags \|= UPL_FORCE_DATA_SYNC;
	4710
	4711	kret = vm_map_create_upl(map,
	4712	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	4713	&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
	4714
	4715	if (kret != KERN_SUCCESS) {
	4716	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4717	(int)upl_offset, upl_size, io_size, kret, 0);
	4718	/*
	4719	* failed to get pagelist
	4720	*
	4721	* we may have already spun some portion of this request
	4722	* off as async requests... we need to wait for the I/O
	4723	* to complete before returning
	4724	*/
	4725	goto wait_for_dreads;
	4726	}
	4727	pages_in_pl = upl_size / PAGE_SIZE;
	4728	pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
	4729
	4730	for (i = 0; i < pages_in_pl; i++) {
	4731	if (!upl_page_present(pl, i))
	4732	break;
	4733	}
	4734	if (i == pages_in_pl)
	4735	break;
	4736
	4737	ubc_upl_abort(upl, 0);
	4738	}
	4739	if (force_data_sync >= 3) {
	4740	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4741	(int)upl_offset, upl_size, io_size, kret, 0);
	4742
	4743	goto wait_for_dreads;
	4744	}
	4745	/*
	4746	* Consider the possibility that upl_size wasn't satisfied.
	4747	*/
	4748	if (upl_size < upl_needed_size) {
	4749	if (upl_size && upl_offset == 0)
	4750	io_size = upl_size;
	4751	else
	4752	io_size = 0;
	4753	}
	4754	if (io_size == 0) {
	4755	ubc_upl_abort(upl, 0);
	4756	goto wait_for_dreads;
	4757	}
	4758	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) \| DBG_FUNC_END,
	4759	(int)upl_offset, upl_size, io_size, kret, 0);
	4760
	4761	if(useVectorUPL) {
	4762	vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
	4763	if(end_off)
	4764	issueVectorUPL = 1;
	4765	/*
	4766	* After this point, if we are using a vector UPL, then
	4767	* either all the UPL elements end on a page boundary OR
	4768	* this UPL is the last element because it does not end
	4769	* on a page boundary.
	4770	*/
	4771	}
	4772
	4773	/*
	4774	* request asynchronously so that we can overlap
	4775	* the preparation of the next I/O
	4776	* if there are already too many outstanding reads
	4777	* wait until some have completed before issuing the next read
	4778	*/
	4779	cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
	4780
	4781	if (iostate.io_error) {
	4782	/*
	4783	* one of the earlier reads we issued ran into a hard error
	4784	* don't issue any more reads, cleanup the UPL
	4785	* that was just created but not used, then
	4786	* go wait for any other reads to complete before
	4787	* returning the error to the caller
	4788	*/
	4789	ubc_upl_abort(upl, 0);
	4790
	4791	goto wait_for_dreads;
	4792	}
	4793	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_START,
	4794	upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
	4795
	4796	if(!useVectorUPL) {
	4797	if (no_zero_fill)
	4798	io_flag &= ~CL_PRESERVE;
	4799	else
	4800	io_flag \|= CL_PRESERVE;
	4801
	4802	retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4803
	4804	} else {
	4805
	4806	if(!vector_upl_index) {
	4807	vector_upl = vector_upl_create(upl_offset);
	4808	v_upl_uio_offset = uio->uio_offset;
	4809	vector_upl_offset = upl_offset;
	4810	}
	4811
	4812	vector_upl_set_subupl(vector_upl,upl, upl_size);
	4813	vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
	4814	vector_upl_index++;
	4815	vector_upl_size += upl_size;
	4816	vector_upl_iosize += io_size;
	4817
	4818	if(issueVectorUPL \|\| vector_upl_index == MAX_VECTOR_UPL_ELEMENTS \|\| vector_upl_size >= max_vector_size) {
	4819	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4820	reset_vector_run_state();
	4821	}
	4822	}
	4823	last_iov_base = iov_base + io_size;
	4824
	4825	if (lock) {
	4826	// We don't need to wait for the I/O to complete
	4827	cluster_unlock_direct_read(lock);
	4828	lock = NULL;
	4829	}
	4830
	4831	/*
	4832	* update the uio structure
	4833	*/
	4834	if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
	4835	uio_update(uio, (user_size_t)max_io_size);
	4836	}
	4837	else {
	4838	uio_update(uio, (user_size_t)io_size);
	4839	}
	4840
	4841	io_req_size -= io_size;
	4842
	4843	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) \| DBG_FUNC_END,
	4844	upl, (int)uio->uio_offset, io_req_size, retval, 0);
	4845
	4846	} /* end while */
	4847
	4848	if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
	4849
	4850	retval = cluster_io_type(uio, read_type, read_length, 0);
	4851
	4852	if (retval == 0 && *read_type == IO_DIRECT) {
	4853
	4854	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_NONE,
	4855	(int)uio->uio_offset, (int)filesize, read_type, read_length, 0);
	4856
	4857	goto next_dread;
	4858	}
	4859	}
	4860
	4861	wait_for_dreads:
	4862
	4863	if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
	4864	retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
	4865	reset_vector_run_state();
	4866	}
	4867
	4868	// We don't need to wait for the I/O to complete
	4869	if (lock)
	4870	cluster_unlock_direct_read(lock);
	4871
	4872	/*
	4873	* make sure all async reads that are part of this stream
	4874	* have completed before we return
	4875	*/
	4876	cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
	4877
	4878	if (iostate.io_error)
	4879	retval = iostate.io_error;
	4880
	4881	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	4882
	4883	if (io_throttled == TRUE && retval == 0)
	4884	retval = EAGAIN;
	4885
	4886	for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
	4887	/*
	4888	* This is specifically done for pmap accounting purposes.
	4889	* vm_pre_fault() will call vm_fault() to enter the page into
	4890	* the pmap if there isn't _a_ physical page for that VA already.
	4891	*/
	4892	vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK));
	4893	}
	4894
	4895	if (io_req_size && retval == 0) {
	4896	/*
	4897	* we couldn't handle the tail of this request in DIRECT mode
	4898	* so fire it through the copy path
	4899	*/
	4900	if (flags & IO_ENCRYPTED) {
	4901	/*
	4902	* We cannot fall back to the copy path for encrypted I/O. If this
	4903	* happens, there is something wrong with the user buffer passed
	4904	* down.
	4905	*/
	4906	retval = EFAULT;
	4907	} else {
	4908	retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
	4909	}
	4910
	4911	*read_type = IO_UNKNOWN;
	4912	}
	4913	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) \| DBG_FUNC_END,
	4914	(int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
	4915
	4916	return (retval);
	4917	}
	4918
	4919
	4920	static int
	4921	cluster_read_contig(vnode_t vp, struct uio uio, off_t filesize, int read_type, u_int32_t *read_length,
	4922	int (callback)(buf_t, void ), void *callback_arg, int flags)
	4923	{
	4924	upl_page_info_t *pl;
	4925	upl_t upl[MAX_VECTS];
	4926	vm_offset_t upl_offset;
	4927	addr64_t dst_paddr = 0;
	4928	user_addr_t iov_base;
	4929	off_t max_size;
	4930	upl_size_t upl_size;
	4931	vm_size_t upl_needed_size;
	4932	mach_msg_type_number_t pages_in_pl;
	4933	upl_control_flags_t upl_flags;
	4934	kern_return_t kret;
	4935	struct clios iostate;
	4936	int error= 0;
	4937	int cur_upl = 0;
	4938	int num_upl = 0;
	4939	int n;
	4940	u_int32_t xsize;
	4941	u_int32_t io_size;
	4942	u_int32_t devblocksize;
	4943	u_int32_t mem_alignment_mask;
	4944	u_int32_t tail_size = 0;
	4945	int bflag;
	4946
	4947	if (flags & IO_PASSIVE)
	4948	bflag = CL_PASSIVE;
	4949	else
	4950	bflag = 0;
	4951
	4952	if (flags & IO_NOCACHE)
	4953	bflag \|= CL_NOCACHE;
	4954
	4955	/*
	4956	* When we enter this routine, we know
	4957	* -- the read_length will not exceed the current iov_len
	4958	* -- the target address is physically contiguous for read_length
	4959	*/
	4960	cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
	4961
	4962	devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
	4963	mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
	4964
	4965	iostate.io_completed = 0;
	4966	iostate.io_issued = 0;
	4967	iostate.io_error = 0;
	4968	iostate.io_wanted = 0;
	4969
	4970	lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
	4971
	4972	next_cread:
	4973	io_size = *read_length;
	4974
	4975	max_size = filesize - uio->uio_offset;
	4976
	4977	if (io_size > max_size)
	4978	io_size = max_size;
	4979
	4980	iov_base = uio_curriovbase(uio);
	4981
	4982	upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
	4983	upl_needed_size = upl_offset + io_size;
	4984
	4985	pages_in_pl = 0;
	4986	upl_size = upl_needed_size;
	4987	upl_flags = UPL_FILE_IO \| UPL_NO_SYNC \| UPL_CLEAN_IN_PLACE \| UPL_SET_INTERNAL \| UPL_SET_LITE \| UPL_SET_IO_WIRE;
	4988
	4989
	4990	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) \| DBG_FUNC_START,
	4991	(int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
	4992
	4993	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	4994	kret = vm_map_get_upl(map,
	4995	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	4996	&upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
	4997
	4998	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) \| DBG_FUNC_END,
	4999	(int)upl_offset, upl_size, io_size, kret, 0);
	5000
	5001	if (kret != KERN_SUCCESS) {
	5002	/*
	5003	* failed to get pagelist
	5004	*/
	5005	error = EINVAL;
	5006	goto wait_for_creads;
	5007	}
	5008	num_upl++;
	5009
	5010	if (upl_size < upl_needed_size) {
	5011	/*
	5012	* The upl_size wasn't satisfied.
	5013	*/
	5014	error = EINVAL;
	5015	goto wait_for_creads;
	5016	}
	5017	pl = ubc_upl_pageinfo(upl[cur_upl]);
	5018
	5019	dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
	5020
	5021	while (((uio->uio_offset & (devblocksize - 1)) \|\| io_size < devblocksize) && io_size) {
	5022	u_int32_t head_size;
	5023
	5024	head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
	5025
	5026	if (head_size > io_size)
	5027	head_size = io_size;
	5028
	5029	error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
	5030
	5031	if (error)
	5032	goto wait_for_creads;
	5033
	5034	upl_offset += head_size;
	5035	dst_paddr += head_size;
	5036	io_size -= head_size;
	5037
	5038	iov_base += head_size;
	5039	}
	5040	if ((u_int32_t)iov_base & mem_alignment_mask) {
	5041	/*
	5042	* request doesn't set up on a memory boundary
	5043	* the underlying DMA engine can handle...
	5044	* return an error instead of going through
	5045	* the slow copy path since the intent of this
	5046	* path is direct I/O to device memory
	5047	*/
	5048	error = EINVAL;
	5049	goto wait_for_creads;
	5050	}
	5051
	5052	tail_size = io_size & (devblocksize - 1);
	5053
	5054	io_size -= tail_size;
	5055
	5056	while (io_size && error == 0) {
	5057
	5058	if (io_size > MAX_IO_CONTIG_SIZE)
	5059	xsize = MAX_IO_CONTIG_SIZE;
	5060	else
	5061	xsize = io_size;
	5062	/*
	5063	* request asynchronously so that we can overlap
	5064	* the preparation of the next I/O... we'll do
	5065	* the commit after all the I/O has completed
	5066	* since its all issued against the same UPL
	5067	* if there are already too many outstanding reads
	5068	* wait until some have completed before issuing the next
	5069	*/
	5070	cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
	5071
	5072	if (iostate.io_error) {
	5073	/*
	5074	* one of the earlier reads we issued ran into a hard error
	5075	* don't issue any more reads...
	5076	* go wait for any other reads to complete before
	5077	* returning the error to the caller
	5078	*/
	5079	goto wait_for_creads;
	5080	}
	5081	error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
	5082	CL_READ \| CL_NOZERO \| CL_DEV_MEMORY \| CL_ASYNC \| bflag,
	5083	(buf_t)NULL, &iostate, callback, callback_arg);
	5084	/*
	5085	* The cluster_io read was issued successfully,
	5086	* update the uio structure
	5087	*/
	5088	if (error == 0) {
	5089	uio_update(uio, (user_size_t)xsize);
	5090
	5091	dst_paddr += xsize;
	5092	upl_offset += xsize;
	5093	io_size -= xsize;
	5094	}
	5095	}
	5096	if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
	5097
	5098	error = cluster_io_type(uio, read_type, read_length, 0);
	5099
	5100	if (error == 0 && *read_type == IO_CONTIG) {
	5101	cur_upl++;
	5102	goto next_cread;
	5103	}
	5104	} else
	5105	*read_type = IO_UNKNOWN;
	5106
	5107	wait_for_creads:
	5108	/*
	5109	* make sure all async reads that are part of this stream
	5110	* have completed before we proceed
	5111	*/
	5112	cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
	5113
	5114	if (iostate.io_error)
	5115	error = iostate.io_error;
	5116
	5117	lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
	5118
	5119	if (error == 0 && tail_size)
	5120	error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
	5121
	5122	for (n = 0; n < num_upl; n++)
	5123	/*
	5124	* just release our hold on each physically contiguous
	5125	* region without changing any state
	5126	*/
	5127	ubc_upl_abort(upl[n], 0);
	5128
	5129	return (error);
	5130	}
	5131
	5132
	5133	static int
	5134	cluster_io_type(struct uio uio, int io_type, u_int32_t *io_length, u_int32_t min_length)
	5135	{
	5136	user_size_t iov_len;
	5137	user_addr_t iov_base = 0;
	5138	upl_t upl;
	5139	upl_size_t upl_size;
	5140	upl_control_flags_t upl_flags;
	5141	int retval = 0;
	5142
	5143	/*
	5144	* skip over any emtpy vectors
	5145	*/
	5146	uio_update(uio, (user_size_t)0);
	5147
	5148	iov_len = uio_curriovlen(uio);
	5149
	5150	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) \| DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
	5151
	5152	if (iov_len) {
	5153	iov_base = uio_curriovbase(uio);
	5154	/*
	5155	* make sure the size of the vector isn't too big...
	5156	* internally, we want to handle all of the I/O in
	5157	* chunk sizes that fit in a 32 bit int
	5158	*/
	5159	if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE)
	5160	upl_size = MAX_IO_REQUEST_SIZE;
	5161	else
	5162	upl_size = (u_int32_t)iov_len;
	5163
	5164	upl_flags = UPL_QUERY_OBJECT_TYPE;
	5165
	5166	vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
	5167	if ((vm_map_get_upl(map,
	5168	(vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
	5169	&upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
	5170	/*
	5171	* the user app must have passed in an invalid address
	5172	*/
	5173	retval = EFAULT;
	5174	}
	5175	if (upl_size == 0)
	5176	retval = EFAULT;
	5177
	5178	*io_length = upl_size;
	5179
	5180	if (upl_flags & UPL_PHYS_CONTIG)
	5181	*io_type = IO_CONTIG;
	5182	else if (iov_len >= min_length)
	5183	*io_type = IO_DIRECT;
	5184	else
	5185	*io_type = IO_COPY;
	5186	} else {
	5187	/*
	5188	* nothing left to do for this uio
	5189	*/
	5190	*io_length = 0;
	5191	*io_type = IO_UNKNOWN;
	5192	}
	5193	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) \| DBG_FUNC_END, iov_base, io_type, io_length, retval, 0);
	5194
	5195	return (retval);
	5196	}
	5197
	5198
	5199	/*
	5200	* generate advisory I/O's in the largest chunks possible
	5201	* the completed pages will be released into the VM cache
	5202	*/
	5203	int
	5204	advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
	5205	{
	5206	return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
	5207	}
	5208
	5209	int
	5210	advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (callback)(buf_t, void ), void *callback_arg, int bflag)
	5211	{
	5212	upl_page_info_t *pl;
	5213	upl_t upl;
	5214	vm_offset_t upl_offset;
	5215	int upl_size;
	5216	off_t upl_f_offset;
	5217	int start_offset;
	5218	int start_pg;
	5219	int last_pg;
	5220	int pages_in_upl;
	5221	off_t max_size;
	5222	int io_size;
	5223	kern_return_t kret;
	5224	int retval = 0;
	5225	int issued_io;
	5226	int skip_range;
	5227	uint32_t max_io_size;
	5228
	5229
	5230	if ( !UBCINFOEXISTS(vp))
	5231	return(EINVAL);
	5232
	5233	if (resid < 0)
	5234	return(EINVAL);
	5235
	5236	max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
	5237
	5238	#if CONFIG_EMBEDDED
	5239	if (max_io_size > speculative_prefetch_max_iosize)
	5240	max_io_size = speculative_prefetch_max_iosize;
	5241	#else
	5242	if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
	5243	if (max_io_size > speculative_prefetch_max_iosize)
	5244	max_io_size = speculative_prefetch_max_iosize;
	5245	}
	5246	#endif
	5247
	5248	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_START,
	5249	(int)f_offset, resid, (int)filesize, 0, 0);
	5250
	5251	while (resid && f_offset < filesize && retval == 0) {
	5252	/*
	5253	* compute the size of the upl needed to encompass
	5254	* the requested read... limit each call to cluster_io
	5255	* to the maximum UPL size... cluster_io will clip if
	5256	* this exceeds the maximum io_size for the device,
	5257	* make sure to account for
	5258	* a starting offset that's not page aligned
	5259	*/
	5260	start_offset = (int)(f_offset & PAGE_MASK_64);
	5261	upl_f_offset = f_offset - (off_t)start_offset;
	5262	max_size = filesize - f_offset;
	5263
	5264	if (resid < max_size)
	5265	io_size = resid;
	5266	else
	5267	io_size = max_size;
	5268
	5269	upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	5270	if ((uint32_t)upl_size > max_io_size)
	5271	upl_size = max_io_size;
	5272
	5273	skip_range = 0;
	5274	/*
	5275	* return the number of contiguously present pages in the cache
	5276	* starting at upl_f_offset within the file
	5277	*/
	5278	ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
	5279
	5280	if (skip_range) {
	5281	/*
	5282	* skip over pages already present in the cache
	5283	*/
	5284	io_size = skip_range - start_offset;
	5285
	5286	f_offset += io_size;
	5287	resid -= io_size;
	5288
	5289	if (skip_range == upl_size)
	5290	continue;
	5291	/*
	5292	* have to issue some real I/O
	5293	* at this point, we know it's starting on a page boundary
	5294	* because we've skipped over at least the first page in the request
	5295	*/
	5296	start_offset = 0;
	5297	upl_f_offset += skip_range;
	5298	upl_size -= skip_range;
	5299	}
	5300	pages_in_upl = upl_size / PAGE_SIZE;
	5301
	5302	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_START,
	5303	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	5304
	5305	kret = ubc_create_upl_kernel(vp,
	5306	upl_f_offset,
	5307	upl_size,
	5308	&upl,
	5309	&pl,
	5310	UPL_RET_ONLY_ABSENT \| UPL_SET_LITE,
	5311	VM_KERN_MEMORY_FILE);
	5312	if (kret != KERN_SUCCESS)
	5313	return(retval);
	5314	issued_io = 0;
	5315
	5316	/*
	5317	* before we start marching forward, we must make sure we end on
	5318	* a present page, otherwise we will be working with a freed
	5319	* upl
	5320	*/
	5321	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	5322	if (upl_page_present(pl, last_pg))
	5323	break;
	5324	}
	5325	pages_in_upl = last_pg + 1;
	5326
	5327
	5328	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) \| DBG_FUNC_END,
	5329	upl, (int)upl_f_offset, upl_size, start_offset, 0);
	5330
	5331
	5332	for (last_pg = 0; last_pg < pages_in_upl; ) {
	5333	/*
	5334	* scan from the beginning of the upl looking for the first
	5335	* page that is present.... this will become the first page in
	5336	* the request we're going to make to 'cluster_io'... if all
	5337	* of the pages are absent, we won't call through to 'cluster_io'
	5338	*/
	5339	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	5340	if (upl_page_present(pl, start_pg))
	5341	break;
	5342	}
	5343
	5344	/*
	5345	* scan from the starting present page looking for an absent
	5346	* page before the end of the upl is reached, if we
	5347	* find one, then it will terminate the range of pages being
	5348	* presented to 'cluster_io'
	5349	*/
	5350	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	5351	if (!upl_page_present(pl, last_pg))
	5352	break;
	5353	}
	5354
	5355	if (last_pg > start_pg) {
	5356	/*
	5357	* we found a range of pages that must be filled
	5358	* if the last page in this range is the last page of the file
	5359	* we may have to clip the size of it to keep from reading past
	5360	* the end of the last physical block associated with the file
	5361	*/
	5362	upl_offset = start_pg * PAGE_SIZE;
	5363	io_size = (last_pg - start_pg) * PAGE_SIZE;
	5364
	5365	if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize)
	5366	io_size = filesize - (upl_f_offset + upl_offset);
	5367
	5368	/*
	5369	* issue an asynchronous read to cluster_io
	5370	*/
	5371	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	5372	CL_ASYNC \| CL_READ \| CL_COMMIT \| CL_AGE \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	5373
	5374	issued_io = 1;
	5375	}
	5376	}
	5377	if (issued_io == 0)
	5378	ubc_upl_abort(upl, 0);
	5379
	5380	io_size = upl_size - start_offset;
	5381
	5382	if (io_size > resid)
	5383	io_size = resid;
	5384	f_offset += io_size;
	5385	resid -= io_size;
	5386	}
	5387
	5388	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) \| DBG_FUNC_END,
	5389	(int)f_offset, resid, retval, 0, 0);
	5390
	5391	return(retval);
	5392	}
	5393
	5394
	5395	int
	5396	cluster_push(vnode_t vp, int flags)
	5397	{
	5398	return cluster_push_ext(vp, flags, NULL, NULL);
	5399	}
	5400
	5401
	5402	int
	5403	cluster_push_ext(vnode_t vp, int flags, int (callback)(buf_t, void ), void *callback_arg)
	5404	{
	5405	return cluster_push_err(vp, flags, callback, callback_arg, NULL);
	5406	}
	5407
	5408	/* write errors via err, but return the number of clusters written */
	5409	int
	5410	cluster_push_err(vnode_t vp, int flags, int (callback)(buf_t, void ), void callback_arg, int err)
	5411	{
	5412	int retval;
	5413	int my_sparse_wait = 0;
	5414	struct cl_writebehind *wbp;
	5415	int local_err = 0;
	5416
	5417	if (err)
	5418	*err = 0;
	5419
	5420	if ( !UBCINFOEXISTS(vp)) {
	5421	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
	5422	return (0);
	5423	}
	5424	/* return if deferred write is set */
	5425	if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
	5426	return (0);
	5427	}
	5428	if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
	5429	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
	5430	return (0);
	5431	}
	5432	if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
	5433	lck_mtx_unlock(&wbp->cl_lockw);
	5434
	5435	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
	5436	return(0);
	5437	}
	5438	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_START,
	5439	wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
	5440
	5441	/*
	5442	* if we have an fsync in progress, we don't want to allow any additional
	5443	* sync/fsync/close(s) to occur until it finishes.
	5444	* note that its possible for writes to continue to occur to this file
	5445	* while we're waiting and also once the fsync starts to clean if we're
	5446	* in the sparse map case
	5447	*/
	5448	while (wbp->cl_sparse_wait) {
	5449	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
	5450
	5451	msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
	5452
	5453	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) \| DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	5454	}
	5455	if (flags & IO_SYNC) {
	5456	my_sparse_wait = 1;
	5457	wbp->cl_sparse_wait = 1;
	5458
	5459	/*
	5460	* this is an fsync (or equivalent)... we must wait for any existing async
	5461	* cleaning operations to complete before we evaulate the current state
	5462	* and finish cleaning... this insures that all writes issued before this
	5463	* fsync actually get cleaned to the disk before this fsync returns
	5464	*/
	5465	while (wbp->cl_sparse_pushes) {
	5466	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) \| DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
	5467
	5468	msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
	5469
	5470	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) \| DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
	5471	}
	5472	}
	5473	if (wbp->cl_scmap) {
	5474	void *scmap;
	5475
	5476	if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
	5477
	5478	scmap = wbp->cl_scmap;
	5479	wbp->cl_scmap = NULL;
	5480
	5481	wbp->cl_sparse_pushes++;
	5482
	5483	lck_mtx_unlock(&wbp->cl_lockw);
	5484
	5485	retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
	5486
	5487	lck_mtx_lock(&wbp->cl_lockw);
	5488
	5489	wbp->cl_sparse_pushes--;
	5490
	5491	if (retval) {
	5492	if (wbp->cl_scmap != NULL) {
	5493	panic("cluster_push_err: Expected NULL cl_scmap\n");
	5494	}
	5495
	5496	wbp->cl_scmap = scmap;
	5497	}
	5498
	5499	if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
	5500	wakeup((caddr_t)&wbp->cl_sparse_pushes);
	5501	} else {
	5502	retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
	5503	}
	5504
	5505	local_err = retval;
	5506
	5507	if (err)
	5508	*err = retval;
	5509	retval = 1;
	5510	} else {
	5511	retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
	5512	if (err)
	5513	*err = local_err;
	5514	}
	5515	lck_mtx_unlock(&wbp->cl_lockw);
	5516
	5517	if (flags & IO_SYNC)
	5518	(void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
	5519
	5520	if (my_sparse_wait) {
	5521	/*
	5522	* I'm the owner of the serialization token
	5523	* clear it and wakeup anyone that is waiting
	5524	* for me to finish
	5525	*/
	5526	lck_mtx_lock(&wbp->cl_lockw);
	5527
	5528	wbp->cl_sparse_wait = 0;
	5529	wakeup((caddr_t)&wbp->cl_sparse_wait);
	5530
	5531	lck_mtx_unlock(&wbp->cl_lockw);
	5532	}
	5533	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) \| DBG_FUNC_END,
	5534	wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
	5535
	5536	return (retval);
	5537	}
	5538
	5539
	5540	__private_extern__ void
	5541	cluster_release(struct ubc_info *ubc)
	5542	{
	5543	struct cl_writebehind *wbp;
	5544	struct cl_readahead *rap;
	5545
	5546	if ((wbp = ubc->cl_wbehind)) {
	5547
	5548	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
	5549
	5550	if (wbp->cl_scmap)
	5551	vfs_drt_control(&(wbp->cl_scmap), 0);
	5552	} else {
	5553	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_START, ubc, 0, 0, 0, 0);
	5554	}
	5555
	5556	rap = ubc->cl_rahead;
	5557
	5558	if (wbp != NULL) {
	5559	lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
	5560	FREE_ZONE((void )wbp, sizeof wbp, M_CLWRBEHIND);
	5561	}
	5562	if ((rap = ubc->cl_rahead)) {
	5563	lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
	5564	FREE_ZONE((void )rap, sizeof rap, M_CLRDAHEAD);
	5565	}
	5566	ubc->cl_rahead = NULL;
	5567	ubc->cl_wbehind = NULL;
	5568
	5569	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) \| DBG_FUNC_END, ubc, rap, wbp, 0, 0);
	5570	}
	5571
	5572
	5573	static int
	5574	cluster_try_push(struct cl_writebehind wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (callback)(buf_t, void ), void callback_arg, int *err, boolean_t vm_initiated)
	5575	{
	5576	int cl_index;
	5577	int cl_index1;
	5578	int min_index;
	5579	int cl_len;
	5580	int cl_pushed = 0;
	5581	struct cl_wextent l_clusters[MAX_CLUSTERS];
	5582	u_int max_cluster_pgcount;
	5583	int error = 0;
	5584
	5585	max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
	5586	/*
	5587	* the write behind context exists and has
	5588	* already been locked...
	5589	*/
	5590	if (wbp->cl_number == 0)
	5591	/*
	5592	* no clusters to push
	5593	* return number of empty slots
	5594	*/
	5595	return (MAX_CLUSTERS);
	5596
	5597	/*
	5598	* make a local 'sorted' copy of the clusters
	5599	* and clear wbp->cl_number so that new clusters can
	5600	* be developed
	5601	*/
	5602	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	5603	for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
	5604	if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
	5605	continue;
	5606	if (min_index == -1)
	5607	min_index = cl_index1;
	5608	else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
	5609	min_index = cl_index1;
	5610	}
	5611	if (min_index == -1)
	5612	break;
	5613
	5614	l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
	5615	l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
	5616	l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
	5617
	5618	wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
	5619	}
	5620	wbp->cl_number = 0;
	5621
	5622	cl_len = cl_index;
	5623
	5624	/* skip switching to the sparse cluster mechanism if on diskimage */
	5625	if ( ((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) &&
	5626	!(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) ) {
	5627	int i;
	5628
	5629	/*
	5630	* determine if we appear to be writing the file sequentially
	5631	* if not, by returning without having pushed any clusters
	5632	* we will cause this vnode to be pushed into the sparse cluster mechanism
	5633	* used for managing more random I/O patterns
	5634	*
	5635	* we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
	5636	* that's why we're in try_push with PUSH_DELAY...
	5637	*
	5638	* check to make sure that all the clusters except the last one are 'full'... and that each cluster
	5639	* is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
	5640	* so we can just make a simple pass through, up to, but not including the last one...
	5641	* note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
	5642	* are sequential
	5643	*
	5644	* we let the last one be partial as long as it was adjacent to the previous one...
	5645	* we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
	5646	* of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
	5647	*/
	5648	for (i = 0; i < MAX_CLUSTERS - 1; i++) {
	5649	if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount)
	5650	goto dont_try;
	5651	if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
	5652	goto dont_try;
	5653	}
	5654	}
	5655	if (vm_initiated == TRUE)
	5656	lck_mtx_unlock(&wbp->cl_lockw);
	5657
	5658	for (cl_index = 0; cl_index < cl_len; cl_index++) {
	5659	int flags;
	5660	struct cl_extent cl;
	5661	int retval;
	5662
	5663	flags = io_flags & (IO_PASSIVE\|IO_CLOSE);
	5664
	5665	/*
	5666	* try to push each cluster in turn...
	5667	*/
	5668	if (l_clusters[cl_index].io_flags & CLW_IONOCACHE)
	5669	flags \|= IO_NOCACHE;
	5670
	5671	if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE)
	5672	flags \|= IO_PASSIVE;
	5673
	5674	if (push_flag & PUSH_SYNC)
	5675	flags \|= IO_SYNC;
	5676
	5677	cl.b_addr = l_clusters[cl_index].b_addr;
	5678	cl.e_addr = l_clusters[cl_index].e_addr;
	5679
	5680	retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
	5681
	5682	if (retval == 0) {
	5683	cl_pushed++;
	5684
	5685	l_clusters[cl_index].b_addr = 0;
	5686	l_clusters[cl_index].e_addr = 0;
	5687	} else if (error == 0) {
	5688	error = retval;
	5689	}
	5690
	5691	if ( !(push_flag & PUSH_ALL) )
	5692	break;
	5693	}
	5694	if (vm_initiated == TRUE)
	5695	lck_mtx_lock(&wbp->cl_lockw);
	5696
	5697	if (err)
	5698	*err = error;
	5699
	5700	dont_try:
	5701	if (cl_len > cl_pushed) {
	5702	/*
	5703	* we didn't push all of the clusters, so
	5704	* lets try to merge them back in to the vnode
	5705	*/
	5706	if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
	5707	/*
	5708	* we picked up some new clusters while we were trying to
	5709	* push the old ones... this can happen because I've dropped
	5710	* the vnode lock... the sum of the
	5711	* leftovers plus the new cluster count exceeds our ability
	5712	* to represent them, so switch to the sparse cluster mechanism
	5713	*
	5714	* collect the active public clusters...
	5715	*/
	5716	sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
	5717
	5718	for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
	5719	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
	5720	continue;
	5721	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	5722	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	5723	wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
	5724
	5725	cl_index1++;
	5726	}
	5727	/*
	5728	* update the cluster count
	5729	*/
	5730	wbp->cl_number = cl_index1;
	5731
	5732	/*
	5733	* and collect the original clusters that were moved into the
	5734	* local storage for sorting purposes
	5735	*/
	5736	sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
	5737
	5738	} else {
	5739	/*
	5740	* we've got room to merge the leftovers back in
	5741	* just append them starting at the next 'hole'
	5742	* represented by wbp->cl_number
	5743	*/
	5744	for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
	5745	if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
	5746	continue;
	5747
	5748	wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
	5749	wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
	5750	wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
	5751
	5752	cl_index1++;
	5753	}
	5754	/*
	5755	* update the cluster count
	5756	*/
	5757	wbp->cl_number = cl_index1;
	5758	}
	5759	}
	5760	return (MAX_CLUSTERS - wbp->cl_number);
	5761	}
	5762
	5763
	5764
	5765	static int
	5766	cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
	5767	int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	5768	{
	5769	upl_page_info_t *pl;
	5770	upl_t upl;
	5771	vm_offset_t upl_offset;
	5772	int upl_size;
	5773	off_t upl_f_offset;
	5774	int pages_in_upl;
	5775	int start_pg;
	5776	int last_pg;
	5777	int io_size;
	5778	int io_flags;
	5779	int upl_flags;
	5780	int bflag;
	5781	int size;
	5782	int error = 0;
	5783	int retval;
	5784	kern_return_t kret;
	5785
	5786	if (flags & IO_PASSIVE)
	5787	bflag = CL_PASSIVE;
	5788	else
	5789	bflag = 0;
	5790
	5791	if (flags & IO_SKIP_ENCRYPTION)
	5792	bflag \|= CL_ENCRYPTED;
	5793
	5794	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_START,
	5795	(int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
	5796
	5797	if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
	5798	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 0, 0, 0, 0);
	5799
	5800	return (0);
	5801	}
	5802	upl_size = pages_in_upl * PAGE_SIZE;
	5803	upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	5804
	5805	if (upl_f_offset + upl_size >= EOF) {
	5806
	5807	if (upl_f_offset >= EOF) {
	5808	/*
	5809	* must have truncated the file and missed
	5810	* clearing a dangling cluster (i.e. it's completely
	5811	* beyond the new EOF)
	5812	*/
	5813	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 1, 0, 0, 0);
	5814
	5815	return(0);
	5816	}
	5817	size = EOF - upl_f_offset;
	5818
	5819	upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
	5820	pages_in_upl = upl_size / PAGE_SIZE;
	5821	} else
	5822	size = upl_size;
	5823
	5824
	5825	if (vm_initiated) {
	5826	vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
	5827	UPL_MSYNC \| UPL_VNODE_PAGER \| UPL_KEEPCACHED, &error);
	5828
	5829	return (error);
	5830	}
	5831	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_START, upl_size, size, 0, 0, 0);
	5832
	5833	/*
	5834	* by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
	5835	*
	5836	* - only pages that are currently dirty are returned... these are the ones we need to clean
	5837	* - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
	5838	* - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
	5839	* - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
	5840	* someone dirties this page while the I/O is in progress, we don't lose track of the new state
	5841	*
	5842	* when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
	5843	*/
	5844
	5845	if ((vp->v_flag & VNOCACHE_DATA) \|\| (flags & IO_NOCACHE))
	5846	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE \| UPL_WILL_BE_DUMPED;
	5847	else
	5848	upl_flags = UPL_COPYOUT_FROM \| UPL_RET_ONLY_DIRTY \| UPL_SET_LITE;
	5849
	5850	kret = ubc_create_upl_kernel(vp,
	5851	upl_f_offset,
	5852	upl_size,
	5853	&upl,
	5854	&pl,
	5855	upl_flags,
	5856	VM_KERN_MEMORY_FILE);
	5857	if (kret != KERN_SUCCESS)
	5858	panic("cluster_push: failed to get pagelist");
	5859
	5860	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) \| DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
	5861
	5862	/*
	5863	* since we only asked for the dirty pages back
	5864	* it's possible that we may only get a few or even none, so...
	5865	* before we start marching forward, we must make sure we know
	5866	* where the last present page is in the UPL, otherwise we could
	5867	* end up working with a freed upl due to the FREE_ON_EMPTY semantics
	5868	* employed by commit_range and abort_range.
	5869	*/
	5870	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
	5871	if (upl_page_present(pl, last_pg))
	5872	break;
	5873	}
	5874	pages_in_upl = last_pg + 1;
	5875
	5876	if (pages_in_upl == 0) {
	5877	ubc_upl_abort(upl, 0);
	5878
	5879	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 2, 0, 0, 0);
	5880	return(0);
	5881	}
	5882
	5883	for (last_pg = 0; last_pg < pages_in_upl; ) {
	5884	/*
	5885	* find the next dirty page in the UPL
	5886	* this will become the first page in the
	5887	* next I/O to generate
	5888	*/
	5889	for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
	5890	if (upl_dirty_page(pl, start_pg))
	5891	break;
	5892	if (upl_page_present(pl, start_pg))
	5893	/*
	5894	* RET_ONLY_DIRTY will return non-dirty 'precious' pages
	5895	* just release these unchanged since we're not going
	5896	* to steal them or change their state
	5897	*/
	5898	ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
	5899	}
	5900	if (start_pg >= pages_in_upl)
	5901	/*
	5902	* done... no more dirty pages to push
	5903	*/
	5904	break;
	5905	if (start_pg > last_pg)
	5906	/*
	5907	* skipped over some non-dirty pages
	5908	*/
	5909	size -= ((start_pg - last_pg) * PAGE_SIZE);
	5910
	5911	/*
	5912	* find a range of dirty pages to write
	5913	*/
	5914	for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
	5915	if (!upl_dirty_page(pl, last_pg))
	5916	break;
	5917	}
	5918	upl_offset = start_pg * PAGE_SIZE;
	5919
	5920	io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
	5921
	5922	io_flags = CL_THROTTLE \| CL_COMMIT \| CL_AGE \| bflag;
	5923
	5924	if ( !(flags & IO_SYNC))
	5925	io_flags \|= CL_ASYNC;
	5926
	5927	if (flags & IO_CLOSE)
	5928	io_flags \|= CL_CLOSE;
	5929
	5930	if (flags & IO_NOCACHE)
	5931	io_flags \|= CL_NOCACHE;
	5932
	5933	retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
	5934	io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	5935
	5936	if (error == 0 && retval)
	5937	error = retval;
	5938
	5939	size -= io_size;
	5940	}
	5941	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) \| DBG_FUNC_END, 1, 3, error, 0, 0);
	5942
	5943	return(error);
	5944	}
	5945
	5946
	5947	/*
	5948	* sparse_cluster_switch is called with the write behind lock held
	5949	*/
	5950	static int
	5951	sparse_cluster_switch(struct cl_writebehind wbp, vnode_t vp, off_t EOF, int (callback)(buf_t, void ), void callback_arg, boolean_t vm_initiated)
	5952	{
	5953	int cl_index;
	5954	int error;
	5955
	5956	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
	5957
	5958	for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
	5959	int flags;
	5960	struct cl_extent cl;
	5961
	5962	for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
	5963
	5964	if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
	5965	if (flags & UPL_POP_DIRTY) {
	5966	cl.e_addr = cl.b_addr + 1;
	5967
	5968	error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
	5969
	5970	if (error) {
	5971	break;
	5972	}
	5973	}
	5974	}
	5975	}
	5976	}
	5977	wbp->cl_number -= cl_index;
	5978
	5979	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) \| DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
	5980
	5981	return error;
	5982	}
	5983
	5984
	5985	/*
	5986	* sparse_cluster_push must be called with the write-behind lock held if the scmap is
	5987	* still associated with the write-behind context... however, if the scmap has been disassociated
	5988	* from the write-behind context (the cluster_push case), the wb lock is not held
	5989	*/
	5990	static int
	5991	sparse_cluster_push(struct cl_writebehind wbp, void *scmap, vnode_t vp, off_t EOF, int push_flag,
	5992	int io_flags, int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	5993	{
	5994	struct cl_extent cl;
	5995	off_t offset;
	5996	u_int length;
	5997	void *l_scmap;
	5998	int error = 0;
	5999
	6000	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
	6001
	6002	if (push_flag & PUSH_ALL)
	6003	vfs_drt_control(scmap, 1);
	6004
	6005	l_scmap = *scmap;
	6006
	6007	for (;;) {
	6008	int retval;
	6009
	6010	if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
	6011	break;
	6012
	6013	if (vm_initiated == TRUE)
	6014	lck_mtx_unlock(&wbp->cl_lockw);
	6015
	6016	cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
	6017	cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
	6018
	6019	retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
	6020	if (error == 0 && retval)
	6021	error = retval;
	6022
	6023	if (vm_initiated == TRUE) {
	6024	lck_mtx_lock(&wbp->cl_lockw);
	6025
	6026	if (*scmap != l_scmap)
	6027	break;
	6028	}
	6029
	6030	if (error) {
	6031	if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
	6032	panic("Failed to restore dirty state on failure\n");
	6033	}
	6034
	6035	break;
	6036	}
	6037
	6038	if ( !(push_flag & PUSH_ALL)) {
	6039	break;
	6040	}
	6041	}
	6042	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) \| DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
	6043
	6044	return error;
	6045	}
	6046
	6047
	6048	/*
	6049	* sparse_cluster_add is called with the write behind lock held
	6050	*/
	6051	static int
	6052	sparse_cluster_add(struct cl_writebehind wbp, void scmap, vnode_t vp, struct cl_extent cl, off_t EOF,
	6053	int (callback)(buf_t, void ), void *callback_arg, boolean_t vm_initiated)
	6054	{
	6055	u_int new_dirty;
	6056	u_int length;
	6057	off_t offset;
	6058	int error;
	6059
	6060	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
	6061
	6062	offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
	6063	length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
	6064
	6065	while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
	6066	/*
	6067	* no room left in the map
	6068	* only a partial update was done
	6069	* push out some pages and try again
	6070	*/
	6071	error = sparse_cluster_push(wbp, scmap, vp, EOF, 0, 0, callback, callback_arg, vm_initiated);
	6072
	6073	if (error) {
	6074	break;
	6075	}
	6076
	6077	offset += (new_dirty * PAGE_SIZE_64);
	6078	length -= (new_dirty * PAGE_SIZE);
	6079	}
	6080	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) \| DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
	6081
	6082	return error;
	6083	}
	6084
	6085
	6086	static int
	6087	cluster_align_phys_io(vnode_t vp, struct uio uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (callback)(buf_t, void ), void callback_arg)
	6088	{
	6089	upl_page_info_t *pl;
	6090	upl_t upl;
	6091	addr64_t ubc_paddr;
	6092	kern_return_t kret;
	6093	int error = 0;
	6094	int did_read = 0;
	6095	int abort_flags;
	6096	int upl_flags;
	6097	int bflag;
	6098
	6099	if (flags & IO_PASSIVE)
	6100	bflag = CL_PASSIVE;
	6101	else
	6102	bflag = 0;
	6103
	6104	if (flags & IO_NOCACHE)
	6105	bflag \|= CL_NOCACHE;
	6106
	6107	upl_flags = UPL_SET_LITE;
	6108
	6109	if ( !(flags & CL_READ) ) {
	6110	/*
	6111	* "write" operation: let the UPL subsystem know
	6112	* that we intend to modify the buffer cache pages
	6113	* we're gathering.
	6114	*/
	6115	upl_flags \|= UPL_WILL_MODIFY;
	6116	} else {
	6117	/*
	6118	* indicate that there is no need to pull the
	6119	* mapping for this page... we're only going
	6120	* to read from it, not modify it.
	6121	*/
	6122	upl_flags \|= UPL_FILE_IO;
	6123	}
	6124	kret = ubc_create_upl_kernel(vp,
	6125	uio->uio_offset & ~PAGE_MASK_64,
	6126	PAGE_SIZE,
	6127	&upl,
	6128	&pl,
	6129	upl_flags,
	6130	VM_KERN_MEMORY_FILE);
	6131
	6132	if (kret != KERN_SUCCESS)
	6133	return(EINVAL);
	6134
	6135	if (!upl_valid_page(pl, 0)) {
	6136	/*
	6137	* issue a synchronous read to cluster_io
	6138	*/
	6139	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	6140	CL_READ \| bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6141	if (error) {
	6142	ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES \| UPL_ABORT_FREE_ON_EMPTY);
	6143
	6144	return(error);
	6145	}
	6146	did_read = 1;
	6147	}
	6148	ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
	6149
	6150	/*
	6151	* NOTE: There is no prototype for the following in BSD. It, and the definitions
	6152	* of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
	6153	* osfmk/ppc/mappings.h. They are not included here because there appears to be no
	6154	* way to do so without exporting them to kexts as well.
	6155	*/
	6156	if (flags & CL_READ)
	6157	// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsnk); /* Copy physical to physical and flush the destination */
	6158	copypv(ubc_paddr, usr_paddr, xsize, 2 \| 1 \| 4); /* Copy physical to physical and flush the destination */
	6159	else
	6160	// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc \| cppvPsnk \| cppvFsrc); /* Copy physical to physical and flush the source */
	6161	copypv(usr_paddr, ubc_paddr, xsize, 2 \| 1 \| 8); /* Copy physical to physical and flush the source */
	6162
	6163	if ( !(flags & CL_READ) \|\| (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
	6164	/*
	6165	* issue a synchronous write to cluster_io
	6166	*/
	6167	error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
	6168	bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
	6169	}
	6170	if (error == 0)
	6171	uio_update(uio, (user_size_t)xsize);
	6172
	6173	if (did_read)
	6174	abort_flags = UPL_ABORT_FREE_ON_EMPTY;
	6175	else
	6176	abort_flags = UPL_ABORT_FREE_ON_EMPTY \| UPL_ABORT_DUMP_PAGES;
	6177
	6178	ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
	6179
	6180	return (error);
	6181	}
	6182
	6183	int
	6184	cluster_copy_upl_data(struct uio uio, upl_t upl, int upl_offset, int io_resid)
	6185	{
	6186	int pg_offset;
	6187	int pg_index;
	6188	int csize;
	6189	int segflg;
	6190	int retval = 0;
	6191	int xsize;
	6192	upl_page_info_t *pl;
	6193	int dirty_count;
	6194
	6195	xsize = *io_resid;
	6196
	6197	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	6198	(int)uio->uio_offset, upl_offset, xsize, 0, 0);
	6199
	6200	segflg = uio->uio_segflg;
	6201
	6202	switch(segflg) {
	6203
	6204	case UIO_USERSPACE32:
	6205	case UIO_USERISPACE32:
	6206	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	6207	break;
	6208
	6209	case UIO_USERSPACE:
	6210	case UIO_USERISPACE:
	6211	uio->uio_segflg = UIO_PHYS_USERSPACE;
	6212	break;
	6213
	6214	case UIO_USERSPACE64:
	6215	case UIO_USERISPACE64:
	6216	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	6217	break;
	6218
	6219	case UIO_SYSSPACE:
	6220	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	6221	break;
	6222
	6223	}
	6224	pl = ubc_upl_pageinfo(upl);
	6225
	6226	pg_index = upl_offset / PAGE_SIZE;
	6227	pg_offset = upl_offset & PAGE_MASK;
	6228	csize = min(PAGE_SIZE - pg_offset, xsize);
	6229
	6230	dirty_count = 0;
	6231	while (xsize && retval == 0) {
	6232	addr64_t paddr;
	6233
	6234	paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
	6235	if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE))
	6236	dirty_count++;
	6237
	6238	retval = uiomove64(paddr, csize, uio);
	6239
	6240	pg_index += 1;
	6241	pg_offset = 0;
	6242	xsize -= csize;
	6243	csize = min(PAGE_SIZE, xsize);
	6244	}
	6245	*io_resid = xsize;
	6246
	6247	uio->uio_segflg = segflg;
	6248
	6249	task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
	6250	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6251	(int)uio->uio_offset, xsize, retval, segflg, 0);
	6252
	6253	return (retval);
	6254	}
	6255
	6256
	6257	int
	6258	cluster_copy_ubc_data(vnode_t vp, struct uio uio, int io_resid, int mark_dirty)
	6259	{
	6260
	6261	return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1));
	6262	}
	6263
	6264
	6265	static int
	6266	cluster_copy_ubc_data_internal(vnode_t vp, struct uio uio, int io_resid, int mark_dirty, int take_reference)
	6267	{
	6268	int segflg;
	6269	int io_size;
	6270	int xsize;
	6271	int start_offset;
	6272	int retval = 0;
	6273	memory_object_control_t control;
	6274
	6275	io_size = *io_resid;
	6276
	6277	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_START,
	6278	(int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
	6279
	6280	control = ubc_getobject(vp, UBC_FLAGS_NONE);
	6281
	6282	if (control == MEMORY_OBJECT_CONTROL_NULL) {
	6283	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6284	(int)uio->uio_offset, io_size, retval, 3, 0);
	6285
	6286	return(0);
	6287	}
	6288	segflg = uio->uio_segflg;
	6289
	6290	switch(segflg) {
	6291
	6292	case UIO_USERSPACE32:
	6293	case UIO_USERISPACE32:
	6294	uio->uio_segflg = UIO_PHYS_USERSPACE32;
	6295	break;
	6296
	6297	case UIO_USERSPACE64:
	6298	case UIO_USERISPACE64:
	6299	uio->uio_segflg = UIO_PHYS_USERSPACE64;
	6300	break;
	6301
	6302	case UIO_USERSPACE:
	6303	case UIO_USERISPACE:
	6304	uio->uio_segflg = UIO_PHYS_USERSPACE;
	6305	break;
	6306
	6307	case UIO_SYSSPACE:
	6308	uio->uio_segflg = UIO_PHYS_SYSSPACE;
	6309	break;
	6310	}
	6311
	6312	if ( (io_size = *io_resid) ) {
	6313	start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
	6314	xsize = uio_resid(uio);
	6315
	6316	retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
	6317	start_offset, io_size, mark_dirty, take_reference);
	6318	xsize -= uio_resid(uio);
	6319	io_size -= xsize;
	6320	}
	6321	uio->uio_segflg = segflg;
	6322	*io_resid = io_size;
	6323
	6324	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) \| DBG_FUNC_END,
	6325	(int)uio->uio_offset, io_size, retval, 0x80000000 \| segflg, 0);
	6326
	6327	return(retval);
	6328	}
	6329
	6330
	6331	int
	6332	is_file_clean(vnode_t vp, off_t filesize)
	6333	{
	6334	off_t f_offset;
	6335	int flags;
	6336	int total_dirty = 0;
	6337
	6338	for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
	6339	if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
	6340	if (flags & UPL_POP_DIRTY) {
	6341	total_dirty++;
	6342	}
	6343	}
	6344	}
	6345	if (total_dirty)
	6346	return(EINVAL);
	6347
	6348	return (0);
	6349	}
	6350
	6351
	6352
	6353	/*
	6354	* Dirty region tracking/clustering mechanism.
	6355	*
	6356	* This code (vfs_drt_*) provides a mechanism for tracking and clustering
	6357	* dirty regions within a larger space (file). It is primarily intended to
	6358	* support clustering in large files with many dirty areas.
	6359	*
	6360	* The implementation assumes that the dirty regions are pages.
	6361	*
	6362	* To represent dirty pages within the file, we store bit vectors in a
	6363	* variable-size circular hash.
	6364	*/
	6365
	6366	/*
	6367	* Bitvector size. This determines the number of pages we group in a
	6368	* single hashtable entry. Each hashtable entry is aligned to this
	6369	* size within the file.
	6370	*/
	6371	#define DRT_BITVECTOR_PAGES ((1024 * 256) / PAGE_SIZE)
	6372
	6373	/*
	6374	* File offset handling.
	6375	*
	6376	* DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
	6377	* the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
	6378	*/
	6379	#define DRT_ADDRESS_MASK (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
	6380	#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
	6381
	6382	/*
	6383	* Hashtable address field handling.
	6384	*
	6385	* The low-order bits of the hashtable address are used to conserve
	6386	* space.
	6387	*
	6388	* DRT_HASH_COUNT_MASK must be large enough to store the range
	6389	* 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
	6390	* to indicate that the bucket is actually unoccupied.
	6391	*/
	6392	#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
	6393	#define DRT_HASH_SET_ADDRESS(scm, i, a) \
	6394	do { \
	6395	(scm)->scm_hashtable[(i)].dhe_control = \
	6396	((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) \| DRT_ALIGN_ADDRESS(a); \
	6397	} while (0)
	6398	#define DRT_HASH_COUNT_MASK 0x1ff
	6399	#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
	6400	#define DRT_HASH_SET_COUNT(scm, i, c) \
	6401	do { \
	6402	(scm)->scm_hashtable[(i)].dhe_control = \
	6403	((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) \| ((c) & DRT_HASH_COUNT_MASK); \
	6404	} while (0)
	6405	#define DRT_HASH_CLEAR(scm, i) \
	6406	do { \
	6407	(scm)->scm_hashtable[(i)].dhe_control = 0; \
	6408	} while (0)
	6409	#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
	6410	#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
	6411	#define DRT_HASH_COPY(oscm, oi, scm, i) \
	6412	do { \
	6413	(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
	6414	DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
	6415	} while(0);
	6416
	6417
	6418	#if CONFIG_EMBEDDED
	6419	/*
	6420	* Hash table moduli.
	6421	*
	6422	* Since the hashtable entry's size is dependent on the size of
	6423	* the bitvector, and since the hashtable size is constrained to
	6424	* both being prime and fitting within the desired allocation
	6425	* size, these values need to be manually determined.
	6426	*
	6427	* For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
	6428	*
	6429	* The small hashtable allocation is 4096 bytes, so the modulus is 251.
	6430	* The large hashtable allocation is 32768 bytes, so the modulus is 2039.
	6431	*/
	6432
	6433	#define DRT_HASH_SMALL_MODULUS 251
	6434	#define DRT_HASH_LARGE_MODULUS 2039
	6435
	6436	/*
	6437	* Physical memory required before the large hash modulus is permitted.
	6438	*
	6439	* On small memory systems, the large hash modulus can lead to phsyical
	6440	* memory starvation, so we avoid using it there.
	6441	*/
	6442	#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
	6443
	6444	#define DRT_SMALL_ALLOCATION 4096 /* 80 bytes spare */
	6445	#define DRT_LARGE_ALLOCATION 32768 /* 144 bytes spare */
	6446
	6447	#else
	6448	/*
	6449	* Hash table moduli.
	6450	*
	6451	* Since the hashtable entry's size is dependent on the size of
	6452	* the bitvector, and since the hashtable size is constrained to
	6453	* both being prime and fitting within the desired allocation
	6454	* size, these values need to be manually determined.
	6455	*
	6456	* For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
	6457	*
	6458	* The small hashtable allocation is 16384 bytes, so the modulus is 1019.
	6459	* The large hashtable allocation is 131072 bytes, so the modulus is 8179.
	6460	*/
	6461
	6462	#define DRT_HASH_SMALL_MODULUS 1019
	6463	#define DRT_HASH_LARGE_MODULUS 8179
	6464
	6465	/*
	6466	* Physical memory required before the large hash modulus is permitted.
	6467	*
	6468	* On small memory systems, the large hash modulus can lead to phsyical
	6469	* memory starvation, so we avoid using it there.
	6470	*/
	6471	#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL) /* 4GiB */
	6472
	6473	#define DRT_SMALL_ALLOCATION 16384 /* 80 bytes spare */
	6474	#define DRT_LARGE_ALLOCATION 131072 /* 208 bytes spare */
	6475
	6476	#endif
	6477
	6478	/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
	6479
	6480	/*
	6481	* Hashtable entry.
	6482	*/
	6483	struct vfs_drt_hashentry {
	6484	u_int64_t dhe_control;
	6485	/*
	6486	* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
	6487	* DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
	6488	* Since PAGE_SIZE is only known at boot time,
	6489	* -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
	6490	* -declare dhe_bitvector array for largest possible length
	6491	*/
	6492	#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
	6493	u_int32_t dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32];
	6494	};
	6495
	6496	/*
	6497	* Hashtable bitvector handling.
	6498	*
	6499	* Bitvector fields are 32 bits long.
	6500	*/
	6501
	6502	#define DRT_HASH_SET_BIT(scm, i, bit) \
	6503	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] \|= (1 << ((bit) % 32))
	6504
	6505	#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
	6506	(scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
	6507
	6508	#define DRT_HASH_TEST_BIT(scm, i, bit) \
	6509	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
	6510
	6511	#define DRT_BITVECTOR_CLEAR(scm, i) \
	6512	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	6513
	6514	#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
	6515	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
	6516	&(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
	6517	(MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
	6518
	6519	/*
	6520	* Dirty Region Tracking structure.
	6521	*
	6522	* The hashtable is allocated entirely inside the DRT structure.
	6523	*
	6524	* The hash is a simple circular prime modulus arrangement, the structure
	6525	* is resized from small to large if it overflows.
	6526	*/
	6527
	6528	struct vfs_drt_clustermap {
	6529	u_int32_t scm_magic; /* sanity/detection */
	6530	#define DRT_SCM_MAGIC 0x12020003
	6531	u_int32_t scm_modulus; /* current ring size */
	6532	u_int32_t scm_buckets; /* number of occupied buckets */
	6533	u_int32_t scm_lastclean; /* last entry we cleaned */
	6534	u_int32_t scm_iskips; /* number of slot skips */
	6535
	6536	struct vfs_drt_hashentry scm_hashtable[0];
	6537	};
	6538
	6539
	6540	#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
	6541	#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
	6542
	6543	/*
	6544	* Debugging codes and arguments.
	6545	*/
	6546	#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
	6547	#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
	6548	#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
	6549	#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
	6550	#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
	6551	* dirty */
	6552	/* 0, setcount */
	6553	/* 1 (clean, no map) */
	6554	/* 2 (map alloc fail) */
	6555	/* 3, resid (partial) */
	6556	#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
	6557	#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
	6558	* lastclean, iskips */
	6559
	6560
	6561	static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
	6562	static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
	6563	static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
	6564	u_int64_t offset, int *indexp);
	6565	static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
	6566	u_int64_t offset,
	6567	int *indexp,
	6568	int recursed);
	6569	static kern_return_t vfs_drt_do_mark_pages(
	6570	void **cmapp,
	6571	u_int64_t offset,
	6572	u_int length,
	6573	u_int *setcountp,
	6574	int dirty);
	6575	static void vfs_drt_trace(
	6576	struct vfs_drt_clustermap *cmap,
	6577	int code,
	6578	int arg1,
	6579	int arg2,
	6580	int arg3,
	6581	int arg4);
	6582
	6583
	6584	/*
	6585	* Allocate and initialise a sparse cluster map.
	6586	*
	6587	* Will allocate a new map, resize or compact an existing map.
	6588	*
	6589	* XXX we should probably have at least one intermediate map size,
	6590	* as the 1:16 ratio seems a bit drastic.
	6591	*/
	6592	static kern_return_t
	6593	vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
	6594	{
	6595	struct vfs_drt_clustermap cmap, ocmap;
	6596	kern_return_t kret;
	6597	u_int64_t offset;
	6598	u_int32_t i;
	6599	int nsize, active_buckets, index, copycount;
	6600
	6601	ocmap = NULL;
	6602	if (cmapp != NULL)
	6603	ocmap = *cmapp;
	6604
	6605	/*
	6606	* Decide on the size of the new map.
	6607	*/
	6608	if (ocmap == NULL) {
	6609	nsize = DRT_HASH_SMALL_MODULUS;
	6610	} else {
	6611	/* count the number of active buckets in the old map */
	6612	active_buckets = 0;
	6613	for (i = 0; i < ocmap->scm_modulus; i++) {
	6614	if (!DRT_HASH_VACANT(ocmap, i) &&
	6615	(DRT_HASH_GET_COUNT(ocmap, i) != 0))
	6616	active_buckets++;
	6617	}
	6618	/*
	6619	* If we're currently using the small allocation, check to
	6620	* see whether we should grow to the large one.
	6621	*/
	6622	if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
	6623	/*
	6624	* If the ring is nearly full and we are allowed to
	6625	* use the large modulus, upgrade.
	6626	*/
	6627	if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
	6628	(max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
	6629	nsize = DRT_HASH_LARGE_MODULUS;
	6630	} else {
	6631	nsize = DRT_HASH_SMALL_MODULUS;
	6632	}
	6633	} else {
	6634	/* already using the large modulus */
	6635	nsize = DRT_HASH_LARGE_MODULUS;
	6636	/*
	6637	* If the ring is completely full, there's
	6638	* nothing useful for us to do. Behave as
	6639	* though we had compacted into the new
	6640	* array and return.
	6641	*/
	6642	if (active_buckets >= DRT_HASH_LARGE_MODULUS)
	6643	return(KERN_SUCCESS);
	6644	}
	6645	}
	6646
	6647	/*
	6648	* Allocate and initialise the new map.
	6649	*/
	6650
	6651	kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
	6652	(nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION, VM_KERN_MEMORY_FILE);
	6653	if (kret != KERN_SUCCESS)
	6654	return(kret);
	6655	cmap->scm_magic = DRT_SCM_MAGIC;
	6656	cmap->scm_modulus = nsize;
	6657	cmap->scm_buckets = 0;
	6658	cmap->scm_lastclean = 0;
	6659	cmap->scm_iskips = 0;
	6660	for (i = 0; i < cmap->scm_modulus; i++) {
	6661	DRT_HASH_CLEAR(cmap, i);
	6662	DRT_HASH_VACATE(cmap, i);
	6663	DRT_BITVECTOR_CLEAR(cmap, i);
	6664	}
	6665
	6666	/*
	6667	* If there's an old map, re-hash entries from it into the new map.
	6668	*/
	6669	copycount = 0;
	6670	if (ocmap != NULL) {
	6671	for (i = 0; i < ocmap->scm_modulus; i++) {
	6672	/* skip empty buckets */
	6673	if (DRT_HASH_VACANT(ocmap, i) \|\|
	6674	(DRT_HASH_GET_COUNT(ocmap, i) == 0))
	6675	continue;
	6676	/* get new index */
	6677	offset = DRT_HASH_GET_ADDRESS(ocmap, i);
	6678	kret = vfs_drt_get_index(&cmap, offset, &index, 1);
	6679	if (kret != KERN_SUCCESS) {
	6680	/* XXX need to bail out gracefully here */
	6681	panic("vfs_drt: new cluster map mysteriously too small");
	6682	index = 0;
	6683	}
	6684	/* copy */
	6685	DRT_HASH_COPY(ocmap, i, cmap, index);
	6686	copycount++;
	6687	}
	6688	}
	6689
	6690	/* log what we've done */
	6691	vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
	6692
	6693	/*
	6694	* It's important to ensure that *cmapp always points to
	6695	* a valid map, so we must overwrite it before freeing
	6696	* the old map.
	6697	*/
	6698	*cmapp = cmap;
	6699	if (ocmap != NULL) {
	6700	/* emit stats into trace buffer */
	6701	vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
	6702	ocmap->scm_modulus,
	6703	ocmap->scm_buckets,
	6704	ocmap->scm_lastclean,
	6705	ocmap->scm_iskips);
	6706
	6707	vfs_drt_free_map(ocmap);
	6708	}
	6709	return(KERN_SUCCESS);
	6710	}
	6711
	6712
	6713	/*
	6714	* Free a sparse cluster map.
	6715	*/
	6716	static kern_return_t
	6717	vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
	6718	{
	6719	kmem_free(kernel_map, (vm_offset_t)cmap,
	6720	(cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
	6721	return(KERN_SUCCESS);
	6722	}
	6723
	6724
	6725	/*
	6726	* Find the hashtable slot currently occupied by an entry for the supplied offset.
	6727	*/
	6728	static kern_return_t
	6729	vfs_drt_search_index(struct vfs_drt_clustermap cmap, u_int64_t offset, int indexp)
	6730	{
	6731	int index;
	6732	u_int32_t i;
	6733
	6734	offset = DRT_ALIGN_ADDRESS(offset);
	6735	index = DRT_HASH(cmap, offset);
	6736
	6737	/* traverse the hashtable */
	6738	for (i = 0; i < cmap->scm_modulus; i++) {
	6739
	6740	/*
	6741	* If the slot is vacant, we can stop.
	6742	*/
	6743	if (DRT_HASH_VACANT(cmap, index))
	6744	break;
	6745
	6746	/*
	6747	* If the address matches our offset, we have success.
	6748	*/
	6749	if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
	6750	*indexp = index;
	6751	return(KERN_SUCCESS);
	6752	}
	6753
	6754	/*
	6755	* Move to the next slot, try again.
	6756	*/
	6757	index = DRT_HASH_NEXT(cmap, index);
	6758	}
	6759	/*
	6760	* It's not there.
	6761	*/
	6762	return(KERN_FAILURE);
	6763	}
	6764
	6765	/*
	6766	* Find the hashtable slot for the supplied offset. If we haven't allocated
	6767	* one yet, allocate one and populate the address field. Note that it will
	6768	* not have a nonzero page count and thus will still technically be free, so
	6769	* in the case where we are called to clean pages, the slot will remain free.
	6770	*/
	6771	static kern_return_t
	6772	vfs_drt_get_index(struct vfs_drt_clustermap *cmapp, u_int64_t offset, int indexp, int recursed)
	6773	{
	6774	struct vfs_drt_clustermap *cmap;
	6775	kern_return_t kret;
	6776	u_int32_t index;
	6777	u_int32_t i;
	6778
	6779	cmap = *cmapp;
	6780
	6781	/* look for an existing entry */
	6782	kret = vfs_drt_search_index(cmap, offset, indexp);
	6783	if (kret == KERN_SUCCESS)
	6784	return(kret);
	6785
	6786	/* need to allocate an entry */
	6787	offset = DRT_ALIGN_ADDRESS(offset);
	6788	index = DRT_HASH(cmap, offset);
	6789
	6790	/* scan from the index forwards looking for a vacant slot */
	6791	for (i = 0; i < cmap->scm_modulus; i++) {
	6792	/* slot vacant? */
	6793	if (DRT_HASH_VACANT(cmap, index) \|\| DRT_HASH_GET_COUNT(cmap,index) == 0) {
	6794	cmap->scm_buckets++;
	6795	if (index < cmap->scm_lastclean)
	6796	cmap->scm_lastclean = index;
	6797	DRT_HASH_SET_ADDRESS(cmap, index, offset);
	6798	DRT_HASH_SET_COUNT(cmap, index, 0);
	6799	DRT_BITVECTOR_CLEAR(cmap, index);
	6800	*indexp = index;
	6801	vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
	6802	return(KERN_SUCCESS);
	6803	}
	6804	cmap->scm_iskips += i;
	6805	index = DRT_HASH_NEXT(cmap, index);
	6806	}
	6807
	6808	/*
	6809	* We haven't found a vacant slot, so the map is full. If we're not
	6810	* already recursed, try reallocating/compacting it.
	6811	*/
	6812	if (recursed)
	6813	return(KERN_FAILURE);
	6814	kret = vfs_drt_alloc_map(cmapp);
	6815	if (kret == KERN_SUCCESS) {
	6816	/* now try to insert again */
	6817	kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
	6818	}
	6819	return(kret);
	6820	}
	6821
	6822	/*
	6823	* Implementation of set dirty/clean.
	6824	*
	6825	* In the 'clean' case, not finding a map is OK.
	6826	*/
	6827	static kern_return_t
	6828	vfs_drt_do_mark_pages(
	6829	void **private,
	6830	u_int64_t offset,
	6831	u_int length,
	6832	u_int *setcountp,
	6833	int dirty)
	6834	{
	6835	struct vfs_drt_clustermap cmap, *cmapp;
	6836	kern_return_t kret;
	6837	int i, index, pgoff, pgcount, setcount, ecount;
	6838
	6839	cmapp = (struct vfs_drt_clustermap **)private;
	6840	cmap = *cmapp;
	6841
	6842	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
	6843
	6844	if (setcountp != NULL)
	6845	*setcountp = 0;
	6846
	6847	/* allocate a cluster map if we don't already have one */
	6848	if (cmap == NULL) {
	6849	/* no cluster map, nothing to clean */
	6850	if (!dirty) {
	6851	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 1, 0, 0, 0);
	6852	return(KERN_SUCCESS);
	6853	}
	6854	kret = vfs_drt_alloc_map(cmapp);
	6855	if (kret != KERN_SUCCESS) {
	6856	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 2, 0, 0, 0);
	6857	return(kret);
	6858	}
	6859	}
	6860	setcount = 0;
	6861
	6862	/*
	6863	* Iterate over the length of the region.
	6864	*/
	6865	while (length > 0) {
	6866	/*
	6867	* Get the hashtable index for this offset.
	6868	*
	6869	* XXX this will add blank entries if we are clearing a range
	6870	* that hasn't been dirtied.
	6871	*/
	6872	kret = vfs_drt_get_index(cmapp, offset, &index, 0);
	6873	cmap = cmapp; / may have changed! */
	6874	/* this may be a partial-success return */
	6875	if (kret != KERN_SUCCESS) {
	6876	if (setcountp != NULL)
	6877	*setcountp = setcount;
	6878	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 3, (int)length, 0, 0);
	6879
	6880	return(kret);
	6881	}
	6882
	6883	/*
	6884	* Work out how many pages we're modifying in this
	6885	* hashtable entry.
	6886	*/
	6887	pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
	6888	pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
	6889
	6890	/*
	6891	* Iterate over pages, dirty/clearing as we go.
	6892	*/
	6893	ecount = DRT_HASH_GET_COUNT(cmap, index);
	6894	for (i = 0; i < pgcount; i++) {
	6895	if (dirty) {
	6896	if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	6897	if (ecount >= DRT_BITVECTOR_PAGES)
	6898	panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i);
	6899	DRT_HASH_SET_BIT(cmap, index, pgoff + i);
	6900	ecount++;
	6901	setcount++;
	6902	}
	6903	} else {
	6904	if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
	6905	if (ecount <= 0)
	6906	panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i);
	6907	assert(ecount > 0);
	6908	DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
	6909	ecount--;
	6910	setcount++;
	6911	}
	6912	}
	6913	}
	6914	DRT_HASH_SET_COUNT(cmap, index, ecount);
	6915
	6916	offset += pgcount * PAGE_SIZE;
	6917	length -= pgcount * PAGE_SIZE;
	6918	}
	6919	if (setcountp != NULL)
	6920	*setcountp = setcount;
	6921
	6922	vfs_drt_trace(cmap, DRT_DEBUG_MARK \| DBG_FUNC_END, 0, setcount, 0, 0);
	6923
	6924	return(KERN_SUCCESS);
	6925	}
	6926
	6927	/*
	6928	* Mark a set of pages as dirty/clean.
	6929	*
	6930	* This is a public interface.
	6931	*
	6932	* cmapp
	6933	* Pointer to storage suitable for holding a pointer. Note that
	6934	* this must either be NULL or a value set by this function.
	6935	*
	6936	* size
	6937	* Current file size in bytes.
	6938	*
	6939	* offset
	6940	* Offset of the first page to be marked as dirty, in bytes. Must be
	6941	* page-aligned.
	6942	*
	6943	* length
	6944	* Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
	6945	*
	6946	* setcountp
	6947	* Number of pages newly marked dirty by this call (optional).
	6948	*
	6949	* Returns KERN_SUCCESS if all the pages were successfully marked.
	6950	*/
	6951	static kern_return_t
	6952	vfs_drt_mark_pages(void *cmapp, off_t offset, u_int length, u_int setcountp)
	6953	{
	6954	/* XXX size unused, drop from interface */
	6955	return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
	6956	}
	6957
	#if 0
	/*
	 * Mark a set of pages as clean.  Compiled out; the counterpart of
	 * vfs_drt_mark_pages() that does not report a changed-page count.
	 */
	static kern_return_t
	vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
	{
		kern_return_t	kret;

		kret = vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
		return(kret);
	}
	#endif
	6965
	6966	/*
	6967	* Get a cluster of dirty pages.
	6968	*
	6969	* This is a public interface.
	6970	*
	6971	* cmapp
	6972	* Pointer to storage managed by drt_mark_pages. Note that this must
	6973	* be NULL or a value set by drt_mark_pages.
	6974	*
	6975	* offsetp
	6976	* Returns the byte offset into the file of the first page in the cluster.
	6977	*
	6978	* lengthp
	6979	* Returns the length in bytes of the cluster of dirty pages.
	6980	*
	6981	* Returns success if a cluster was found. If KERN_FAILURE is returned, there
	6982	* are no dirty pages meeting the minmum size criteria. Private storage will
	6983	* be released if there are no more dirty pages left in the map
	6984	*
	6985	*/
	6986	static kern_return_t
	6987	vfs_drt_get_cluster(void *cmapp, off_t offsetp, u_int *lengthp)
	6988	{
	6989	struct vfs_drt_clustermap *cmap;
	6990	u_int64_t offset;
	6991	u_int length;
	6992	u_int32_t j;
	6993	int index, i, fs, ls;
	6994
	6995	/* sanity */
	6996	if ((cmapp == NULL) \|\| (*cmapp == NULL))
	6997	return(KERN_FAILURE);
	6998	cmap = *cmapp;
	6999
	7000	/* walk the hashtable */
	7001	for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
	7002	index = DRT_HASH(cmap, offset);
	7003
	7004	if (DRT_HASH_VACANT(cmap, index) \|\| (DRT_HASH_GET_COUNT(cmap, index) == 0))
	7005	continue;
	7006
	7007	/* scan the bitfield for a string of bits */
	7008	fs = -1;
	7009
	7010	for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
	7011	if (DRT_HASH_TEST_BIT(cmap, index, i)) {
	7012	fs = i;
	7013	break;
	7014	}
	7015	}
	7016	if (fs == -1) {
	7017	/* didn't find any bits set */
	7018	panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
	7019	cmap, index, DRT_HASH_GET_COUNT(cmap, index));
	7020	}
	7021	for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
	7022	if (!DRT_HASH_TEST_BIT(cmap, index, i))
	7023	break;
	7024	}
	7025
	7026	/* compute offset and length, mark pages clean */
	7027	offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
	7028	length = ls * PAGE_SIZE;
	7029	vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
	7030	cmap->scm_lastclean = index;
	7031
	7032	/* return successful */
	7033	*offsetp = (off_t)offset;
	7034	*lengthp = length;
	7035
	7036	vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
	7037	return(KERN_SUCCESS);
	7038	}
	7039	/*
	7040	* We didn't find anything... hashtable is empty
	7041	* emit stats into trace buffer and
	7042	* then free it
	7043	*/
	7044	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	7045	cmap->scm_modulus,
	7046	cmap->scm_buckets,
	7047	cmap->scm_lastclean,
	7048	cmap->scm_iskips);
	7049
	7050	vfs_drt_free_map(cmap);
	7051	*cmapp = NULL;
	7052
	7053	return(KERN_FAILURE);
	7054	}
	7055
	7056
	7057	static kern_return_t
	7058	vfs_drt_control(void **cmapp, int op_type)
	7059	{
	7060	struct vfs_drt_clustermap *cmap;
	7061
	7062	/* sanity */
	7063	if ((cmapp == NULL) \|\| (*cmapp == NULL))
	7064	return(KERN_FAILURE);
	7065	cmap = *cmapp;
	7066
	7067	switch (op_type) {
	7068	case 0:
	7069	/* emit stats into trace buffer */
	7070	vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
	7071	cmap->scm_modulus,
	7072	cmap->scm_buckets,
	7073	cmap->scm_lastclean,
	7074	cmap->scm_iskips);
	7075
	7076	vfs_drt_free_map(cmap);
	7077	*cmapp = NULL;
	7078	break;
	7079
	7080	case 1:
	7081	cmap->scm_lastclean = 0;
	7082	break;
	7083	}
	7084	return(KERN_SUCCESS);
	7085	}
	7086
	7087
	7088
	7089	/*
	7090	* Emit a summary of the state of the clustermap into the trace buffer
	7091	* along with some caller-provided data.
	7092	*/
	7093	#if KDEBUG
	7094	static void
	7095	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
	7096	{
	7097	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
	7098	}
	7099	#else
	7100	static void
	7101	vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
	7102	__unused int arg1, __unused int arg2, __unused int arg3,
	7103	__unused int arg4)
	7104	{
	7105	}
	7106	#endif
	7107
	#if 0
	/*
	 * Consistency check (compiled out): for every non-vacant hash entry,
	 * recount the bits set in its bitvector and panic if the total differs
	 * from the entry's stored summary count.
	 */
	static void
	vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
	{
		int	slot;

		for (slot = 0; slot < cmap->scm_modulus; slot++) {
			int	bit;
			int	nset;

			if (DRT_HASH_VACANT(cmap, slot))
				continue;

			/* recount the dirty bits recorded for this entry */
			nset = 0;
			for (bit = 0; bit < DRT_BITVECTOR_PAGES; bit++) {
				if (DRT_HASH_TEST_BIT(cmap, slot, bit))
					nset++;
			}

			if (nset != DRT_HASH_GET_COUNT(cmap, slot))
				panic("bits_on = %d, index = %d\n", nset, slot);
		}
	}
	#endif