/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <miscfs/specfs/specdev.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>
#include <vm/vnode_pager.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>
#include <kern/task.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_fault.h>

#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>

#include <sys/sdt.h>

#if 0
#undef KERNEL_DEBUG
#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
#endif


#define CL_READ			0x01
#define CL_WRITE		0x02
#define CL_ASYNC		0x04
#define CL_COMMIT		0x08
#define CL_PAGEOUT		0x10
#define CL_AGE			0x20
#define CL_NOZERO		0x40
#define CL_PAGEIN		0x80
#define CL_DEV_MEMORY		0x100
#define CL_PRESERVE		0x200
#define CL_THROTTLE		0x400
#define CL_KEEPCACHED		0x800
#define CL_DIRECT_IO		0x1000
#define CL_PASSIVE		0x2000
#define CL_IOSTREAMING		0x4000
#define CL_CLOSE		0x8000
#define CL_ENCRYPTED		0x10000
#define CL_RAW_ENCRYPTED	0x20000
#define CL_NOCACHE		0x40000
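
/*
 * Illustrative note (not part of the original source): callers combine
 * these flags per request.  For example, an asynchronous read whose pages
 * should be committed by cluster_iodone() and aged for early reclaim might
 * be issued roughly as
 *
 *	cluster_io(vp, upl, upl_offset, f_offset, io_size,
 *	           CL_READ | CL_ASYNC | CL_COMMIT | CL_AGE,
 *	           NULL, NULL, callback, callback_arg);
 *
 * The exact flag mix here is a sketch; the callers of cluster_io() later in
 * this file are the authoritative combinations.
 */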

#define MAX_VECTOR_UPL_ELEMENTS	8
#define MAX_VECTOR_UPL_SIZE	(2 * MAX_UPL_SIZE_BYTES)

extern upl_t vector_upl_create(vm_offset_t);
extern boolean_t vector_upl_is_valid(upl_t);
extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
extern void vector_upl_set_pagelist(upl_t);
extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);

struct clios {
	lck_mtx_t	io_mtxp;
	u_int		io_completed;	/* amount of io that has currently completed */
	u_int		io_issued;	/* amount of io that was successfully issued */
	int		io_error;	/* error code of first error encountered */
	int		io_wanted;	/* someone is sleeping waiting for a change in state */
};

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_transaction_mtxp;


#define	IO_UNKNOWN	0
#define	IO_DIRECT	1
#define IO_CONTIG	2
#define IO_COPY		3

#define	PUSH_DELAY	0x01
#define PUSH_ALL	0x02
#define	PUSH_SYNC	0x04


static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
static void cluster_wait_IO(buf_t cbp_head, int async);
static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);

static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);

static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
static int cluster_iodone(buf_t bp, void *callback_arg);
static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
static int cluster_is_throttled(vnode_t vp);

static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);

static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);

static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);

static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
			     int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
			       int (*)(buf_t, void *), void *callback_arg, int flags);

static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
			      off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
				int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
				int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);

static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
static void	cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);

static int	cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);

static int	cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg);

static void	sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
static void	sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);


/*
 * For throttled IO to check whether
 * a block is cached by the boot cache
 * and thus it can avoid delaying the IO.
 *
 * bootcache_contains_block is initially
 * NULL. The BootCache will set it while
 * the cache is active and clear it when
 * the cache is jettisoned.
 *
 * Returns 0 if the block is not
 * contained in the cache, 1 if it is
 * contained.
 *
 * The function pointer remains valid
 * after the cache has been evicted even
 * if bootcache_contains_block has been
 * cleared.
 *
 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
 */
int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;


/*
 * limit the internal I/O size so that we
 * can represent it in a 32 bit int
 */
#define MAX_IO_REQUEST_SIZE	(1024 * 1024 * 512)
#define MAX_IO_CONTIG_SIZE	MAX_UPL_SIZE_BYTES
#define MAX_VECTS		16
#define MIN_DIRECT_WRITE_SIZE	(4 * PAGE_SIZE)

#define WRITE_THROTTLE		6
#define WRITE_THROTTLE_SSD	2
#define WRITE_BEHIND		1
#define WRITE_BEHIND_SSD	1

#define PREFETCH		3
#define PREFETCH_SSD		2
uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3);	/* maximum bytes in a speculative read-ahead */
uint32_t speculative_prefetch_max_iosize = (512 * 1024);	/* maximum I/O size to use in a speculative read-ahead on SSDs */


#define IO_SCALE(vp, base)		(vp->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)		(cluster_max_io_size(vp->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)	(size * IO_SCALE(vp, ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH)))
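
/*
 * Worked example (illustrative only, assuming mnt_ioscale == 1 and that
 * cluster_max_io_size() reports 1 MB for the mount): a rotational disk gets
 *	MAX_PREFETCH(vp, MAX_CLUSTER_SIZE(vp), 0) = 1 MB * 3 = 3 MB
 * of read-ahead window, while an SSD (with ignore_is_ssd == 0) gets
 *	MAX_PREFETCH(vp, MAX_CLUSTER_SIZE(vp), 1) = 1 MB * 2 = 2 MB.
 * The actual values scale with mnt_ioscale and with the per-mount maximum
 * I/O size.
 */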

int	ignore_is_ssd = 0;
int	speculative_reads_disabled = 0;

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define THROTTLE_MAXCNT	0

uint32_t throttle_max_iosize = (128 * 1024);

#define THROTTLE_MAX_IOSIZE (throttle_max_iosize)

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
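
/*
 * Example (illustrative only): because the OID above is CTLFLAG_RW, the
 * throttled I/O ceiling can be inspected or tuned from user space, e.g.
 *
 *	sysctl debug.lowpri_throttle_max_iosize
 *	sudo sysctl -w debug.lowpri_throttle_max_iosize=262144
 *
 * Lowering it shrinks the largest single I/O a throttled stream may issue;
 * it does not affect unthrottled I/O.
 */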


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();

	cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_transaction_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_transaction_mtxp");
}


uint32_t
cluster_max_io_size(mount_t mp, int type)
{
	uint32_t	max_io_size;
	uint32_t	segcnt;
	uint32_t	maxcnt;

	switch(type) {

	case CL_READ:
		segcnt = mp->mnt_segreadcnt;
		maxcnt = mp->mnt_maxreadcnt;
		break;
	case CL_WRITE:
		segcnt = mp->mnt_segwritecnt;
		maxcnt = mp->mnt_maxwritecnt;
		break;
	default:
		segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
		maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
		break;
	}
	if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
		/*
		 * don't allow a size beyond the max UPL size we can create
		 */
		segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	}
	max_io_size = min((segcnt * PAGE_SIZE), maxcnt);

	if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
		/*
		 * don't allow a size smaller than the old fixed limit
		 */
		max_io_size = MAX_UPL_TRANSFER_BYTES;
	} else {
		/*
		 * make sure the size specified is a multiple of PAGE_SIZE
		 */
		max_io_size &= ~PAGE_MASK;
	}
	return (max_io_size);
}
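
/*
 * Worked example (illustrative, assuming a 4 KB PAGE_SIZE): a mount whose
 * driver reports mnt_segreadcnt == 64 and mnt_maxreadcnt == 512 KB yields
 *	min(64 * 4 KB, 512 KB) = 256 KB
 * for CL_READ.  If that result were smaller than MAX_UPL_TRANSFER_BYTES it
 * would be raised to that floor; otherwise it is simply truncated down to a
 * multiple of PAGE_SIZE.
 */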



#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
#define CLW_IONOCACHE		0x04
#define CLW_IOPASSIVE		0x08

/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}
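
/*
 * Usage sketch (illustrative, not part of the original source): a reader
 * that wants read-ahead tracking does roughly
 *
 *	struct cl_readahead *rap;
 *
 *	if ((rap = cluster_get_rap(vp)) != NULL) {
 *		... update rap->cl_lastr / issue read-ahead ...
 *		lck_mtx_unlock(&rap->cl_lockr);
 *	}
 *
 * A NULL return simply means another reader currently holds cl_lockr and
 * this read proceeds without read-ahead; only a non-NULL return must be
 * unlocked by the caller.
 */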


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
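
/*
 * Usage sketch (illustrative, not part of the original source): the write
 * path typically wants the context created on demand and returned locked:
 *
 *	struct cl_writebehind *wbp;
 *
 *	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
 *	... manipulate wbp->cl_number / the cached clusters ...
 *	lck_mtx_unlock(&wbp->cl_lockw);
 *
 * Passing flags of 0, as cluster_syncup() does below, only looks up an
 * existing context and may return NULL.
 */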


static void
cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
{
	struct cl_writebehind *wbp;

	if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {

		if (wbp->cl_number) {
			lck_mtx_lock(&wbp->cl_lockw);

			cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg);

			lck_mtx_unlock(&wbp->cl_lockw);
		}
	}
}


static int
cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
{
	daddr64_t blkno;
	size_t	  io_size;
	int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;

	if (bootcache_check_fn) {
		if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ, NULL))
			return(0);

		if (io_size == 0)
			return (0);

		if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno))
			return(1);
	}
	return(0);
}


static int
cluster_is_throttled(vnode_t vp)
{
	return (throttle_io_will_be_throttled(-1, vp->v_mount));
}


static void
cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
{

	lck_mtx_lock(&iostate->io_mtxp);

	while ((iostate->io_issued - iostate->io_completed) > target) {

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);

		iostate->io_wanted = 1;
		msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
			     iostate->io_issued, iostate->io_completed, target, 0, 0);
	}
	lck_mtx_unlock(&iostate->io_mtxp);
}
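
/*
 * Illustrative note (not part of the original source): 'struct clios' is the
 * per-stream accounting used by the direct and contiguous I/O paths.  An
 * issuer typically loops issuing async cluster_io() requests (each of which
 * bumps io_issued) and periodically throttles itself with something like
 *
 *	cluster_iostate_wait(&iostate, max_outstanding, "cluster_read_direct");
 *
 * which sleeps until (io_issued - io_completed) drops to the target;
 * cluster_iodone() advances io_completed and issues the matching wakeup.
 * A final cluster_iostate_wait(&iostate, 0, ...) drains the stream before
 * the caller examines iostate.io_error.
 */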


static int
cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
{
	int upl_abort_code = 0;
	int page_in  = 0;
	int page_out = 0;

	if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE))
		/*
		 * direct write of any flavor, or a direct read that wasn't aligned
		 */
		ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
	else {
		if (io_flags & B_PAGEIO) {
			if (io_flags & B_READ)
				page_in  = 1;
			else
				page_out = 1;
		}
		if (io_flags & B_CACHE)
			/*
			 * leave pages in the cache unchanged on error
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
			/*
			 * transient error... leave pages unchanged
			 */
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
		else if (page_in)
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
		else
			upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

		ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
	}
	return (upl_abort_code);
}


static int
cluster_iodone(buf_t bp, void *callback_arg)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	int	pg_offset = 0;
	int	commit_size = 0;
	int	upl_flags = 0;
	int	transaction_size = 0;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	vnode_t	vp;
	struct	clios *iostate;
	boolean_t	transaction_complete = FALSE;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
		boolean_t	need_wakeup = FALSE;

		lck_mtx_lock_spin(cl_transaction_mtxp);

		bp->b_flags |= B_TDONE;

		if (bp->b_flags & B_TWANTED) {
			CLR(bp->b_flags, B_TWANTED);
			need_wakeup = TRUE;
		}
		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
			/*
			 * all I/O requests that are part of this transaction
			 * have to complete before we can process it
			 */
			if ( !(cbp->b_flags & B_TDONE)) {

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
					     cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);

				lck_mtx_unlock(cl_transaction_mtxp);

				if (need_wakeup == TRUE)
					wakeup(bp);

				return 0;
			}
			if (cbp->b_flags & B_EOT)
				transaction_complete = TRUE;
		}
		lck_mtx_unlock(cl_transaction_mtxp);

		if (need_wakeup == TRUE)
			wakeup(bp);

		if (transaction_complete == FALSE) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     cbp_head, 0, 0, 0, 0);
			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp        = cbp_head;
	vp         = cbp->b_vp;
	upl_offset = cbp->b_uploffset;
	upl        = cbp->b_upl;
	b_flags    = cbp->b_flags;
	real_bp    = cbp->b_real_bp;
	zero_offset= cbp->b_validend;
	iostate    = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		if (cbp_next == NULL)
			/*
			 * compute the overall size of the transaction
			 * in case we created one that has 'holes' in it
			 * 'total_size' represents the amount of I/O we
			 * did, not the span of the transaction w/r to the UPL
			 */
			transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;

		if (cbp != cbp_head)
			free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (error == 0) {
		int	(*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);

		if (cliodone_func != NULL) {
			cbp_head->b_bcount = transaction_size;

			error = (*cliodone_func)(cbp_head, callback_arg);
		}
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	free_io_buf(cbp_head);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock_spin(&iostate->io_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(&iostate->io_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}

	if (b_flags & B_COMMIT_UPL) {

		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error)
			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
		else {
			upl_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
		}
	}
	if (real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
		     upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);

	return (error);
}


uint32_t
cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
{
	if (cluster_is_throttled(vp)) {
		*limit = THROTTLE_MAX_IOSIZE;
		return 1;
	}
	return 0;
}
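
/*
 * Usage sketch (illustrative, not part of the original source): filesystems
 * can ask the cluster layer whether I/O to this vnode will be throttled and,
 * if so, how large each chunk should be:
 *
 *	uint32_t io_limit;
 *
 *	if (cluster_throttle_io_limit(vp, &io_limit))
 *		... split the request into pieces of at most io_limit bytes ...
 *
 * 'io_limit' is only written when the function returns 1.
 */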


void
cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
{

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {
		upl_page_info_t	*pl;
		addr64_t	zero_addr;

		pl = ubc_upl_pageinfo(upl);

		if (upl_device_page(pl) == TRUE) {
			zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;

			bzero_phys_nc(zero_addr, size);
		} else {
			while (size) {
				int	page_offset;
				int	page_index;
				int	zero_cnt;

				page_index  = upl_offset / PAGE_SIZE;
				page_offset = upl_offset & PAGE_MASK;

				zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
				zero_cnt  = min(PAGE_SIZE - page_offset, size);

				bzero_phys(zero_addr, zero_cnt);

				size       -= zero_cnt;
				upl_offset += zero_cnt;
			}
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
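
/*
 * Illustrative note (not part of the original source): cluster_zero() is how
 * the read path clears the portion of the last page that lies beyond EOF.
 * For example, with a 4 KB page and a file ending 100 bytes into that page,
 * cluster_iodone() calls
 *
 *	cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
 *
 * so the remaining 3996 bytes of the page read back as zeros rather than
 * stale data.
 */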


static void
cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
{
	cbp_head->b_validend = zero_offset;
	cbp_tail->b_flags |= B_EOT;
}

static void
cluster_wait_IO(buf_t cbp_head, int async)
{
	buf_t	cbp;

	if (async) {
		/*
		 * async callback completion will not normally
		 * generate a wakeup upon I/O completion...
		 * by setting B_TWANTED, we will force a wakeup
		 * to occur as any outstanding I/Os complete...
		 * I/Os already completed will have B_TDONE already
		 * set, so they won't cause us to block
		 * note that we're actually waiting for the bp to have
		 * completed the callback function... only then
		 * can we safely take back ownership of the bp
		 */
		lck_mtx_lock_spin(cl_transaction_mtxp);

		for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
			cbp->b_flags |= B_TWANTED;

		lck_mtx_unlock(cl_transaction_mtxp);
	}
	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {

		if (async) {
			while (!ISSET(cbp->b_flags, B_TDONE)) {

				lck_mtx_lock_spin(cl_transaction_mtxp);

				if (!ISSET(cbp->b_flags, B_TDONE)) {
					DTRACE_IO1(wait__start, buf_t, cbp);
					(void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL);
					DTRACE_IO1(wait__done, buf_t, cbp);
				} else
					lck_mtx_unlock(cl_transaction_mtxp);
			}
		} else
			buf_biowait(cbp);
	}
}

static void
cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
{
	buf_t	cbp;
	int	error;
	boolean_t isswapout = FALSE;

	/*
	 * cluster_complete_transaction will
	 * only be called if we've issued a complete chain in synchronous mode
	 * or, we've already done a cluster_wait_IO on an incomplete chain
	 */
	if (needwait) {
		for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
			buf_biowait(cbp);
	}
	/*
	 * we've already waited on all of the I/Os in this transaction,
	 * so mark all of the buf_t's in this transaction as B_TDONE
	 * so that cluster_iodone sees the transaction as completed
	 */
	for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next)
		cbp->b_flags |= B_TDONE;
	cbp = *cbp_head;

	if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp))
		isswapout = TRUE;

	error = cluster_iodone(cbp, callback_arg);

	if ( !(flags & CL_ASYNC) && error && *retval == 0) {
		if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO))
			*retval = error;
		else if (isswapout == TRUE)
			*retval = error;
	}
	*cbp_head = (buf_t)NULL;
}
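
/*
 * Illustrative summary of the transaction lifecycle (not part of the
 * original source): cluster_io() below builds a chain of buf_t's linked
 * through b_trans_next, all pointing back at the head via b_trans_head.
 * cluster_EOT() marks the tail (and records any zero_offset on the head),
 * cluster_wait_IO() waits for every buf in the chain to finish, and
 * cluster_complete_transaction() marks the chain B_TDONE and drives
 * cluster_iodone() so the UPL is committed or aborted exactly once for the
 * whole transaction.
 */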


static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	int	max_trans_count;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;
	vm_offset_t	upl_end_offset;
	boolean_t	need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE)
		panic("%s(): Called with real buffer of size %d bytes which "
		      "is greater than the maximum allowed size of "
		      "%d bytes (the system PAGE_SIZE).\n",
		      __FUNCTION__, non_rounded_size, PAGE_SIZE);

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY)
		max_trans_count = 16;

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize  &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize)
		max_iosize = PAGE_SIZE;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE)
				max_iosize = THROTTLE_MAX_IOSIZE;
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ( (flags & CL_DEV_MEMORY) )
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				max_cluster_size = MAX_CLUSTER_SIZE(vp);

				if (max_iosize > max_cluster_size)
					max_cluster = max_cluster_size;
				else
					max_cluster = max_iosize;

				if (size < max_cluster)
					max_cluster = size;

				if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
					scale = WRITE_THROTTLE_SSD;
				else
					scale = WRITE_THROTTLE;

				if (flags & CL_CLOSE)
					scale += MAX_CLUSTERS;

				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & (CL_IOSTREAMING))
		io_flags |= B_IOSTREAMING;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_DIRECT_IO)
		io_flags |= B_PHYS;
	if (flags & (CL_PRESERVE | CL_KEEPCACHED))
		io_flags |= B_CACHE;
	if (flags & CL_PASSIVE)
		io_flags |= B_PASSIVE;
	if (flags & CL_ENCRYPTED)
		io_flags |= B_ENCRYPTED_IO;
	if (vp->v_flag & VSYSTEM)
		io_flags |= B_META;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int	io_size_wanted;
		size_t	io_size_tmp;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL)))
			break;

		if (io_size_tmp > io_size_wanted)
			io_size = io_size_wanted;
		else
			io_size = (u_int)io_size_tmp;

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)(blkno>>32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t	e_offset;
			int	pageout_flags;

			if (upl_get_internal_vectorupl(upl))
				panic("Vector UPLs should not take this code-path\n");
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if ( !(flags & CL_ASYNC))
				pageout_flags |= UPL_IOSYNC;
			if ( !(flags & CL_COMMIT))
				pageout_flags |= UPL_NOCOMMIT;

			if (cbp_head) {
				buf_t last_cbp;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released just because all of the current
				 * I/O linked to this transaction has completed...
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * we've got a transaction that
				 * includes the page we're about to push out through vnode_pageout...
				 * find the last bp in the list which will be the one that
				 * includes the head of this page and round its iosize down
				 * to a page boundary...
				 */
				for (last_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next)
					last_cbp = cbp;

				cbp->b_bcount &= ~PAGE_MASK;

				if (cbp->b_bcount == 0) {
					/*
					 * this buf no longer has any I/O associated with it
					 */
					free_io_buf(cbp);

					if (cbp == cbp_head) {
						/*
						 * the buf we just freed was the only buf in
						 * this transaction... so there's no I/O to do
						 */
						cbp_head = NULL;
					} else {
						/*
						 * remove the buf we just freed from
						 * the transaction list
						 */
						last_cbp->b_trans_next = NULL;
						cbp_tail = last_cbp;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0)
					flags &= ~CL_COMMIT;
				break;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / 0x1000);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t  commit_offset;
			int bytes_to_zero;
			int complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO))
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int	pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size)
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					else
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size)
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				else
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;

				commit_offset = upl_offset & ~PAGE_MASK;
			}
1333 | if ( (flags & CL_COMMIT) && pg_count) { | |
1334 | ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE, | |
1335 | UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY); | |
1336 | } | |
1337 | upl_offset += io_size; | |
1338 | f_offset += io_size; | |
1339 | size -= io_size; | |
1340 | ||
1341 | /* | |
1342 | * keep track of how much of the original request | |
1343 | * that we've actually completed... non_rounded_size | |
1344 | * may go negative due to us rounding the request | |
1345 | * to a page size multiple (i.e. size > non_rounded_size) | |
1346 | */ | |
1347 | non_rounded_size -= io_size; | |
1348 | ||
1349 | if (non_rounded_size <= 0) { | |
1350 | /* | |
1351 | * we've transferred all of the data in the original | |
1352 | * request, but we were unable to complete the tail | |
1353 | * of the last page because the file didn't have | |
1354 | * an allocation to back that portion... this is ok. | |
1355 | */ | |
1356 | size = 0; | |
1357 | } | |
1358 | if (cbp_head && (complete_transaction_now || size == 0)) { | |
1359 | cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); | |
1360 | ||
1361 | cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0); | |
1362 | ||
1363 | cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0); | |
1364 | ||
1365 | trans_count = 0; | |
1366 | } | |
1367 | continue; | |
1368 | } | |
1369 | if (pg_count > max_vectors) { | |
1370 | if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) { | |
1371 | io_size = PAGE_SIZE - pg_offset; | |
1372 | pg_count = 1; | |
1373 | } else { | |
1374 | io_size -= (pg_count - max_vectors) * PAGE_SIZE; | |
1375 | pg_count = max_vectors; | |
1376 | } | |
1377 | } | |
1378 | /* | |
1379 | * If the transaction is going to reach the maximum number of | |
1380 | * desired elements, truncate the i/o to the nearest page so | |
1381 | * that the actual i/o is initiated after this buffer is | |
1382 | * created and added to the i/o chain. | |
1383 | * | |
1384 | * I/O directed to physically contiguous memory | |
1385 | * doesn't have a requirement to make sure we 'fill' a page | |
1386 | */ | |
1387 | if ( !(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count && | |
1388 | ((upl_offset + io_size) & PAGE_MASK)) { | |
1389 | vm_offset_t aligned_ofs; | |
1390 | ||
1391 | aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK; | |
1392 | /* | |
1393 | * If the io_size does not actually finish off even a | |
1394 | * single page we have to keep adding buffers to the | |
1395 | * transaction despite having reached the desired limit. | |
1396 | * | |
1397 | * Eventually we get here with the page being finished | |
1398 | * off (and exceeded) and then we truncate the size of | |
1399 | * this i/o request so that it is page aligned so that | |
1400 | * we can finally issue the i/o on the transaction. | |
1401 | */ | |
1402 | if (aligned_ofs > upl_offset) { | |
1403 | io_size = aligned_ofs - upl_offset; | |
1404 | pg_count--; | |
1405 | } | |
1406 | } | |
1407 | ||
1408 | if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) | |
1409 | /* | |
1410 | * if we're not targeting a virtual device i.e. a disk image | |
1411 | * it's safe to dip into the reserve pool since real devices | |
1412 | * can complete this I/O request without requiring additional | |
1413 | * bufs from the alloc_io_buf pool | |
1414 | */ | |
1415 | priv = 1; | |
1416 | else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) | |
1417 | /* | |
1418 | * Throttle the speculative IO | |
1419 | */ | |
1420 | priv = 0; | |
1421 | else | |
1422 | priv = 1; | |
1423 | ||
1424 | cbp = alloc_io_buf(vp, priv); | |
1425 | ||
1426 | if (flags & CL_PAGEOUT) { | |
1427 | u_int i; | |
1428 | ||
1429 | for (i = 0; i < pg_count; i++) { | |
1430 | if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) | |
1431 | panic("BUSY bp found in cluster_io"); | |
1432 | } | |
1433 | } | |
1434 | if (flags & CL_ASYNC) { | |
1435 | if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) | |
1436 | panic("buf_setcallback failed\n"); | |
1437 | } | |
1438 | cbp->b_cliodone = (void *)callback; | |
1439 | cbp->b_flags |= io_flags; | |
1440 | if (flags & CL_NOCACHE) | |
1441 | cbp->b_attr.ba_flags |= BA_NOCACHE; | |
1442 | ||
1443 | cbp->b_lblkno = lblkno; | |
1444 | cbp->b_blkno = blkno; | |
1445 | cbp->b_bcount = io_size; | |
1446 | ||
1447 | if (buf_setupl(cbp, upl, upl_offset)) | |
1448 | panic("buf_setupl failed\n"); | |
1449 | #if CONFIG_IOSCHED | |
1450 | upl_set_blkno(upl, upl_offset, io_size, blkno); | |
1451 | #endif | |
1452 | cbp->b_trans_next = (buf_t)NULL; | |
1453 | ||
1454 | if ((cbp->b_iostate = (void *)iostate)) | |
1455 | /* | |
1456 | * caller wants to track the state of this | |
1457 | * io... bump the amount issued against this stream | |
1458 | */ | |
1459 | iostate->io_issued += io_size; | |
1460 | ||
1461 | if (flags & CL_READ) { | |
1462 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, | |
1463 | (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); | |
1464 | } | |
1465 | else { | |
1466 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE, | |
1467 | (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0); | |
1468 | } | |
1469 | ||
1470 | if (cbp_head) { | |
1471 | cbp_tail->b_trans_next = cbp; | |
1472 | cbp_tail = cbp; | |
1473 | } else { | |
1474 | cbp_head = cbp; | |
1475 | cbp_tail = cbp; | |
1476 | ||
1477 | if ( (cbp_head->b_real_bp = real_bp) ) | |
1478 | real_bp = (buf_t)NULL; | |
1479 | } | |
1480 | *(buf_t *)(&cbp->b_trans_head) = cbp_head; | |
1481 | ||
1482 | trans_count++; | |
1483 | ||
1484 | upl_offset += io_size; | |
1485 | f_offset += io_size; | |
1486 | size -= io_size; | |
1487 | /* | |
1488 | * keep track of how much of the original request | |
1489 | * that we've actually completed... non_rounded_size | |
1490 | * may go negative due to us rounding the request | |
1491 | * to a page size multiple (i.e. size > non_rounded_size) | |
1492 | */ | |
1493 | non_rounded_size -= io_size; | |
1494 | ||
1495 | if (non_rounded_size <= 0) { | |
1496 | /* | |
1497 | * we've transferred all of the data in the original | |
1498 | * request, but we were unable to complete the tail | |
1499 | * of the last page because the file didn't have | |
1500 | * an allocation to back that portion... this is ok. | |
1501 | */ | |
1502 | size = 0; | |
1503 | } | |
1504 | if (size == 0) { | |
1505 | /* | |
1506 | * we have no more I/O to issue, so go | |
1507 | * finish the final transaction | |
1508 | */ | |
1509 | need_EOT = TRUE; | |
1510 | } else if ( ((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) && | |
1511 | ((flags & CL_ASYNC) || trans_count > max_trans_count) ) { | |
1512 | /* | |
1513 | * I/O directed to physically contiguous memory... | |
1514 | * which doesn't have a requirement to make sure we 'fill' a page | |
1515 | * or... | |
1516 | * the current I/O we've prepared fully | |
1517 | * completes the last page in this request | |
1518 | * and ... | |
1519 | * it's either an ASYNC request or | |
1520 | * we've already accumulated more than max_trans_count I/O's into | |
1521 | * this transaction... in either case, mark it as complete so that | |
1522 | * it can finish asynchronously or via the cluster_complete_transaction | |
1523 | * below if the request is synchronous | |
1524 | */ | |
1525 | need_EOT = TRUE; | |
1526 | } | |
1527 | if (need_EOT == TRUE) | |
1528 | cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0); | |
1529 | ||
1530 | if (flags & CL_THROTTLE) | |
1531 | (void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io"); | |
1532 | ||
1533 | if ( !(io_flags & B_READ)) | |
1534 | vnode_startwrite(vp); | |
1535 | ||
1536 | if (flags & CL_RAW_ENCRYPTED) { | |
1537 | /* | |
1538 | * User requested raw encrypted bytes. | |
1539 | * Twiddle the bit in the ba_flags for the buffer | |
1540 | */ | |
1541 | cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO; | |
1542 | } | |
1543 | ||
1544 | (void) VNOP_STRATEGY(cbp); | |
1545 | ||
1546 | if (need_EOT == TRUE) { | |
1547 | if ( !(flags & CL_ASYNC)) | |
1548 | cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1); | |
1549 | ||
1550 | need_EOT = FALSE; | |
1551 | trans_count = 0; | |
1552 | cbp_head = NULL; | |
1553 | } | |
1554 | } | |
1555 | if (error) { | |
1556 | int abort_size; | |
1557 | ||
1558 | io_size = 0; | |
1559 | ||
1560 | if (cbp_head) { | |
1561 | /* | |
1562 | * first wait until all of the outstanding I/O | |
1563 | * for this partial transaction has completed | |
1564 | */ | |
1565 | cluster_wait_IO(cbp_head, (flags & CL_ASYNC)); | |
1566 | ||
1567 | /* | |
1568 | * Rewind the upl offset to the beginning of the | |
1569 | * transaction. | |
1570 | */ | |
1571 | upl_offset = cbp_head->b_uploffset; | |
1572 | ||
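| /* | |
|  * walk the partially built transaction, reclaiming each buffer | |
|  * and summing the byte counts that were never issued so the | |
|  * covering upl range can be aborted below | |
|  */ | |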
1573 | for (cbp = cbp_head; cbp;) { | |
1574 | buf_t cbp_next; | |
1575 | ||
1576 | size += cbp->b_bcount; | |
1577 | io_size += cbp->b_bcount; | |
1578 | ||
1579 | cbp_next = cbp->b_trans_next; | |
1580 | free_io_buf(cbp); | |
1581 | cbp = cbp_next; | |
1582 | } | |
1583 | } | |
1584 | if (iostate) { | |
1585 | int need_wakeup = 0; | |
1586 | ||
1587 | /* | |
1588 | * update the error condition for this stream... | |
1589 | * since we never really issued the I/O, | |
1590 | * just go ahead and adjust the issued count back down | |
1591 | */ | |
1592 | lck_mtx_lock_spin(&iostate->io_mtxp); | |
1593 | ||
1594 | if (iostate->io_error == 0) | |
1595 | iostate->io_error = error; | |
1596 | iostate->io_issued -= io_size; | |
1597 | ||
1598 | if (iostate->io_wanted) { | |
1599 | /* | |
1600 | * someone is waiting for the state of | |
1601 | * this io stream to change | |
1602 | */ | |
1603 | iostate->io_wanted = 0; | |
1604 | need_wakeup = 1; | |
1605 | } | |
1606 | lck_mtx_unlock(&iostate->io_mtxp); | |
1607 | ||
1608 | if (need_wakeup) | |
1609 | wakeup((caddr_t)&iostate->io_wanted); | |
1610 | } | |
1611 | if (flags & CL_COMMIT) { | |
1612 | int upl_flags; | |
1613 | ||
1614 | pg_offset = upl_offset & PAGE_MASK; | |
1615 | abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK; | |
1616 | ||
1617 | upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp); | |
1618 | ||
1619 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE, | |
1620 | upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0); | |
1621 | } | |
1622 | if (retval == 0) | |
1623 | retval = error; | |
1624 | } else if (cbp_head) | |
1625 | panic("%s(): cbp_head is not NULL.\n", __FUNCTION__); | |
1626 | ||
1627 | if (real_bp) { | |
1628 | /* | |
1629 | * can get here if we either encountered an error | |
1630 | * or we completely zero-filled the request and | |
1631 | * no I/O was issued | |
1632 | */ | |
1633 | if (error) { | |
1634 | real_bp->b_flags |= B_ERROR; | |
1635 | real_bp->b_error = error; | |
1636 | } | |
1637 | buf_biodone(real_bp); | |
1638 | } | |
1639 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0); | |
1640 | ||
1641 | return (retval); | |
1642 | } | |
1643 | ||
1644 | #define reset_vector_run_state() \ | |
1645 | issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0; | |
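| /* | |
|  * reset_vector_run_state() zeroes the bookkeeping for the current run | |
|  * of sub-UPLs (the issue flag, the starting offset, the element count, | |
|  * and the accumulated io and upl sizes) once a vector UPL has been | |
|  * handed off to vector_cluster_io | |
|  */ | |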
1646 | ||
1647 | static int | |
1648 | vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize, | |
1649 | int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg) | |
1650 | { | |
1651 | vector_upl_set_pagelist(vector_upl); | |
1652 | ||
1653 | if(io_flag & CL_READ) { | |
1654 | if(vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK)==0)) | |
1655 | io_flag &= ~CL_PRESERVE; /*don't zero fill*/ | |
1656 | else | |
1657 | io_flag |= CL_PRESERVE; /*zero fill*/ | |
1658 | } | |
1659 | return (cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg)); | |
1660 | ||
1661 | } | |
1662 | ||
1663 | static int | |
1664 | cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag) | |
1665 | { | |
1666 | int pages_in_prefetch; | |
1667 | ||
1668 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START, | |
1669 | (int)f_offset, size, (int)filesize, 0, 0); | |
1670 | ||
1671 | if (f_offset >= filesize) { | |
1672 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, | |
1673 | (int)f_offset, 0, 0, 0, 0); | |
1674 | return(0); | |
1675 | } | |
1676 | if ((off_t)size > (filesize - f_offset)) | |
1677 | size = filesize - f_offset; | |
1678 | pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE; | |
1679 | ||
1680 | advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag); | |
1681 | ||
1682 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END, | |
1683 | (int)f_offset + size, pages_in_prefetch, 0, 1, 0); | |
1684 | ||
1685 | return (pages_in_prefetch); | |
1686 | } | |
1687 | ||
1688 | ||
1689 | ||
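| /* | |
|  * cluster_read_ahead drives speculative read-ahead for sequential reads: | |
|  * cl_lastr remembers the last read, cl_ralen is the current read-ahead | |
|  * window (roughly doubled on each sequential hit and capped by | |
|  * max_prefetch), and cl_maxra records the last page already prefetched | |
|  * so we don't re-issue I/O for pages that are still on their way in | |
|  */ | |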
1690 | static void | |
1691 | cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg, | |
1692 | int bflag) | |
1693 | { | |
1694 | daddr64_t r_addr; | |
1695 | off_t f_offset; | |
1696 | int size_of_prefetch; | |
1697 | u_int max_prefetch; | |
1698 | ||
1699 | ||
1700 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START, | |
1701 | (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0); | |
1702 | ||
1703 | if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) { | |
1704 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, | |
1705 | rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0); | |
1706 | return; | |
1707 | } | |
1708 | if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) { | |
1709 | rap->cl_ralen = 0; | |
1710 | rap->cl_maxra = 0; | |
1711 | ||
1712 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, | |
1713 | rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0); | |
1714 | ||
1715 | return; | |
1716 | } | |
1717 | max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); | |
1718 | ||
1719 | if (max_prefetch > speculative_prefetch_max) | |
1720 | max_prefetch = speculative_prefetch_max; | |
1721 | ||
1722 | if (max_prefetch <= PAGE_SIZE) { | |
1723 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, | |
1724 | rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); | |
1725 | return; | |
1726 | } | |
1727 | if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) { | |
1728 | if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) { | |
1729 | ||
1730 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, | |
1731 | rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0); | |
1732 | return; | |
1733 | } | |
1734 | } | |
1735 | r_addr = max(extent->e_addr, rap->cl_maxra) + 1; | |
1736 | f_offset = (off_t)(r_addr * PAGE_SIZE_64); | |
1737 | ||
1738 | size_of_prefetch = 0; | |
1739 | ||
1740 | ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch); | |
1741 | ||
1742 | if (size_of_prefetch) { | |
1743 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, | |
1744 | rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0); | |
1745 | return; | |
1746 | } | |
1747 | if (f_offset < filesize) { | |
1748 | daddr64_t read_size; | |
1749 | ||
1750 | rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1; | |
1751 | ||
1752 | read_size = (extent->e_addr + 1) - extent->b_addr; | |
1753 | ||
1754 | if (read_size > rap->cl_ralen) { | |
1755 | if (read_size > max_prefetch / PAGE_SIZE) | |
1756 | rap->cl_ralen = max_prefetch / PAGE_SIZE; | |
1757 | else | |
1758 | rap->cl_ralen = read_size; | |
1759 | } | |
1760 | size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag); | |
1761 | ||
1762 | if (size_of_prefetch) | |
1763 | rap->cl_maxra = (r_addr + size_of_prefetch) - 1; | |
1764 | } | |
1765 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, | |
1766 | rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0); | |
1767 | } | |
1768 | ||
1769 | ||
1770 | int | |
1771 | cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, | |
1772 | int size, off_t filesize, int flags) | |
1773 | { | |
1774 | return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); | |
1775 | ||
1776 | } | |
1777 | ||
1778 | ||
1779 | int | |
1780 | cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, | |
1781 | int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
1782 | { | |
1783 | int io_size; | |
1784 | int rounded_size; | |
1785 | off_t max_size; | |
1786 | int local_flags; | |
1787 | ||
1788 | local_flags = CL_PAGEOUT | CL_THROTTLE; | |
1789 | ||
1790 | if ((flags & UPL_IOSYNC) == 0) | |
1791 | local_flags |= CL_ASYNC; | |
1792 | if ((flags & UPL_NOCOMMIT) == 0) | |
1793 | local_flags |= CL_COMMIT; | |
1794 | if ((flags & UPL_KEEPCACHED)) | |
1795 | local_flags |= CL_KEEPCACHED; | |
1796 | if (flags & UPL_PAGING_ENCRYPTED) | |
1797 | local_flags |= CL_ENCRYPTED; | |
1798 | ||
1799 | ||
1800 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, | |
1801 | (int)f_offset, size, (int)filesize, local_flags, 0); | |
1802 | ||
1803 | /* | |
1804 | * If they didn't specify any I/O, then we are done... | |
1805 | * we can't issue an abort because we don't know how | |
1806 | * big the upl really is | |
1807 | */ | |
1808 | if (size <= 0) | |
1809 | return (EINVAL); | |
1810 | ||
1811 | if (vp->v_mount->mnt_flag & MNT_RDONLY) { | |
1812 | if (local_flags & CL_COMMIT) | |
1813 | ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY); | |
1814 | return (EROFS); | |
1815 | } | |
1816 | /* | |
1817 | * can't page-out from a negative offset | |
1818 | * or if we're starting beyond the EOF | |
1819 | * or if the file offset isn't page aligned | |
1820 | * or the size requested isn't a multiple of PAGE_SIZE | |
1821 | */ | |
1822 | if (f_offset < 0 || f_offset >= filesize || | |
1823 | (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) { | |
1824 | if (local_flags & CL_COMMIT) | |
1825 | ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY); | |
1826 | return (EINVAL); | |
1827 | } | |
1828 | max_size = filesize - f_offset; | |
1829 | ||
1830 | if (size < max_size) | |
1831 | io_size = size; | |
1832 | else | |
1833 | io_size = max_size; | |
1834 | ||
1835 | rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; | |
1836 | ||
1837 | if (size > rounded_size) { | |
1838 | if (local_flags & CL_COMMIT) | |
1839 | ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size, | |
1840 | UPL_ABORT_FREE_ON_EMPTY); | |
1841 | } | |
1842 | return (cluster_io(vp, upl, upl_offset, f_offset, io_size, | |
1843 | local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg)); | |
1844 | } | |
1845 | ||
1846 | ||
1847 | int | |
1848 | cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, | |
1849 | int size, off_t filesize, int flags) | |
1850 | { | |
1851 | return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL); | |
1852 | } | |
1853 | ||
1854 | ||
1855 | int | |
1856 | cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset, | |
1857 | int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
1858 | { | |
1859 | u_int io_size; | |
1860 | int rounded_size; | |
1861 | off_t max_size; | |
1862 | int retval; | |
1863 | int local_flags = 0; | |
1864 | ||
1865 | if (upl == NULL || size < 0) | |
1866 | panic("cluster_pagein: NULL upl passed in"); | |
1867 | ||
1868 | if ((flags & UPL_IOSYNC) == 0) | |
1869 | local_flags |= CL_ASYNC; | |
1870 | if ((flags & UPL_NOCOMMIT) == 0) | |
1871 | local_flags |= CL_COMMIT; | |
1872 | if (flags & UPL_IOSTREAMING) | |
1873 | local_flags |= CL_IOSTREAMING; | |
1874 | if (flags & UPL_PAGING_ENCRYPTED) | |
1875 | local_flags |= CL_ENCRYPTED; | |
1876 | ||
1877 | ||
1878 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, | |
1879 | (int)f_offset, size, (int)filesize, local_flags, 0); | |
1880 | ||
1881 | /* | |
1882 | * can't page-in from a negative offset | |
1883 | * or if we're starting beyond the EOF | |
1884 | * or if the file offset isn't page aligned | |
1885 | * or the size requested isn't a multiple of PAGE_SIZE | |
1886 | */ | |
1887 | if (f_offset < 0 || f_offset >= filesize || | |
1888 | (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) { | |
1889 | if (local_flags & CL_COMMIT) | |
1890 | ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); | |
1891 | return (EINVAL); | |
1892 | } | |
1893 | max_size = filesize - f_offset; | |
1894 | ||
1895 | if (size < max_size) | |
1896 | io_size = size; | |
1897 | else | |
1898 | io_size = max_size; | |
1899 | ||
1900 | rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; | |
1901 | ||
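| /* | |
|  * for illustration (hypothetical numbers): with 4K pages, an io_size | |
|  * of 6000 bytes rounds up to rounded_size == 8192... if the caller's | |
|  * 'size' covers more pages than that, the excess beyond rounded_size | |
|  * is aborted below rather than read | |
|  */ | |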
1902 | if (size > rounded_size && (local_flags & CL_COMMIT)) | |
1903 | ubc_upl_abort_range(upl, upl_offset + rounded_size, | |
1904 | size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); | |
1905 | ||
1906 | retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, | |
1907 | local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
1908 | ||
1909 | return (retval); | |
1910 | } | |
1911 | ||
1912 | ||
1913 | int | |
1914 | cluster_bp(buf_t bp) | |
1915 | { | |
1916 | return cluster_bp_ext(bp, NULL, NULL); | |
1917 | } | |
1918 | ||
1919 | ||
1920 | int | |
1921 | cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg) | |
1922 | { | |
1923 | off_t f_offset; | |
1924 | int flags; | |
1925 | ||
1926 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START, | |
1927 | bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); | |
1928 | ||
1929 | if (bp->b_flags & B_READ) | |
1930 | flags = CL_ASYNC | CL_READ; | |
1931 | else | |
1932 | flags = CL_ASYNC; | |
1933 | if (bp->b_flags & B_PASSIVE) | |
1934 | flags |= CL_PASSIVE; | |
1935 | ||
1936 | f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); | |
1937 | ||
1938 | return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg)); | |
1939 | } | |
1940 | ||
1941 | ||
1942 | ||
1943 | int | |
1944 | cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags) | |
1945 | { | |
1946 | return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL); | |
1947 | } | |
1948 | ||
1949 | ||
1950 | int | |
1951 | cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, | |
1952 | int xflags, int (*callback)(buf_t, void *), void *callback_arg) | |
1953 | { | |
1954 | user_ssize_t cur_resid; | |
1955 | int retval = 0; | |
1956 | int flags; | |
1957 | int zflags; | |
1958 | int bflag; | |
1959 | int write_type = IO_COPY; | |
1960 | u_int32_t write_length; | |
1961 | ||
1962 | flags = xflags; | |
1963 | ||
1964 | if (flags & IO_PASSIVE) | |
1965 | bflag = CL_PASSIVE; | |
1966 | else | |
1967 | bflag = 0; | |
1968 | ||
1969 | if (vp->v_flag & VNOCACHE_DATA){ | |
1970 | flags |= IO_NOCACHE; | |
1971 | bflag |= CL_NOCACHE; | |
1972 | } | |
1973 | if (uio == NULL) { | |
1974 | /* | |
1975 | * no user data... | |
1976 | * this call is being made to zero-fill some range in the file | |
1977 | */ | |
1978 | retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg); | |
1979 | ||
1980 | return(retval); | |
1981 | } | |
1982 | /* | |
1983 | * do a write through the cache if one of the following is true.... | |
1984 | * NOCACHE is not true or NODIRECT is true | |
1985 | * the uio request doesn't target USERSPACE | |
1986 | * otherwise, find out if we want the direct or contig variant for | |
1987 | * the first vector in the uio request | |
1988 | */ | |
1989 | if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) | |
1990 | retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); | |
1991 | ||
1992 | if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) | |
1993 | /* | |
1994 | * must go through the cached variant in this case | |
1995 | */ | |
1996 | write_type = IO_COPY; | |
1997 | ||
1998 | while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) { | |
1999 | ||
2000 | switch (write_type) { | |
2001 | ||
2002 | case IO_COPY: | |
2003 | /* | |
2004 | * make sure the uio_resid isn't too big... | |
2005 | * internally, we want to handle all of the I/O in | |
2006 | * chunk sizes that fit in a 32 bit int | |
2007 | */ | |
2008 | if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) { | |
2009 | /* | |
2010 | * we're going to have to call cluster_write_copy | |
2011 | * more than once... | |
2012 | * | |
2013 | * only want the last call to cluster_write_copy to | |
2014 | * have the IO_TAILZEROFILL flag set and only the | |
2015 | * first call should have IO_HEADZEROFILL | |
2016 | */ | |
2017 | zflags = flags & ~IO_TAILZEROFILL; | |
2018 | flags &= ~IO_HEADZEROFILL; | |
2019 | ||
2020 | write_length = MAX_IO_REQUEST_SIZE; | |
2021 | } else { | |
2022 | /* | |
2023 | * last call to cluster_write_copy | |
2024 | */ | |
2025 | zflags = flags; | |
2026 | ||
2027 | write_length = (u_int32_t)cur_resid; | |
2028 | } | |
2029 | retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg); | |
2030 | break; | |
2031 | ||
2032 | case IO_CONTIG: | |
2033 | zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL); | |
2034 | ||
2035 | if (flags & IO_HEADZEROFILL) { | |
2036 | /* | |
2037 | * only do this once per request | |
2038 | */ | |
2039 | flags &= ~IO_HEADZEROFILL; | |
2040 | ||
2041 | retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset, | |
2042 | headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); | |
2043 | if (retval) | |
2044 | break; | |
2045 | } | |
2046 | retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag); | |
2047 | ||
2048 | if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) { | |
2049 | /* | |
2050 | * we're done with the data from the user-specified buffer(s) | |
2051 | * and we've been requested to zero fill at the tail... | |
2052 | * treat this as an IO_HEADZEROFILL (which doesn't require a uio) | |
2053 | * by rearranging the args and passing in IO_HEADZEROFILL | |
2054 | */ | |
2055 | retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset, | |
2056 | (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg); | |
2057 | } | |
2058 | break; | |
2059 | ||
2060 | case IO_DIRECT: | |
2061 | /* | |
2062 | * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL | |
2063 | */ | |
2064 | retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg); | |
2065 | break; | |
2066 | ||
2067 | case IO_UNKNOWN: | |
2068 | retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); | |
2069 | break; | |
2070 | } | |
2071 | /* | |
2072 | * in case we end up calling cluster_write_copy (from cluster_write_direct) | |
2073 | * multiple times to service a multi-vector request that is not aligned properly | |
2074 | * we need to update the oldEOF so that we | |
2075 | * don't zero-fill the head of a page if we've successfully written | |
2076 | * data to that area... 'cluster_write_copy' will zero-fill the head of a | |
2077 | * page that is beyond the oldEOF if the write is unaligned... we only | |
2078 | * want that to happen for the very first page of the cluster_write, | |
2079 | * NOT the first page of each vector making up a multi-vector write. | |
2080 | */ | |
2081 | if (uio->uio_offset > oldEOF) | |
2082 | oldEOF = uio->uio_offset; | |
2083 | } | |
2084 | return (retval); | |
2085 | } | |
2086 | ||
2087 | ||
2088 | static int | |
2089 | cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length, | |
2090 | int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
2091 | { | |
2092 | upl_t upl; | |
2093 | upl_page_info_t *pl; | |
2094 | vm_offset_t upl_offset; | |
2095 | vm_offset_t vector_upl_offset = 0; | |
2096 | u_int32_t io_req_size; | |
2097 | u_int32_t offset_in_file; | |
2098 | u_int32_t offset_in_iovbase; | |
2099 | u_int32_t io_size; | |
2100 | int io_flag = 0; | |
2101 | upl_size_t upl_size, vector_upl_size = 0; | |
2102 | vm_size_t upl_needed_size; | |
2103 | mach_msg_type_number_t pages_in_pl; | |
2104 | int upl_flags; | |
2105 | kern_return_t kret; | |
2106 | mach_msg_type_number_t i; | |
2107 | int force_data_sync; | |
2108 | int retval = 0; | |
2109 | int first_IO = 1; | |
2110 | struct clios iostate; | |
2111 | user_addr_t iov_base; | |
2112 | u_int32_t mem_alignment_mask; | |
2113 | u_int32_t devblocksize; | |
2114 | u_int32_t max_io_size; | |
2115 | u_int32_t max_upl_size; | |
2116 | u_int32_t max_vector_size; | |
2117 | boolean_t io_throttled = FALSE; | |
2118 | ||
2119 | u_int32_t vector_upl_iosize = 0; | |
2120 | int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); | |
2121 | off_t v_upl_uio_offset = 0; | |
2122 | int vector_upl_index=0; | |
2123 | upl_t vector_upl = NULL; | |
2124 | ||
2125 | ||
2126 | /* | |
2127 | * When we enter this routine, we know | |
2128 | * -- the resid will not exceed iov_len | |
2129 | */ | |
2130 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START, | |
2131 | (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); | |
2132 | ||
2133 | max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE); | |
2134 | ||
2135 | io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO; | |
2136 | ||
2137 | if (flags & IO_PASSIVE) | |
2138 | io_flag |= CL_PASSIVE; | |
2139 | ||
2140 | if (flags & IO_NOCACHE) | |
2141 | io_flag |= CL_NOCACHE; | |
2142 | ||
2143 | if (flags & IO_SKIP_ENCRYPTION) | |
2144 | io_flag |= CL_ENCRYPTED; | |
2145 | ||
2146 | iostate.io_completed = 0; | |
2147 | iostate.io_issued = 0; | |
2148 | iostate.io_error = 0; | |
2149 | iostate.io_wanted = 0; | |
2150 | ||
2151 | lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); | |
2152 | ||
2153 | mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; | |
2154 | devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; | |
2155 | ||
2156 | if (devblocksize == 1) { | |
2157 | /* | |
2158 | * the AFP client advertises a devblocksize of 1 | |
2159 | * however, its BLOCKMAP routine maps to physical | |
2160 | * blocks that are PAGE_SIZE in size... | |
2161 | * therefore we can't ask for I/Os that aren't page aligned | |
2162 | * or aren't multiples of PAGE_SIZE in size... | |
2163 | * by setting devblocksize to PAGE_SIZE, we re-instate | |
2164 | * the old behavior we had before the mem_alignment_mask | |
2165 | * changes went in... | |
2166 | */ | |
2167 | devblocksize = PAGE_SIZE; | |
2168 | } | |
2169 | ||
2170 | next_dwrite: | |
2171 | io_req_size = *write_length; | |
2172 | iov_base = uio_curriovbase(uio); | |
2173 | ||
2174 | offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK; | |
2175 | offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; | |
2176 | ||
2177 | if (offset_in_file || offset_in_iovbase) { | |
2178 | /* | |
2179 | * one of the 2 important offsets is misaligned | |
2180 | * so fire an I/O through the cache for this entire vector | |
2181 | */ | |
2182 | goto wait_for_dwrites; | |
2183 | } | |
2184 | if (iov_base & (devblocksize - 1)) { | |
2185 | /* | |
2186 | * the offset in memory must be on a device block boundary | |
2187 | * so that we can guarantee that we can generate an | |
2188 | * I/O that ends on a page boundary in cluster_io | |
2189 | */ | |
2190 | goto wait_for_dwrites; | |
2191 | } | |
2192 | ||
2193 | while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) { | |
2194 | int throttle_type; | |
2195 | ||
2196 | if ( (throttle_type = cluster_is_throttled(vp)) ) { | |
2197 | /* | |
2198 | * we're in the throttle window, at the very least | |
2199 | * we want to limit the size of the I/O we're about | |
2200 | * to issue | |
2201 | */ | |
2202 | if ( (flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) { | |
2203 | /* | |
2204 | * we're in the throttle window and at least 1 I/O | |
2205 | * has already been issued by a throttleable thread | |
2206 | * in this window, so return with EAGAIN to indicate | |
2207 | * to the FS issuing the cluster_write call that it | |
2208 | * should now throttle after dropping any locks | |
2209 | */ | |
2210 | throttle_info_update_by_mount(vp->v_mount); | |
2211 | ||
2212 | io_throttled = TRUE; | |
2213 | goto wait_for_dwrites; | |
2214 | } | |
2215 | max_vector_size = THROTTLE_MAX_IOSIZE; | |
2216 | max_io_size = THROTTLE_MAX_IOSIZE; | |
2217 | } else { | |
2218 | max_vector_size = MAX_VECTOR_UPL_SIZE; | |
2219 | max_io_size = max_upl_size; | |
2220 | } | |
2221 | ||
2222 | if (first_IO) { | |
2223 | cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); | |
2224 | first_IO = 0; | |
2225 | } | |
2226 | io_size = io_req_size & ~PAGE_MASK; | |
2227 | iov_base = uio_curriovbase(uio); | |
2228 | ||
2229 | if (io_size > max_io_size) | |
2230 | io_size = max_io_size; | |
2231 | ||
2232 | if(useVectorUPL && (iov_base & PAGE_MASK)) { | |
2233 | /* | |
2234 | * We have an iov_base that's not page-aligned. | |
2235 | * Issue all I/O's that have been collected within | |
2236 | * this Vectored UPL. | |
2237 | */ | |
2238 | if(vector_upl_index) { | |
2239 | retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
2240 | reset_vector_run_state(); | |
2241 | } | |
2242 | ||
2243 | /* | |
2244 | * After this point, if we are using the Vector UPL path and the base is | |
2245 | * not page-aligned then the UPL with that base will be the first in the vector UPL. | |
2246 | */ | |
2247 | } | |
2248 | ||
2249 | upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); | |
2250 | upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; | |
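| /* | |
|  * for illustration (hypothetical numbers): an iov_base ending in 0x1200 | |
|  * gives upl_offset == 0x200; with io_size == 64K the upl must cover | |
|  * (0x200 + 0x10000) rounded up to a page, i.e. upl_needed_size == 68K | |
|  */ | |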
2251 | ||
2252 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START, | |
2253 | (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); | |
2254 | ||
2255 | for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { | |
2256 | pages_in_pl = 0; | |
2257 | upl_size = upl_needed_size; | |
2258 | upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | | |
2259 | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; | |
2260 | ||
2261 | kret = vm_map_get_upl(current_map(), | |
2262 | (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), | |
2263 | &upl_size, | |
2264 | &upl, | |
2265 | NULL, | |
2266 | &pages_in_pl, | |
2267 | &upl_flags, | |
2268 | force_data_sync); | |
2269 | ||
2270 | if (kret != KERN_SUCCESS) { | |
2271 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, | |
2272 | 0, 0, 0, kret, 0); | |
2273 | /* | |
2274 | * failed to get pagelist | |
2275 | * | |
2276 | * we may have already spun some portion of this request | |
2277 | * off as async requests... we need to wait for the I/O | |
2278 | * to complete before returning | |
2279 | */ | |
2280 | goto wait_for_dwrites; | |
2281 | } | |
2282 | pl = UPL_GET_INTERNAL_PAGE_LIST(upl); | |
2283 | pages_in_pl = upl_size / PAGE_SIZE; | |
2284 | ||
2285 | for (i = 0; i < pages_in_pl; i++) { | |
2286 | if (!upl_valid_page(pl, i)) | |
2287 | break; | |
2288 | } | |
2289 | if (i == pages_in_pl) | |
2290 | break; | |
2291 | ||
2292 | /* | |
2293 | * didn't get all the pages back that we | |
2294 | * needed... release this upl and try again | |
2295 | */ | |
2296 | ubc_upl_abort(upl, 0); | |
2297 | } | |
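| /* | |
|  * the loop above makes up to 3 attempts, escalating force_data_sync | |
|  * each time, to wire a pagelist in which every page is valid... | |
|  * falling out with force_data_sync >= 3 means we never got one | |
|  */ | |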
2298 | if (force_data_sync >= 3) { | |
2299 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, | |
2300 | i, pages_in_pl, upl_size, kret, 0); | |
2301 | /* | |
2302 | * for some reason, we couldn't acquire a hold on all | |
2303 | * the pages needed in the user's address space | |
2304 | * | |
2305 | * we may have already spun some portion of this request | |
2306 | * off as async requests... we need to wait for the I/O | |
2307 | * to complete before returning | |
2308 | */ | |
2309 | goto wait_for_dwrites; | |
2310 | } | |
2311 | ||
2312 | /* | |
2313 | * Consider the possibility that upl_size wasn't satisfied. | |
2314 | */ | |
2315 | if (upl_size < upl_needed_size) { | |
2316 | if (upl_size && upl_offset == 0) | |
2317 | io_size = upl_size; | |
2318 | else | |
2319 | io_size = 0; | |
2320 | } | |
2321 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END, | |
2322 | (int)upl_offset, upl_size, (int)iov_base, io_size, 0); | |
2323 | ||
2324 | if (io_size == 0) { | |
2325 | ubc_upl_abort(upl, 0); | |
2326 | /* | |
2327 | * we may have already spun some portion of this request | |
2328 | * off as async requests... we need to wait for the I/O | |
2329 | * to complete before returning | |
2330 | */ | |
2331 | goto wait_for_dwrites; | |
2332 | } | |
2333 | ||
2334 | if(useVectorUPL) { | |
2335 | vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); | |
2336 | if(end_off) | |
2337 | issueVectorUPL = 1; | |
2338 | /* | |
2339 | * After this point, if we are using a vector UPL, then | |
2340 | * either all the UPL elements end on a page boundary OR | |
2341 | * this UPL is the last element because it does not end | |
2342 | * on a page boundary. | |
2343 | */ | |
2344 | } | |
2345 | ||
2346 | /* | |
2347 | * Now look for pages already in the cache | |
2348 | * and throw them away. | |
2349 | * uio->uio_offset is page aligned within the file | |
2350 | * io_size is a multiple of PAGE_SIZE | |
2351 | */ | |
2352 | ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL); | |
2353 | ||
2354 | /* | |
2355 | * we want to push out these writes asynchronously so that we can overlap | |
2356 | * the preparation of the next I/O... | |
2357 | * if there are already too many outstanding writes | |
2358 | * wait until some complete before issuing the next | |
2359 | */ | |
2360 | cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); | |
2361 | ||
2362 | if (iostate.io_error) { | |
2363 | /* | |
2364 | * one of the earlier writes we issued ran into a hard error | |
2365 | * don't issue any more writes, cleanup the UPL | |
2366 | * that was just created but not used, then | |
2367 | * go wait for all writes that are part of this stream | |
2368 | * to complete before returning the error to the caller | |
2369 | */ | |
2370 | ubc_upl_abort(upl, 0); | |
2371 | ||
2372 | goto wait_for_dwrites; | |
2373 | } | |
2374 | ||
2375 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START, | |
2376 | (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0); | |
2377 | ||
2378 | if(!useVectorUPL) | |
2379 | retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, | |
2380 | io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
2381 | ||
2382 | else { | |
2383 | if(!vector_upl_index) { | |
2384 | vector_upl = vector_upl_create(upl_offset); | |
2385 | v_upl_uio_offset = uio->uio_offset; | |
2386 | vector_upl_offset = upl_offset; | |
2387 | } | |
2388 | ||
2389 | vector_upl_set_subupl(vector_upl,upl,upl_size); | |
2390 | vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); | |
2391 | vector_upl_index++; | |
2392 | vector_upl_iosize += io_size; | |
2393 | vector_upl_size += upl_size; | |
2394 | ||
2395 | if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { | |
2396 | retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
2397 | reset_vector_run_state(); | |
2398 | } | |
2399 | } | |
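| /* | |
|  * in the vector UPL case the accumulated run is only pushed to | |
|  * vector_cluster_io once a sub-UPL ends off a page boundary, the | |
|  * element count hits MAX_VECTOR_UPL_ELEMENTS, or the run grows past | |
|  * max_vector_size... otherwise we keep collecting sub-UPLs | |
|  */ | |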
2400 | ||
2401 | /* | |
2402 | * update the uio structure to | |
2403 | * reflect the I/O that we just issued | |
2404 | */ | |
2405 | uio_update(uio, (user_size_t)io_size); | |
2406 | ||
2407 | /* | |
2408 | * in case we end up calling through to cluster_write_copy to finish | |
2409 | * the tail of this request, we need to update the oldEOF so that we | |
2410 | * don't zero-fill the head of a page if we've successfully written | |
2411 | * data to that area... 'cluster_write_copy' will zero-fill the head of a | |
2412 | * page that is beyond the oldEOF if the write is unaligned... we only | |
2413 | * want that to happen for the very first page of the cluster_write, | |
2414 | * NOT the first page of each vector making up a multi-vector write. | |
2415 | */ | |
2416 | if (uio->uio_offset > oldEOF) | |
2417 | oldEOF = uio->uio_offset; | |
2418 | ||
2419 | io_req_size -= io_size; | |
2420 | ||
2421 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END, | |
2422 | (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0); | |
2423 | ||
2424 | } /* end while */ | |
2425 | ||
2426 | if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) { | |
2427 | ||
2428 | retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE); | |
2429 | ||
2430 | if (retval == 0 && *write_type == IO_DIRECT) { | |
2431 | ||
2432 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE, | |
2433 | (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0); | |
2434 | ||
2435 | goto next_dwrite; | |
2436 | } | |
2437 | } | |
2438 | ||
2439 | wait_for_dwrites: | |
2440 | ||
2441 | if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { | |
2442 | retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
2443 | reset_vector_run_state(); | |
2444 | } | |
2445 | /* | |
2446 | * make sure all async writes issued as part of this stream | |
2447 | * have completed before we return | |
2448 | */ | |
2449 | cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); | |
2450 | ||
2451 | if (iostate.io_error) | |
2452 | retval = iostate.io_error; | |
2453 | ||
2454 | lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); | |
2455 | ||
2456 | if (io_throttled == TRUE && retval == 0) | |
2457 | retval = EAGAIN; | |
2458 | ||
2459 | if (io_req_size && retval == 0) { | |
2460 | /* | |
2461 | * we couldn't handle the tail of this request in DIRECT mode | |
2462 | * so fire it through the copy path | |
2463 | * | |
2464 | * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set | |
2465 | * so we can just pass 0 in for the headOff and tailOff | |
2466 | */ | |
2467 | if (uio->uio_offset > oldEOF) | |
2468 | oldEOF = uio->uio_offset; | |
2469 | ||
2470 | retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg); | |
2471 | ||
2472 | *write_type = IO_UNKNOWN; | |
2473 | } | |
2474 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END, | |
2475 | (int)uio->uio_offset, io_req_size, retval, 4, 0); | |
2476 | ||
2477 | return (retval); | |
2478 | } | |
2479 | ||
2480 | ||
2481 | static int | |
2482 | cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length, | |
2483 | int (*callback)(buf_t, void *), void *callback_arg, int bflag) | |
2484 | { | |
2485 | upl_page_info_t *pl; | |
2486 | addr64_t src_paddr = 0; | |
2487 | upl_t upl[MAX_VECTS]; | |
2488 | vm_offset_t upl_offset; | |
2489 | u_int32_t tail_size = 0; | |
2490 | u_int32_t io_size; | |
2491 | u_int32_t xsize; | |
2492 | upl_size_t upl_size; | |
2493 | vm_size_t upl_needed_size; | |
2494 | mach_msg_type_number_t pages_in_pl; | |
2495 | int upl_flags; | |
2496 | kern_return_t kret; | |
2497 | struct clios iostate; | |
2498 | int error = 0; | |
2499 | int cur_upl = 0; | |
2500 | int num_upl = 0; | |
2501 | int n; | |
2502 | user_addr_t iov_base; | |
2503 | u_int32_t devblocksize; | |
2504 | u_int32_t mem_alignment_mask; | |
2505 | ||
2506 | /* | |
2507 | * When we enter this routine, we know | |
2508 | * -- the io_req_size will not exceed iov_len | |
2509 | * -- the target address is physically contiguous | |
2510 | */ | |
2511 | cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0); | |
2512 | ||
2513 | devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; | |
2514 | mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; | |
2515 | ||
2516 | iostate.io_completed = 0; | |
2517 | iostate.io_issued = 0; | |
2518 | iostate.io_error = 0; | |
2519 | iostate.io_wanted = 0; | |
2520 | ||
2521 | lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); | |
2522 | ||
2523 | next_cwrite: | |
2524 | io_size = *write_length; | |
2525 | ||
2526 | iov_base = uio_curriovbase(uio); | |
2527 | ||
2528 | upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); | |
2529 | upl_needed_size = upl_offset + io_size; | |
2530 | ||
2531 | pages_in_pl = 0; | |
2532 | upl_size = upl_needed_size; | |
2533 | upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC | | |
2534 | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; | |
2535 | ||
2536 | kret = vm_map_get_upl(current_map(), | |
2537 | (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), | |
2538 | &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); | |
2539 | ||
2540 | if (kret != KERN_SUCCESS) { | |
2541 | /* | |
2542 | * failed to get pagelist | |
2543 | */ | |
2544 | error = EINVAL; | |
2545 | goto wait_for_cwrites; | |
2546 | } | |
2547 | num_upl++; | |
2548 | ||
2549 | /* | |
2550 | * Consider the possibility that upl_size wasn't satisfied. | |
2551 | */ | |
2552 | if (upl_size < upl_needed_size) { | |
2553 | /* | |
2554 | * This is a failure in the physical memory case. | |
2555 | */ | |
2556 | error = EINVAL; | |
2557 | goto wait_for_cwrites; | |
2558 | } | |
2559 | pl = ubc_upl_pageinfo(upl[cur_upl]); | |
2560 | ||
2561 | src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; | |
2562 | ||
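| /* | |
|  * if the file offset isn't device-block aligned (or the request is | |
|  * smaller than a device block), peel off the unaligned head with | |
|  * cluster_align_phys_io before issuing full-block writes below | |
|  */ | |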
2563 | while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { | |
2564 | u_int32_t head_size; | |
2565 | ||
2566 | head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); | |
2567 | ||
2568 | if (head_size > io_size) | |
2569 | head_size = io_size; | |
2570 | ||
2571 | error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg); | |
2572 | ||
2573 | if (error) | |
2574 | goto wait_for_cwrites; | |
2575 | ||
2576 | upl_offset += head_size; | |
2577 | src_paddr += head_size; | |
2578 | io_size -= head_size; | |
2579 | ||
2580 | iov_base += head_size; | |
2581 | } | |
2582 | if ((u_int32_t)iov_base & mem_alignment_mask) { | |
2583 | /* | |
2584 | * the request isn't set up on a memory boundary | |
2585 | * that the underlying DMA engine can handle... | |
2586 | * return an error instead of going through | |
2587 | * the slow copy path since the intent of this | |
2588 | * path is direct I/O from device memory | |
2589 | */ | |
2590 | error = EINVAL; | |
2591 | goto wait_for_cwrites; | |
2592 | } | |
2593 | ||
2594 | tail_size = io_size & (devblocksize - 1); | |
2595 | io_size -= tail_size; | |
2596 | ||
2597 | while (io_size && error == 0) { | |
2598 | ||
2599 | if (io_size > MAX_IO_CONTIG_SIZE) | |
2600 | xsize = MAX_IO_CONTIG_SIZE; | |
2601 | else | |
2602 | xsize = io_size; | |
2603 | /* | |
2604 | * request asynchronously so that we can overlap | |
2605 | * the preparation of the next I/O... we'll do | |
2606 | * the commit after all the I/O has completed | |
2607 | * since it's all issued against the same UPL... | |
2608 | * if there are already too many outstanding writes | |
2609 | * wait until some have completed before issuing the next | |
2610 | */ | |
2611 | cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); | |
2612 | ||
2613 | if (iostate.io_error) { | |
2614 | /* | |
2615 | * one of the earlier writes we issued ran into a hard error | |
2616 | * don't issue any more writes... | |
2617 | * go wait for all writes that are part of this stream | |
2618 | * to complete before returning the error to the caller | |
2619 | */ | |
2620 | goto wait_for_cwrites; | |
2621 | } | |
2622 | /* | |
2623 | * issue an asynchronous write to cluster_io | |
2624 | */ | |
2625 | error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, | |
2626 | xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg); | |
2627 | ||
2628 | if (error == 0) { | |
2629 | /* | |
2630 | * The cluster_io write completed successfully, | |
2631 | * update the uio structure | |
2632 | */ | |
2633 | uio_update(uio, (user_size_t)xsize); | |
2634 | ||
2635 | upl_offset += xsize; | |
2636 | src_paddr += xsize; | |
2637 | io_size -= xsize; | |
2638 | } | |
2639 | } | |
2640 | if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) { | |
2641 | ||
2642 | error = cluster_io_type(uio, write_type, write_length, 0); | |
2643 | ||
2644 | if (error == 0 && *write_type == IO_CONTIG) { | |
2645 | cur_upl++; | |
2646 | goto next_cwrite; | |
2647 | } | |
2648 | } else | |
2649 | *write_type = IO_UNKNOWN; | |
2650 | ||
2651 | wait_for_cwrites: | |
2652 | /* | |
2653 | * make sure all async writes that are part of this stream | |
2654 | * have completed before we proceed | |
2655 | */ | |
2656 | cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); | |
2657 | ||
2658 | if (iostate.io_error) | |
2659 | error = iostate.io_error; | |
2660 | ||
2661 | lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); | |
2662 | ||
2663 | if (error == 0 && tail_size) | |
2664 | error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); | |
2665 | ||
2666 | for (n = 0; n < num_upl; n++) | |
2667 | /* | |
2668 | * just release our hold on each physically contiguous | |
2669 | * region without changing any state | |
2670 | */ | |
2671 | ubc_upl_abort(upl[n], 0); | |
2672 | ||
2673 | return (error); | |
2674 | } | |
2675 | ||
2676 | ||
2677 | /* | |
2678 | * need to avoid a race between an msync of a range of pages dirtied via mmap | |
2679 | * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's | |
2680 | * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd | |
2681 | * | |
2682 | * we should never force-zero-fill pages that are already valid in the cache... | |
2683 | * the entire page contains valid data (either from disk, zero-filled or dirtied | |
2684 | * via an mmap) so we can only do damage by trying to zero-fill | |
2685 | * | |
2686 | */ | |
2687 | static int | |
2688 | cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero) | |
2689 | { | |
2690 | int zero_pg_index; | |
2691 | boolean_t need_cluster_zero = TRUE; | |
2692 | ||
2693 | if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) { | |
2694 | ||
2695 | bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64)); | |
2696 | zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64); | |
2697 | ||
2698 | if (upl_valid_page(pl, zero_pg_index)) { | |
2699 | /* | |
2700 | * never force zero valid pages - dirty or clean | |
2701 | * we'll leave these in the UPL for cluster_write_copy to deal with | |
2702 | */ | |
2703 | need_cluster_zero = FALSE; | |
2704 | } | |
2705 | } | |
2706 | if (need_cluster_zero == TRUE) | |
2707 | cluster_zero(upl, io_offset, bytes_to_zero, NULL); | |
2708 | ||
2709 | return (bytes_to_zero); | |
2710 | } | |
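| /* | |
|  * cluster_write_copy uses cluster_zero_range for both the head | |
|  * (zero_off/zero_cnt) and tail (zero_off1/zero_cnt1) zero-fill ranges; | |
|  * the return value is the number of bytes consumed from the range, | |
|  * whether or not any zeroing was actually performed | |
|  */ | |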
2711 | ||
2712 | ||
2713 | static int | |
2714 | cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff, | |
2715 | off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
2716 | { | |
2717 | upl_page_info_t *pl; | |
2718 | upl_t upl; | |
2719 | vm_offset_t upl_offset = 0; | |
2720 | vm_size_t upl_size; | |
2721 | off_t upl_f_offset; | |
2722 | int pages_in_upl; | |
2723 | int start_offset; | |
2724 | int xfer_resid; | |
2725 | int io_size; | |
2726 | int io_offset; | |
2727 | int bytes_to_zero; | |
2728 | int bytes_to_move; | |
2729 | kern_return_t kret; | |
2730 | int retval = 0; | |
2731 | int io_resid; | |
2732 | long long total_size; | |
2733 | long long zero_cnt; | |
2734 | off_t zero_off; | |
2735 | long long zero_cnt1; | |
2736 | off_t zero_off1; | |
2737 | off_t write_off = 0; | |
2738 | int write_cnt = 0; | |
2739 | boolean_t first_pass = FALSE; | |
2740 | struct cl_extent cl; | |
2741 | struct cl_writebehind *wbp; | |
2742 | int bflag; | |
2743 | u_int max_cluster_pgcount; | |
2744 | u_int max_io_size; | |
2745 | ||
2746 | if (uio) { | |
2747 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, | |
2748 | (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0); | |
2749 | ||
2750 | io_resid = io_req_size; | |
2751 | } else { | |
2752 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START, | |
2753 | 0, 0, (int)oldEOF, (int)newEOF, 0); | |
2754 | ||
2755 | io_resid = 0; | |
2756 | } | |
2757 | if (flags & IO_PASSIVE) | |
2758 | bflag = CL_PASSIVE; | |
2759 | else | |
2760 | bflag = 0; | |
2761 | if (flags & IO_NOCACHE) | |
2762 | bflag |= CL_NOCACHE; | |
2763 | ||
2764 | if (flags & IO_SKIP_ENCRYPTION) | |
2765 | bflag |= CL_ENCRYPTED; | |
2766 | ||
2767 | zero_cnt = 0; | |
2768 | zero_cnt1 = 0; | |
2769 | zero_off = 0; | |
2770 | zero_off1 = 0; | |
2771 | ||
2772 | max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; | |
2773 | max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE); | |
2774 | ||
2775 | if (flags & IO_HEADZEROFILL) { | |
2776 | /* | |
2777 | * some filesystems (HFS is one) don't support unallocated holes within a file... | |
2778 | * so we zero fill the intervening space between the old EOF and the offset | |
2779 | * where the next chunk of real data begins.... ftruncate will also use this | |
2780 | * routine to zero fill to the new EOF when growing a file... in this case, the | |
2781 | * uio structure will not be provided | |
2782 | */ | |
2783 | if (uio) { | |
2784 | if (headOff < uio->uio_offset) { | |
2785 | zero_cnt = uio->uio_offset - headOff; | |
2786 | zero_off = headOff; | |
2787 | } | |
2788 | } else if (headOff < newEOF) { | |
2789 | zero_cnt = newEOF - headOff; | |
2790 | zero_off = headOff; | |
2791 | } | |
2792 | } else { | |
2793 | if (uio && uio->uio_offset > oldEOF) { | |
2794 | zero_off = uio->uio_offset & ~PAGE_MASK_64; | |
2795 | ||
2796 | if (zero_off >= oldEOF) { | |
2797 | zero_cnt = uio->uio_offset - zero_off; | |
2798 | ||
2799 | flags |= IO_HEADZEROFILL; | |
2800 | } | |
2801 | } | |
2802 | } | |
2803 | if (flags & IO_TAILZEROFILL) { | |
2804 | if (uio) { | |
2805 | zero_off1 = uio->uio_offset + io_req_size; | |
2806 | ||
2807 | if (zero_off1 < tailOff) | |
2808 | zero_cnt1 = tailOff - zero_off1; | |
2809 | } | |
2810 | } else { | |
2811 | if (uio && newEOF > oldEOF) { | |
2812 | zero_off1 = uio->uio_offset + io_req_size; | |
2813 | ||
2814 | if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) { | |
2815 | zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64); | |
2816 | ||
2817 | flags |= IO_TAILZEROFILL; | |
2818 | } | |
2819 | } | |
2820 | } | |
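| /* | |
|  * at this point zero_off/zero_cnt describe any head range to be | |
|  * zero filled and zero_off1/zero_cnt1 describe any tail range... | |
|  * both are folded into the per-pass work alongside the user data | |
|  */ | |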
2821 | if (zero_cnt == 0 && uio == (struct uio *) 0) { | |
2822 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, | |
2823 | retval, 0, 0, 0, 0); | |
2824 | return (0); | |
2825 | } | |
2826 | if (uio) { | |
2827 | write_off = uio->uio_offset; | |
2828 | write_cnt = uio_resid(uio); | |
2829 | /* | |
2830 | * delay updating the sequential write info | |
2831 | * in the control block until we've obtained | |
2832 | * the lock for it | |
2833 | */ | |
2834 | first_pass = TRUE; | |
2835 | } | |
2836 | while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) { | |
2837 | /* | |
2838 | * for this iteration of the loop, figure out where our starting point is | |
2839 | */ | |
2840 | if (zero_cnt) { | |
2841 | start_offset = (int)(zero_off & PAGE_MASK_64); | |
2842 | upl_f_offset = zero_off - start_offset; | |
2843 | } else if (io_resid) { | |
2844 | start_offset = (int)(uio->uio_offset & PAGE_MASK_64); | |
2845 | upl_f_offset = uio->uio_offset - start_offset; | |
2846 | } else { | |
2847 | start_offset = (int)(zero_off1 & PAGE_MASK_64); | |
2848 | upl_f_offset = zero_off1 - start_offset; | |
2849 | } | |
2850 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE, | |
2851 | (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0); | |
2852 | ||
2853 | if (total_size > max_io_size) | |
2854 | total_size = max_io_size; | |
2855 | ||
2856 | cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64); | |
2857 | ||
2858 | if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) { | |
2859 | /* | |
2860 | * assumption... total_size <= io_resid | |
2861 | * because IO_HEADZEROFILL and IO_TAILZEROFILL not set | |
2862 | */ | |
2863 | if ((start_offset + total_size) > max_io_size) | |
2864 | total_size = max_io_size - start_offset; | |
2865 | xfer_resid = total_size; | |
2866 | ||
2867 | retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1); | |
2868 | ||
2869 | if (retval) | |
2870 | break; | |
2871 | ||
2872 | io_resid -= (total_size - xfer_resid); | |
2873 | total_size = xfer_resid; | |
2874 | start_offset = (int)(uio->uio_offset & PAGE_MASK_64); | |
2875 | upl_f_offset = uio->uio_offset - start_offset; | |
2876 | ||
2877 | if (total_size == 0) { | |
2878 | if (start_offset) { | |
2879 | /* | |
2880 | * the write did not finish on a page boundary | |
2881 | * which will leave upl_f_offset pointing to the | |
2882 | * beginning of the last page written instead of | |
2883 | * the page beyond it... bump it in this case | |
2884 | * so that the cluster code records the last page | |
2885 | * written as dirty | |
2886 | */ | |
2887 | upl_f_offset += PAGE_SIZE_64; | |
2888 | } | |
2889 | upl_size = 0; | |
2890 | ||
2891 | goto check_cluster; | |
2892 | } | |
2893 | } | |
2894 | /* | |
2895 | * compute the size of the upl needed to encompass | |
2896 | * the requested write... limit each call to cluster_io | |
2897 | * to the maximum UPL size... cluster_io will clip if | |
2898 | * this exceeds the maximum io_size for the device... | |
2899 | * make sure to account for | |
2900 | * a starting offset that's not page aligned | |
2901 | */ | |
2902 | upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; | |
2903 | ||
2904 | if (upl_size > max_io_size) | |
2905 | upl_size = max_io_size; | |
2906 | ||
2907 | pages_in_upl = upl_size / PAGE_SIZE; | |
2908 | io_size = upl_size - start_offset; | |
2909 | ||
2910 | if ((long long)io_size > total_size) | |
2911 | io_size = total_size; | |
2912 | ||
2913 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0); | |
2914 | ||
2915 | ||
2916 | /* | |
2917 | * Gather the pages from the buffer cache. | |
2918 | * The UPL_WILL_MODIFY flag lets the UPL subsystem know | |
2919 | * that we intend to modify these pages. | |
2920 | */ | |
2921 | kret = ubc_create_upl(vp, | |
2922 | upl_f_offset, | |
2923 | upl_size, | |
2924 | &upl, | |
2925 | &pl, | |
2926 | UPL_SET_LITE | (( uio!=NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY)); | |
2927 | if (kret != KERN_SUCCESS) | |
2928 | panic("cluster_write_copy: failed to get pagelist"); | |
2929 | ||
2930 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, | |
2931 | upl, (int)upl_f_offset, start_offset, 0, 0); | |
2932 | ||
2933 | if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) { | |
2934 | int read_size; | |
2935 | ||
2936 | /* | |
2937 | * we're starting in the middle of the first page of the upl | |
2938 | * and the page isn't currently valid, so we're going to have | |
2939 | * to read it in first... this is a synchronous operation | |
2940 | */ | |
2941 | read_size = PAGE_SIZE; | |
2942 | ||
2943 | if ((upl_f_offset + read_size) > oldEOF) | |
2944 | read_size = oldEOF - upl_f_offset; | |
2945 | ||
2946 | retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, | |
2947 | CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
2948 | if (retval) { | |
2949 | /* | |
2950 | * we had an error during the read which causes us to abort | |
2951 | * the current cluster_write request... before we do, we need | |
2952 | * to release the rest of the pages in the upl without modifying | |
2953 | * their state and mark the failed page in error | |
2954 | */ | |
2955 | ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); | |
2956 | ||
2957 | if (upl_size > PAGE_SIZE) | |
2958 | ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); | |
2959 | ||
2960 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, | |
2961 | upl, 0, 0, retval, 0); | |
2962 | break; | |
2963 | } | |
2964 | } | |
2965 | if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) { | |
2966 | /* | |
2967 | * the last offset we're writing to in this upl does not end on a page | |
2968 | * boundary... if it's not beyond the old EOF, then we'll also need to | |
2969 | * pre-read this page in if it isn't already valid | |
2970 | */ | |
2971 | upl_offset = upl_size - PAGE_SIZE; | |
2972 | ||
2973 | if ((upl_f_offset + start_offset + io_size) < oldEOF && | |
2974 | !upl_valid_page(pl, upl_offset / PAGE_SIZE)) { | |
2975 | int read_size; | |
2976 | ||
2977 | read_size = PAGE_SIZE; | |
2978 | ||
2979 | if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) | |
2980 | read_size = oldEOF - (upl_f_offset + upl_offset); | |
2981 | ||
2982 | retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, | |
2983 | CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
2984 | if (retval) { | |
2985 | /* | |
2986 | * we had an error during the read which causes us to abort | |
2987 | * the current cluster_write request... before we do, we | |
2988 | * need to release the rest of the pages in the upl without | |
2989 | * modifying their state and mark the failed page in error | |
2990 | */ | |
2991 | ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY); | |
2992 | ||
2993 | if (upl_size > PAGE_SIZE) | |
2994 | ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); | |
2995 | ||
2996 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, | |
2997 | upl, 0, 0, retval, 0); | |
2998 | break; | |
2999 | } | |
3000 | } | |
3001 | } | |
3002 | xfer_resid = io_size; | |
3003 | io_offset = start_offset; | |
3004 | ||
3005 | while (zero_cnt && xfer_resid) { | |
3006 | ||
3007 | if (zero_cnt < (long long)xfer_resid) | |
3008 | bytes_to_zero = zero_cnt; | |
3009 | else | |
3010 | bytes_to_zero = xfer_resid; | |
3011 | ||
3012 | bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero); | |
3013 | ||
3014 | xfer_resid -= bytes_to_zero; | |
3015 | zero_cnt -= bytes_to_zero; | |
3016 | zero_off += bytes_to_zero; | |
3017 | io_offset += bytes_to_zero; | |
3018 | } | |
3019 | if (xfer_resid && io_resid) { | |
3020 | u_int32_t io_requested; | |
3021 | ||
3022 | bytes_to_move = min(io_resid, xfer_resid); | |
3023 | io_requested = bytes_to_move; | |
3024 | ||
3025 | retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested); | |
3026 | ||
3027 | if (retval) { | |
3028 | ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); | |
3029 | ||
3030 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE, | |
3031 | upl, 0, 0, retval, 0); | |
3032 | } else { | |
3033 | io_resid -= bytes_to_move; | |
3034 | xfer_resid -= bytes_to_move; | |
3035 | io_offset += bytes_to_move; | |
3036 | } | |
3037 | } | |
3038 | while (xfer_resid && zero_cnt1 && retval == 0) { | |
3039 | ||
3040 | if (zero_cnt1 < (long long)xfer_resid) | |
3041 | bytes_to_zero = zero_cnt1; | |
3042 | else | |
3043 | bytes_to_zero = xfer_resid; | |
3044 | ||
3045 | bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero); | |
3046 | ||
3047 | xfer_resid -= bytes_to_zero; | |
3048 | zero_cnt1 -= bytes_to_zero; | |
3049 | zero_off1 += bytes_to_zero; | |
3050 | io_offset += bytes_to_zero; | |
3051 | } | |
3052 | if (retval == 0) { | |
3053 | int cl_index; | |
3054 | int ret_cluster_try_push; | |
3055 | ||
3056 | io_size += start_offset; | |
3057 | ||
3058 | if ((upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) { | |
3059 | /* | |
3060 | * if we're extending the file with this write | |
3061 | * we'll zero fill the rest of the page so that | |
3062 | * if the file gets extended again in such a way as to leave a | |
3063 | * hole starting at this EOF, we'll have zeros in the correct spot | |
3064 | */ | |
3065 | cluster_zero(upl, io_size, upl_size - io_size, NULL); | |
3066 | } | |
3067 | /* | |
3068 | * release the upl now if we hold one since... | |
3069 | * 1) pages in it may be present in the sparse cluster map | |
3070 | * and may span 2 separate buckets there... if they do and | |
3071 | * we happen to have to flush a bucket to make room and it intersects | |
3072 | * this upl, a deadlock may result on page BUSY | |
3073 | * 2) we're delaying the I/O... from this point forward we're just updating | |
3074 | * the cluster state... no need to hold the pages, so commit them | |
3075 | * 3) IO_SYNC is set... | |
3076 | * because we had to ask for a UPL that provides currently non-present pages, the | |
3077 | * UPL has been automatically set to clear the dirty flags (both software and hardware) | |
3078 | * upon committing it... this is not the behavior we want since it's possible for | |
3079 | * pages currently present as part of a mapped file to be dirtied while the I/O is in flight. | |
3080 | * we'll pick these pages back up later with the correct behavior specified. | |
3081 | * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush | |
3082 | * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages | |
3083 | * we hold since the flushing context is holding the cluster lock. | |
3084 | */ | |
3085 | ubc_upl_commit_range(upl, 0, upl_size, | |
3086 | UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); | |
3087 | check_cluster: | |
3088 | /* | |
3089 | * calculate the last logical block number | |
3090 | * that this delayed I/O encompassed | |
3091 | */ | |
3092 | cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64); | |
3093 | ||
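/*
 * illustrative example (assuming 4K pages): with upl_f_offset == 0x3000
 * and upl_size == 0x2000, cl.e_addr above comes out to 5... i.e. the page
 * index just past the last page (pages 3 and 4) covered by this upl
 */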
3094 | if (flags & IO_SYNC) { | |
3095 | /* | |
3096 | * if the IO_SYNC flag is set then we need to | |
3097 | * bypass any clusters and immediately issue | |
3098 | * the I/O | |
3099 | */ | |
3100 | goto issue_io; | |
3101 | } | |
3102 | /* | |
3103 | * take the lock to protect our accesses | |
3104 | * of the writebehind and sparse cluster state | |
3105 | */ | |
3106 | wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED); | |
3107 | ||
3108 | if (wbp->cl_scmap) { | |
3109 | ||
3110 | if ( !(flags & IO_NOCACHE)) { | |
3111 | /* | |
3112 | * we've fallen into the sparse | |
3113 | * cluster method of delaying dirty pages | |
3114 | */ | |
3115 | sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); | |
3116 | ||
3117 | lck_mtx_unlock(&wbp->cl_lockw); | |
3118 | ||
3119 | continue; | |
3120 | } | |
3121 | /* | |
3122 | * must have done cached writes that fell into | |
3123 | * the sparse cluster mechanism... we've switched | |
3124 | * to uncached writes on the file, so go ahead | |
3125 | * and push whatever's in the sparse map | |
3126 | * and switch back to normal clustering | |
3127 | */ | |
3128 | wbp->cl_number = 0; | |
3129 | ||
3130 | sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); | |
3131 | /* | |
3132 | * no clusters of either type present at this point | |
3133 | * so just go directly to start_new_cluster since | |
3134 | * we know we need to delay this I/O since we've | |
3135 | * already released the pages back into the cache | |
3136 | * to avoid the deadlock with sparse_cluster_push | |
3137 | */ | |
3138 | goto start_new_cluster; | |
3139 | } | |
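/*
 * descriptive note: cl_seq_written only keeps accumulating while each
 * write begins exactly where the previous one ended (write_off ==
 * cl_last_write)... any seek resets it, so it measures the length of
 * the current strictly sequential run of writes
 */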
3140 | if (first_pass) { | |
3141 | if (write_off == wbp->cl_last_write) | |
3142 | wbp->cl_seq_written += write_cnt; | |
3143 | else | |
3144 | wbp->cl_seq_written = write_cnt; | |
3145 | ||
3146 | wbp->cl_last_write = write_off + write_cnt; | |
3147 | ||
3148 | first_pass = FALSE; | |
3149 | } | |
3150 | if (wbp->cl_number == 0) | |
3151 | /* | |
3152 | * no clusters currently present | |
3153 | */ | |
3154 | goto start_new_cluster; | |
3155 | ||
3156 | for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { | |
3157 | /* | |
3158 | * check each cluster that we currently hold | |
3159 | * try to merge some or all of this write into | |
3160 | * one or more of the existing clusters... if | |
3161 | * any portion of the write remains, start a | |
3162 | * new cluster | |
3163 | */ | |
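/*
 * worked example (illustrative, assuming max_cluster_pgcount == 32):
 * an existing cluster spanning pages [100, 120) and a write spanning
 * [110, 125) passes both tests below, so the cluster simply grows
 * to cover [100, 125)
 */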
3164 | if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) { | |
3165 | /* | |
3166 | * the current write starts at or after the current cluster | |
3167 | */ | |
3168 | if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { | |
3169 | /* | |
3170 | * we have a write that fits entirely | |
3171 | * within the existing cluster limits | |
3172 | */ | |
3173 | if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) | |
3174 | /* | |
3175 | * update our idea of where the cluster ends | |
3176 | */ | |
3177 | wbp->cl_clusters[cl_index].e_addr = cl.e_addr; | |
3178 | break; | |
3179 | } | |
3180 | if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) { | |
3181 | /* | |
3182 | * we have a write that starts in the middle of the current cluster | |
3183 | * but extends beyond the cluster's limit... we know this because | |
3184 | * of the previous checks | |
3185 | * we'll extend the current cluster to the max | |
3186 | * and update the b_addr for the current write to reflect that | |
3187 | * the head of it was absorbed into this cluster... | |
3188 | * note that we'll always have a leftover tail in this case since | |
3189 | * full absorption would have occurred in the clause above | |
3190 | */ | |
3191 | wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount; | |
3192 | ||
3193 | cl.b_addr = wbp->cl_clusters[cl_index].e_addr; | |
3194 | } | |
3195 | /* | |
3196 | * we come here for the case where the current write starts | |
3197 | * beyond the limit of the existing cluster or we have a leftover | |
3198 | * tail after a partial absorption | |
3199 | * | |
3200 | * in either case, we'll check the remaining clusters before | |
3201 | * starting a new one | |
3202 | */ | |
3203 | } else { | |
3204 | /* | |
3205 | * the current write starts in front of the cluster we're currently considering | |
3206 | */ | |
3207 | if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) { | |
3208 | /* | |
3209 | * we can just merge the new request into | |
3210 | * this cluster and leave it in the cache | |
3211 | * since the resulting cluster is still | |
3212 | * less than the maximum allowable size | |
3213 | */ | |
3214 | wbp->cl_clusters[cl_index].b_addr = cl.b_addr; | |
3215 | ||
3216 | if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) { | |
3217 | /* | |
3218 | * the current write completely | |
3219 | * envelops the existing cluster and since | |
3220 | * each write is limited to at most max_cluster_pgcount pages | |
3221 | * we can just use the start and last blocknos of the write | |
3222 | * to generate the cluster limits | |
3223 | */ | |
3224 | wbp->cl_clusters[cl_index].e_addr = cl.e_addr; | |
3225 | } | |
3226 | break; | |
3227 | } | |
3228 | ||
3229 | /* | |
3230 | * if we were to combine this write with the current cluster | |
3231 | * we would exceed the cluster size limit.... so, | |
3232 | * let's see if there's any overlap of the new I/O with | |
3233 | * the cluster we're currently considering... in fact, we'll | |
3234 | * stretch the cluster out to its full limit and see if we | |
3235 | * get an intersection with the current write | |
3236 | * | |
3237 | */ | |
3238 | if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) { | |
3239 | /* | |
3240 | * the current write extends into the proposed cluster | |
3241 | * clip the length of the current write after first combining its | |
3242 | * tail with the newly shaped cluster | |
3243 | */ | |
3244 | wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount; | |
3245 | ||
3246 | cl.e_addr = wbp->cl_clusters[cl_index].b_addr; | |
3247 | } | |
3248 | /* | |
3249 | * if we get here, there was no way to merge | |
3250 | * any portion of this write with this cluster | |
3251 | * or we could only merge part of it which | |
3252 | * will leave a tail... | |
3253 | * we'll check the remaining clusters before starting a new one | |
3254 | */ | |
3255 | } | |
3256 | } | |
3257 | if (cl_index < wbp->cl_number) | |
3258 | /* | |
3259 | * we found an existing cluster(s) that we | |
3260 | * could entirely merge this I/O into | |
3261 | */ | |
3262 | goto delay_io; | |
3263 | ||
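/*
 * descriptive note: if every cluster slot is full and the recent writes
 * have been sequential enough to have filled them, proactively push some
 * clusters now (write-behind) so the next pass finds room... this is
 * skipped when the filesystem defers its writes (MNT_DEFWRITE)
 */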
3264 | if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && | |
3265 | wbp->cl_number == MAX_CLUSTERS && | |
3266 | wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { | |
3267 | uint32_t n; | |
3268 | ||
3269 | if (vp->v_mount->mnt_kern_flag & MNTK_SSD) | |
3270 | n = WRITE_BEHIND_SSD; | |
3271 | else | |
3272 | n = WRITE_BEHIND; | |
3273 | ||
3274 | while (n--) | |
3275 | cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); | |
3276 | } | |
3277 | if (wbp->cl_number < MAX_CLUSTERS) { | |
3278 | /* | |
3279 | * we didn't find an existing cluster to | |
3280 | * merge into, but there's room to start | |
3281 | * a new one | |
3282 | */ | |
3283 | goto start_new_cluster; | |
3284 | } | |
3285 | /* | |
3286 | * no existing cluster to merge with and no | |
3287 | * room to start a new one... we'll try | |
3288 | * pushing one of the existing ones... if none of | |
3289 | * them are able to be pushed, we'll switch | |
3290 | * to the sparse cluster mechanism | |
3291 | * cluster_try_push updates cl_number to the | |
3292 | * number of remaining clusters... and | |
3293 | * returns the number of currently unused clusters | |
3294 | */ | |
3295 | ret_cluster_try_push = 0; | |
3296 | ||
3297 | /* | |
3298 | * if writes are not deferred, call cluster push immediately | |
3299 | */ | |
3300 | if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { | |
3301 | ||
3302 | ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); | |
3303 | } | |
3304 | ||
3305 | /* | |
3306 | * execute following regardless of writes being deferred or not | |
3307 | */ | |
3308 | if (ret_cluster_try_push == 0) { | |
3309 | /* | |
3310 | * no more room in the normal cluster mechanism | |
3311 | * so let's switch to the more expansive but expensive | |
3312 | * sparse mechanism.... | |
3313 | */ | |
3314 | sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg); | |
3315 | sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg); | |
3316 | ||
3317 | lck_mtx_unlock(&wbp->cl_lockw); | |
3318 | ||
3319 | continue; | |
3320 | } | |
3321 | start_new_cluster: | |
3322 | wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; | |
3323 | wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; | |
3324 | ||
3325 | wbp->cl_clusters[wbp->cl_number].io_flags = 0; | |
3326 | ||
3327 | if (flags & IO_NOCACHE) | |
3328 | wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE; | |
3329 | ||
3330 | if (bflag & CL_PASSIVE) | |
3331 | wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE; | |
3332 | ||
3333 | wbp->cl_number++; | |
3334 | delay_io: | |
3335 | lck_mtx_unlock(&wbp->cl_lockw); | |
3336 | ||
3337 | continue; | |
3338 | issue_io: | |
3339 | /* | |
3340 | * we don't hold the lock at this point | |
3341 | * | |
3342 | * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set | |
3343 | * so that we correctly deal with a change in state of the hardware modify bit... | |
3344 | * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force | |
3345 | * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also | |
3346 | * responsible for generating the correct sized I/O(s) | |
3347 | */ | |
3348 | retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg); | |
3349 | } | |
3350 | } | |
3351 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0); | |
3352 | ||
3353 | return (retval); | |
3354 | } | |
3355 | ||
3356 | ||
3357 | ||
3358 | int | |
3359 | cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags) | |
3360 | { | |
3361 | return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL); | |
3362 | } | |
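/*
 * illustrative call site (not from this file, names are hypothetical):
 * a filesystem's read vnop typically hands the request straight through,
 * e.g.
 *
 *	error = cluster_read(vp, uio, (off_t)current_eof, ioflag);
 *
 * the xflags argument carries the caller's IO_* flags (e.g. IO_NOCACHE),
 * which are translated into the CL_* flags used by cluster_io internally
 */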
3363 | ||
3364 | ||
3365 | int | |
3366 | cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg) | |
3367 | { | |
3368 | int retval = 0; | |
3369 | int flags; | |
3370 | user_ssize_t cur_resid; | |
3371 | u_int32_t io_size; | |
3372 | u_int32_t read_length = 0; | |
3373 | int read_type = IO_COPY; | |
3374 | ||
3375 | flags = xflags; | |
3376 | ||
3377 | if (vp->v_flag & VNOCACHE_DATA) | |
3378 | flags |= IO_NOCACHE; | |
3379 | if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) | |
3380 | flags |= IO_RAOFF; | |
3381 | ||
3382 | if (flags & IO_SKIP_ENCRYPTION) | |
3383 | flags |= IO_ENCRYPTED; | |
3384 | /* | |
3385 | * If we're doing an encrypted IO, then first check to see | |
3386 | * if the IO requested was page aligned. If not, then bail | |
3387 | * out immediately. | |
3388 | */ | |
3389 | if (flags & IO_ENCRYPTED) { | |
3390 | if (read_length & PAGE_MASK) { | |
3391 | retval = EINVAL; | |
3392 | return retval; | |
3393 | } | |
3394 | } | |
3395 | ||
3396 | /* | |
3397 | * do a read through the cache if one of the following is true.... | |
3398 | * NOCACHE is not true | |
3399 | * the uio request doesn't target USERSPACE | |
3400 | * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well. | |
3401 | * Reading encrypted data from a CP filesystem should never result in the data touching | |
3402 | * the UBC. | |
3403 | * | |
3404 | * otherwise, find out if we want the direct or contig variant for | |
3405 | * the first vector in the uio request | |
3406 | */ | |
3407 | if ( ((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED) ) { | |
3408 | ||
3409 | retval = cluster_io_type(uio, &read_type, &read_length, 0); | |
3410 | } | |
3411 | ||
3412 | while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) { | |
3413 | ||
3414 | switch (read_type) { | |
3415 | ||
3416 | case IO_COPY: | |
3417 | /* | |
3418 | * make sure the uio_resid isn't too big... | |
3419 | * internally, we want to handle all of the I/O in | |
3420 | * chunk sizes that fit in a 32 bit int | |
3421 | */ | |
3422 | if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) | |
3423 | io_size = MAX_IO_REQUEST_SIZE; | |
3424 | else | |
3425 | io_size = (u_int32_t)cur_resid; | |
3426 | ||
3427 | retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg); | |
3428 | break; | |
3429 | ||
3430 | case IO_DIRECT: | |
3431 | retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg); | |
3432 | break; | |
3433 | ||
3434 | case IO_CONTIG: | |
3435 | retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags); | |
3436 | break; | |
3437 | ||
3438 | case IO_UNKNOWN: | |
3439 | retval = cluster_io_type(uio, &read_type, &read_length, 0); | |
3440 | break; | |
3441 | } | |
3442 | } | |
3443 | return (retval); | |
3444 | } | |
3445 | ||
3446 | ||
3447 | ||
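/*
 * release (abort) the page range [start_pg, last_pg) of this upl without
 * modifying the pages... when take_reference is set, UPL_ABORT_REFERENCE
 * is added so the VM can treat the pages as recently referenced instead
 * of letting them age out right away
 */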
3448 | static void | |
3449 | cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference) | |
3450 | { | |
3451 | int range; | |
3452 | int abort_flags = UPL_ABORT_FREE_ON_EMPTY; | |
3453 | ||
3454 | if ((range = last_pg - start_pg)) { | |
3455 | if (take_reference) | |
3456 | abort_flags |= UPL_ABORT_REFERENCE; | |
3457 | ||
3458 | ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags); | |
3459 | } | |
3460 | } | |
3461 | ||
3462 | ||
3463 | static int | |
3464 | cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
3465 | { | |
3466 | upl_page_info_t *pl; | |
3467 | upl_t upl; | |
3468 | vm_offset_t upl_offset; | |
3469 | u_int32_t upl_size; | |
3470 | off_t upl_f_offset; | |
3471 | int start_offset; | |
3472 | int start_pg; | |
3473 | int last_pg; | |
3474 | int uio_last = 0; | |
3475 | int pages_in_upl; | |
3476 | off_t max_size; | |
3477 | off_t last_ioread_offset; | |
3478 | off_t last_request_offset; | |
3479 | kern_return_t kret; | |
3480 | int error = 0; | |
3481 | int retval = 0; | |
3482 | u_int32_t size_of_prefetch; | |
3483 | u_int32_t xsize; | |
3484 | u_int32_t io_size; | |
3485 | u_int32_t max_rd_size; | |
3486 | u_int32_t max_io_size; | |
3487 | u_int32_t max_prefetch; | |
3488 | u_int rd_ahead_enabled = 1; | |
3489 | u_int prefetch_enabled = 1; | |
3490 | struct cl_readahead * rap; | |
3491 | struct clios iostate; | |
3492 | struct cl_extent extent; | |
3493 | int bflag; | |
3494 | int take_reference = 1; | |
3495 | int policy = IOPOL_DEFAULT; | |
3496 | boolean_t iolock_inited = FALSE; | |
3497 | ||
3498 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, | |
3499 | (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); | |
3500 | ||
3501 | if (flags & IO_ENCRYPTED) { | |
3502 | panic ("encrypted blocks will hit UBC!"); | |
3503 | } | |
3504 | ||
3505 | policy = throttle_get_io_policy(NULL); | |
3506 | ||
3507 | if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) | |
3508 | take_reference = 0; | |
3509 | ||
3510 | if (flags & IO_PASSIVE) | |
3511 | bflag = CL_PASSIVE; | |
3512 | else | |
3513 | bflag = 0; | |
3514 | ||
3515 | if (flags & IO_NOCACHE) | |
3516 | bflag |= CL_NOCACHE; | |
3517 | ||
3518 | if (flags & IO_SKIP_ENCRYPTION) | |
3519 | bflag |= CL_ENCRYPTED; | |
3520 | ||
3521 | max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); | |
3522 | max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); | |
3523 | max_rd_size = max_prefetch; | |
3524 | ||
3525 | last_request_offset = uio->uio_offset + io_req_size; | |
3526 | ||
3527 | if (last_request_offset > filesize) | |
3528 | last_request_offset = filesize; | |
3529 | ||
3530 | if ((flags & (IO_RAOFF|IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) { | |
3531 | rd_ahead_enabled = 0; | |
3532 | rap = NULL; | |
3533 | } else { | |
3534 | if (cluster_is_throttled(vp)) { | |
3535 | /* | |
3536 | * we're in the throttle window, at the very least | |
3537 | * we want to limit the size of the I/O we're about | |
3538 | * to issue | |
3539 | */ | |
3540 | rd_ahead_enabled = 0; | |
3541 | prefetch_enabled = 0; | |
3542 | ||
3543 | max_rd_size = THROTTLE_MAX_IOSIZE; | |
3544 | } | |
3545 | if ((rap = cluster_get_rap(vp)) == NULL) | |
3546 | rd_ahead_enabled = 0; | |
3547 | else { | |
3548 | extent.b_addr = uio->uio_offset / PAGE_SIZE_64; | |
3549 | extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64; | |
3550 | } | |
3551 | } | |
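/*
 * illustrative example (assuming 4K pages): a 10000 byte read starting
 * at offset 0 yields extent.b_addr == 0 and extent.e_addr == 2, i.e. the
 * extent covers pages 0 through 2 inclusive
 */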
3552 | if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) { | |
3553 | /* | |
3554 | * determine if we already have a read-ahead in the pipe courtesy of the | |
3555 | * last read system call that was issued... | |
3556 | * if so, pick up its extent to determine where we should start | |
3557 | * with respect to any read-ahead that might be necessary to | |
3558 | * garner all the data needed to complete this read system call | |
3559 | */ | |
3560 | last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64; | |
3561 | ||
3562 | if (last_ioread_offset < uio->uio_offset) | |
3563 | last_ioread_offset = (off_t)0; | |
3564 | else if (last_ioread_offset > last_request_offset) | |
3565 | last_ioread_offset = last_request_offset; | |
3566 | } else | |
3567 | last_ioread_offset = (off_t)0; | |
3568 | ||
3569 | while (io_req_size && uio->uio_offset < filesize && retval == 0) { | |
3570 | ||
3571 | max_size = filesize - uio->uio_offset; | |
3572 | ||
3573 | if ((off_t)(io_req_size) < max_size) | |
3574 | io_size = io_req_size; | |
3575 | else | |
3576 | io_size = max_size; | |
3577 | ||
3578 | if (!(flags & IO_NOCACHE)) { | |
3579 | ||
3580 | while (io_size) { | |
3581 | u_int32_t io_resid; | |
3582 | u_int32_t io_requested; | |
3583 | ||
3584 | /* | |
3585 | * if we keep finding the pages we need already in the cache, then | |
3586 | * don't bother to call cluster_read_prefetch since it costs CPU cycles | |
3587 | * to determine that we have all the pages we need... once we miss in | |
3588 | * the cache and have issued an I/O, then we'll assume that we're likely | |
3589 | * to continue to miss in the cache and it's to our advantage to try and prefetch | |
3590 | */ | |
3591 | if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) { | |
3592 | if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) { | |
3593 | /* | |
3594 | * we've already issued I/O for this request and | |
3595 | * there's still work to do and | |
3596 | * our prefetch stream is running dry, so issue a | |
3597 | * pre-fetch I/O... the I/O latency will overlap | |
3598 | * with the copying of the data | |
3599 | */ | |
3600 | if (size_of_prefetch > max_rd_size) | |
3601 | size_of_prefetch = max_rd_size; | |
3602 | ||
3603 | size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); | |
3604 | ||
3605 | last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); | |
3606 | ||
3607 | if (last_ioread_offset > last_request_offset) | |
3608 | last_ioread_offset = last_request_offset; | |
3609 | } | |
3610 | } | |
3611 | /* | |
3612 | * limit the size of the copy we're about to do so that | |
3613 | * we can notice that our I/O pipe is running dry and | |
3614 | * get the next I/O issued before it does go dry | |
3615 | */ | |
3616 | if (last_ioread_offset && io_size > (max_io_size / 4)) | |
3617 | io_resid = (max_io_size / 4); | |
3618 | else | |
3619 | io_resid = io_size; | |
3620 | ||
3621 | io_requested = io_resid; | |
3622 | ||
3623 | retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); | |
3624 | ||
3625 | xsize = io_requested - io_resid; | |
3626 | ||
3627 | io_size -= xsize; | |
3628 | io_req_size -= xsize; | |
3629 | ||
3630 | if (retval || io_resid) | |
3631 | /* | |
3632 | * if we run into a real error or | |
3633 | * a page that is not in the cache | |
3634 | * we need to leave streaming mode | |
3635 | */ | |
3636 | break; | |
3637 | ||
3638 | if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) { | |
3639 | /* | |
3640 | * we've already finished the I/O for this read request | |
3641 | * let's see if we should do a read-ahead | |
3642 | */ | |
3643 | cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); | |
3644 | } | |
3645 | } | |
3646 | if (retval) | |
3647 | break; | |
3648 | if (io_size == 0) { | |
3649 | if (rap != NULL) { | |
3650 | if (extent.e_addr < rap->cl_lastr) | |
3651 | rap->cl_maxra = 0; | |
3652 | rap->cl_lastr = extent.e_addr; | |
3653 | } | |
3654 | break; | |
3655 | } | |
3656 | /* | |
3657 | * recompute max_size since cluster_copy_ubc_data_internal | |
3658 | * may have advanced uio->uio_offset | |
3659 | */ | |
3660 | max_size = filesize - uio->uio_offset; | |
3661 | } | |
3662 | ||
3663 | iostate.io_completed = 0; | |
3664 | iostate.io_issued = 0; | |
3665 | iostate.io_error = 0; | |
3666 | iostate.io_wanted = 0; | |
3667 | ||
3668 | if ( (flags & IO_RETURN_ON_THROTTLE) ) { | |
3669 | if (cluster_is_throttled(vp) == THROTTLE_NOW) { | |
3670 | if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { | |
3671 | /* | |
3672 | * we're in the throttle window and at least 1 I/O | |
3673 | * has already been issued by a throttleable thread | |
3674 | * in this window, so return with EAGAIN to indicate | |
3675 | * to the FS issuing the cluster_read call that it | |
3676 | * should now throttle after dropping any locks | |
3677 | */ | |
3678 | throttle_info_update_by_mount(vp->v_mount); | |
3679 | ||
3680 | retval = EAGAIN; | |
3681 | break; | |
3682 | } | |
3683 | } | |
3684 | } | |
3685 | ||
3686 | /* | |
3687 | * compute the size of the upl needed to encompass | |
3688 | * the requested read... limit each call to cluster_io | |
3689 | * to the maximum UPL size... cluster_io will clip if | |
3690 | * this exceeds the maximum io_size for the device, | |
3691 | * make sure to account for | |
3692 | * a starting offset that's not page aligned | |
3693 | */ | |
3694 | start_offset = (int)(uio->uio_offset & PAGE_MASK_64); | |
3695 | upl_f_offset = uio->uio_offset - (off_t)start_offset; | |
3696 | ||
3697 | if (io_size > max_rd_size) | |
3698 | io_size = max_rd_size; | |
3699 | ||
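/*
 * illustrative example (assuming 4K pages): a read at file offset 0x1200
 * for 0x3000 bytes gives start_offset == 0x200 and an upl_size below of
 * 0x4000 (4 pages), since the upl must start and end on page boundaries
 */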
3700 | upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; | |
3701 | ||
3702 | if (flags & IO_NOCACHE) { | |
3703 | if (upl_size > max_io_size) | |
3704 | upl_size = max_io_size; | |
3705 | } else { | |
3706 | if (upl_size > max_io_size / 4) { | |
3707 | upl_size = max_io_size / 4; | |
3708 | upl_size &= ~PAGE_MASK; | |
3709 | ||
3710 | if (upl_size == 0) | |
3711 | upl_size = PAGE_SIZE; | |
3712 | } | |
3713 | } | |
3714 | pages_in_upl = upl_size / PAGE_SIZE; | |
3715 | ||
3716 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START, | |
3717 | upl, (int)upl_f_offset, upl_size, start_offset, 0); | |
3718 | ||
3719 | kret = ubc_create_upl(vp, | |
3720 | upl_f_offset, | |
3721 | upl_size, | |
3722 | &upl, | |
3723 | &pl, | |
3724 | UPL_FILE_IO | UPL_SET_LITE); | |
3725 | if (kret != KERN_SUCCESS) | |
3726 | panic("cluster_read_copy: failed to get pagelist"); | |
3727 | ||
3728 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END, | |
3729 | upl, (int)upl_f_offset, upl_size, start_offset, 0); | |
3730 | ||
3731 | /* | |
3732 | * scan from the beginning of the upl looking for the first | |
3733 | * non-valid page.... this will become the first page in | |
3734 | * the request we're going to make to 'cluster_io'... if all | |
3735 | * of the pages are valid, we won't call through to 'cluster_io' | |
3736 | */ | |
3737 | for (start_pg = 0; start_pg < pages_in_upl; start_pg++) { | |
3738 | if (!upl_valid_page(pl, start_pg)) | |
3739 | break; | |
3740 | } | |
3741 | ||
3742 | /* | |
3743 | * scan from the starting invalid page looking for a valid | |
3744 | * page before the end of the upl is reached, if we | |
3745 | * find one, then it will be the last page of the request to | |
3746 | * 'cluster_io' | |
3747 | */ | |
3748 | for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { | |
3749 | if (upl_valid_page(pl, last_pg)) | |
3750 | break; | |
3751 | } | |
3752 | ||
3753 | if (start_pg < last_pg) { | |
3754 | /* | |
3755 | * we found a range of 'invalid' pages that must be filled | |
3756 | * if the last page in this range is the last page of the file | |
3757 | * we may have to clip the size of it to keep from reading past | |
3758 | * the end of the last physical block associated with the file | |
3759 | */ | |
3760 | if (iolock_inited == FALSE) { | |
3761 | lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); | |
3762 | ||
3763 | iolock_inited = TRUE; | |
3764 | } | |
3765 | upl_offset = start_pg * PAGE_SIZE; | |
3766 | io_size = (last_pg - start_pg) * PAGE_SIZE; | |
3767 | ||
3768 | if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) | |
3769 | io_size = filesize - (upl_f_offset + upl_offset); | |
3770 | ||
3771 | /* | |
3772 | * issue an asynchronous read to cluster_io | |
3773 | */ | |
3774 | ||
3775 | error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, | |
3776 | io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); | |
3777 | ||
3778 | if (rap) { | |
3779 | if (extent.e_addr < rap->cl_maxra) { | |
3780 | /* | |
3781 | * we've just issued a read for a block that should have been | |
3782 | * in the cache courtesy of the read-ahead engine... something | |
3783 | * has gone wrong with the pipeline, so reset the read-ahead | |
3784 | * logic which will cause us to restart from scratch | |
3785 | */ | |
3786 | rap->cl_maxra = 0; | |
3787 | } | |
3788 | } | |
3789 | } | |
3790 | if (error == 0) { | |
3791 | /* | |
3792 | * if the read completed successfully, or there was no I/O request | |
3793 | * issued, then copy the data into user land via 'cluster_copy_upl_data' | |
3794 | * we'll first add on any 'valid' | |
3795 | * pages that were present in the upl when we acquired it. | |
3796 | */ | |
3797 | u_int val_size; | |
3798 | ||
3799 | for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) { | |
3800 | if (!upl_valid_page(pl, uio_last)) | |
3801 | break; | |
3802 | } | |
3803 | if (uio_last < pages_in_upl) { | |
3804 | /* | |
3805 | * there were some invalid pages beyond the valid pages | |
3806 | * that we didn't issue an I/O for, just release them | |
3807 | * unchanged now, so that any prefetch/readahead can | |
3808 | * include them | |
3809 | */ | |
3810 | ubc_upl_abort_range(upl, uio_last * PAGE_SIZE, | |
3811 | (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); | |
3812 | } | |
3813 | ||
3814 | /* | |
3815 | * compute size to transfer this round, if io_req_size is | |
3816 | * still non-zero after this attempt, we'll loop around and | |
3817 | * set up for another I/O. | |
3818 | */ | |
3819 | val_size = (uio_last * PAGE_SIZE) - start_offset; | |
3820 | ||
3821 | if (val_size > max_size) | |
3822 | val_size = max_size; | |
3823 | ||
3824 | if (val_size > io_req_size) | |
3825 | val_size = io_req_size; | |
3826 | ||
3827 | if ((uio->uio_offset + val_size) > last_ioread_offset) | |
3828 | last_ioread_offset = uio->uio_offset + val_size; | |
3829 | ||
3830 | if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) { | |
3831 | ||
3832 | if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) { | |
3833 | /* | |
3834 | * if there's still I/O left to do for this request, and... | |
3835 | * we're not in hard throttle mode, and... | |
3836 | * we're close to using up the previous prefetch, then issue a | |
3837 | * new pre-fetch I/O... the I/O latency will overlap | |
3838 | * with the copying of the data | |
3839 | */ | |
3840 | if (size_of_prefetch > max_rd_size) | |
3841 | size_of_prefetch = max_rd_size; | |
3842 | ||
3843 | size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag); | |
3844 | ||
3845 | last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE); | |
3846 | ||
3847 | if (last_ioread_offset > last_request_offset) | |
3848 | last_ioread_offset = last_request_offset; | |
3849 | } | |
3850 | ||
3851 | } else if ((uio->uio_offset + val_size) == last_request_offset) { | |
3852 | /* | |
3853 | * this transfer will finish this request, so... | |
3854 | * let's try to read ahead if we're in | |
3855 | * a sequential access pattern and we haven't | |
3856 | * explicitly disabled it | |
3857 | */ | |
3858 | if (rd_ahead_enabled) | |
3859 | cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag); | |
3860 | ||
3861 | if (rap != NULL) { | |
3862 | if (extent.e_addr < rap->cl_lastr) | |
3863 | rap->cl_maxra = 0; | |
3864 | rap->cl_lastr = extent.e_addr; | |
3865 | } | |
3866 | } | |
3867 | if (iolock_inited == TRUE) | |
3868 | cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); | |
3869 | ||
3870 | if (iostate.io_error) | |
3871 | error = iostate.io_error; | |
3872 | else { | |
3873 | u_int32_t io_requested; | |
3874 | ||
3875 | io_requested = val_size; | |
3876 | ||
3877 | retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested); | |
3878 | ||
3879 | io_req_size -= (val_size - io_requested); | |
3880 | } | |
3881 | } else { | |
3882 | if (iolock_inited == TRUE) | |
3883 | cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); | |
3884 | } | |
3885 | if (start_pg < last_pg) { | |
3886 | /* | |
3887 | * compute the range of pages that we actually issued an I/O for | |
3888 | * and either commit them as valid if the I/O succeeded | |
3889 | * or abort them if the I/O failed or we're not supposed to | |
3890 | * keep them in the cache | |
3891 | */ | |
3892 | io_size = (last_pg - start_pg) * PAGE_SIZE; | |
3893 | ||
3894 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0); | |
3895 | ||
3896 | if (error || (flags & IO_NOCACHE)) | |
3897 | ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size, | |
3898 | UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); | |
3899 | else { | |
3900 | int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY; | |
3901 | ||
3902 | if (take_reference) | |
3903 | commit_flags |= UPL_COMMIT_INACTIVATE; | |
3904 | else | |
3905 | commit_flags |= UPL_COMMIT_SPECULATE; | |
3906 | ||
3907 | ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags); | |
3908 | } | |
3909 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0); | |
3910 | } | |
3911 | if ((last_pg - start_pg) < pages_in_upl) { | |
3912 | /* | |
3913 | * the set of pages that we issued an I/O for did not encompass | |
3914 | * the entire upl... so just release these without modifying | |
3915 | * their state | |
3916 | */ | |
3917 | if (error) | |
3918 | ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); | |
3919 | else { | |
3920 | ||
3921 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, | |
3922 | upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0); | |
3923 | ||
3924 | /* | |
3925 | * handle any valid pages at the beginning of | |
3926 | * the upl... release these appropriately | |
3927 | */ | |
3928 | cluster_read_upl_release(upl, 0, start_pg, take_reference); | |
3929 | ||
3930 | /* | |
3931 | * handle any valid pages immediately after the | |
3932 | * pages we issued I/O for... release these appropriately | |
3933 | */ | |
3934 | cluster_read_upl_release(upl, last_pg, uio_last, take_reference); | |
3935 | ||
3936 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0); | |
3937 | } | |
3938 | } | |
3939 | if (retval == 0) | |
3940 | retval = error; | |
3941 | ||
3942 | if (io_req_size) { | |
3943 | if (cluster_is_throttled(vp)) { | |
3944 | /* | |
3945 | * we're in the throttle window, at the very least | |
3946 | * we want to limit the size of the I/O we're about | |
3947 | * to issue | |
3948 | */ | |
3949 | rd_ahead_enabled = 0; | |
3950 | prefetch_enabled = 0; | |
3951 | max_rd_size = THROTTLE_MAX_IOSIZE; | |
3952 | } else { | |
3953 | if (max_rd_size == THROTTLE_MAX_IOSIZE) { | |
3954 | /* | |
3955 | * coming out of throttled state | |
3956 | */ | |
3957 | if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) { | |
3958 | if (rap != NULL) | |
3959 | rd_ahead_enabled = 1; | |
3960 | prefetch_enabled = 1; | |
3961 | } | |
3962 | max_rd_size = max_prefetch; | |
3963 | last_ioread_offset = 0; | |
3964 | } | |
3965 | } | |
3966 | } | |
3967 | } | |
3968 | if (iolock_inited == TRUE) { | |
3969 | /* | |
3970 | * cluster_io returned an error after it | |
3971 | * had already issued some I/O. we need | |
3972 | * to wait for that I/O to complete before | |
3973 | * we can destroy the iostate mutex... | |
3974 | * 'retval' already contains the early error | |
3975 | * so no need to pick it up from iostate.io_error | |
3976 | */ | |
3977 | cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); | |
3978 | ||
3979 | lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); | |
3980 | } | |
3981 | if (rap != NULL) { | |
3982 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, | |
3983 | (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); | |
3984 | ||
3985 | lck_mtx_unlock(&rap->cl_lockr); | |
3986 | } else { | |
3987 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, | |
3988 | (int)uio->uio_offset, io_req_size, 0, retval, 0); | |
3989 | } | |
3990 | ||
3991 | return (retval); | |
3992 | } | |
3993 | ||
3994 | static int | |
3995 | cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, | |
3996 | int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
3997 | { | |
3998 | upl_t upl; | |
3999 | upl_page_info_t *pl; | |
4000 | off_t max_io_size; | |
4001 | vm_offset_t upl_offset, vector_upl_offset = 0; | |
4002 | upl_size_t upl_size, vector_upl_size = 0; | |
4003 | vm_size_t upl_needed_size; | |
4004 | unsigned int pages_in_pl; | |
4005 | int upl_flags; | |
4006 | kern_return_t kret; | |
4007 | unsigned int i; | |
4008 | int force_data_sync; | |
4009 | int retval = 0; | |
4010 | int no_zero_fill = 0; | |
4011 | int io_flag = 0; | |
4012 | int misaligned = 0; | |
4013 | struct clios iostate; | |
4014 | user_addr_t iov_base; | |
4015 | u_int32_t io_req_size; | |
4016 | u_int32_t offset_in_file; | |
4017 | u_int32_t offset_in_iovbase; | |
4018 | u_int32_t io_size; | |
4019 | u_int32_t io_min; | |
4020 | u_int32_t xsize; | |
4021 | u_int32_t devblocksize; | |
4022 | u_int32_t mem_alignment_mask; | |
4023 | u_int32_t max_upl_size; | |
4024 | u_int32_t max_rd_size; | |
4025 | u_int32_t max_rd_ahead; | |
4026 | u_int32_t max_vector_size; | |
4027 | boolean_t strict_uncached_IO = FALSE; | |
4028 | boolean_t io_throttled = FALSE; | |
4029 | ||
4030 | u_int32_t vector_upl_iosize = 0; | |
4031 | int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); | |
4032 | off_t v_upl_uio_offset = 0; | |
4033 | int vector_upl_index=0; | |
4034 | upl_t vector_upl = NULL; | |
4035 | ||
4036 | user_addr_t orig_iov_base = 0; | |
4037 | user_addr_t last_iov_base = 0; | |
4038 | user_addr_t next_iov_base = 0; | |
4039 | ||
4040 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START, | |
4041 | (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); | |
4042 | ||
4043 | max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ); | |
4044 | ||
4045 | max_rd_size = max_upl_size; | |
4046 | max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); | |
4047 | ||
4048 | io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; | |
4049 | ||
4050 | if (flags & IO_PASSIVE) | |
4051 | io_flag |= CL_PASSIVE; | |
4052 | ||
4053 | if (flags & IO_ENCRYPTED) { | |
4054 | io_flag |= CL_RAW_ENCRYPTED; | |
4055 | } | |
4056 | ||
4057 | if (flags & IO_NOCACHE) { | |
4058 | io_flag |= CL_NOCACHE; | |
4059 | } | |
4060 | ||
4061 | if (flags & IO_SKIP_ENCRYPTION) | |
4062 | io_flag |= CL_ENCRYPTED; | |
4063 | ||
4064 | iostate.io_completed = 0; | |
4065 | iostate.io_issued = 0; | |
4066 | iostate.io_error = 0; | |
4067 | iostate.io_wanted = 0; | |
4068 | ||
4069 | lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); | |
4070 | ||
4071 | devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; | |
4072 | mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; | |
4073 | ||
4074 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, | |
4075 | (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0); | |
4076 | ||
4077 | if (devblocksize == 1) { | |
4078 | /* | |
4079 | * the AFP client advertises a devblocksize of 1 | |
4080 | * however, its BLOCKMAP routine maps to physical | |
4081 | * blocks that are PAGE_SIZE in size... | |
4082 | * therefore we can't ask for I/Os that aren't page aligned | |
4083 | * or aren't multiples of PAGE_SIZE in size... | |
4084 | * by setting devblocksize to PAGE_SIZE, we reinstate | |
4085 | * the old behavior we had before the mem_alignment_mask | |
4086 | * changes went in... | |
4087 | */ | |
4088 | devblocksize = PAGE_SIZE; | |
4089 | } | |
4090 | ||
4091 | strict_uncached_IO = ubc_strict_uncached_IO(vp); | |
4092 | ||
4093 | orig_iov_base = uio_curriovbase(uio); | |
4094 | last_iov_base = orig_iov_base; | |
4095 | ||
4096 | next_dread: | |
4097 | io_req_size = *read_length; | |
4098 | iov_base = uio_curriovbase(uio); | |
4099 | ||
4100 | max_io_size = filesize - uio->uio_offset; | |
4101 | ||
4102 | if ((off_t)io_req_size > max_io_size) | |
4103 | io_req_size = max_io_size; | |
4104 | ||
4105 | offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1); | |
4106 | offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask; | |
4107 | ||
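/*
 * illustrative example: with devblocksize == 512, a read starting at
 * file offset 1000 gives offset_in_file == 488, so the request is
 * treated as misaligned and this entire vector is serviced through
 * the cached (copy) path instead
 */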
4108 | if (offset_in_file || offset_in_iovbase) { | |
4109 | /* | |
4110 | * one of the 2 important offsets is misaligned | |
4111 | * so fire an I/O through the cache for this entire vector | |
4112 | */ | |
4113 | misaligned = 1; | |
4114 | } | |
4115 | if (iov_base & (devblocksize - 1)) { | |
4116 | /* | |
4117 | * the offset in memory must be on a device block boundary | |
4118 | * so that we can guarantee that we can generate an | |
4119 | * I/O that ends on a page boundary in cluster_io | |
4120 | */ | |
4121 | misaligned = 1; | |
4122 | } | |
4123 | ||
4124 | /* | |
4125 | * The user must request IO in aligned chunks. If the | |
4126 | * offset into the file is bad, or the userland pointer | |
4127 | * is non-aligned, then we cannot service the encrypted IO request. | |
4128 | */ | |
4129 | if ((flags & IO_ENCRYPTED) && (misaligned)) { | |
4130 | retval = EINVAL; | |
4131 | } | |
4132 | ||
4133 | /* | |
4134 | * When we get to this point, we know... | |
4135 | * -- the offset into the file is on a devblocksize boundary | |
4136 | */ | |
4137 | ||
4138 | while (io_req_size && retval == 0) { | |
4139 | u_int32_t io_start; | |
4140 | ||
4141 | if (cluster_is_throttled(vp)) { | |
4142 | /* | |
4143 | * we're in the throttle window, at the very least | |
4144 | * we want to limit the size of the I/O we're about | |
4145 | * to issue | |
4146 | */ | |
4147 | max_rd_size = THROTTLE_MAX_IOSIZE; | |
4148 | max_rd_ahead = THROTTLE_MAX_IOSIZE - 1; | |
4149 | max_vector_size = THROTTLE_MAX_IOSIZE; | |
4150 | } else { | |
4151 | max_rd_size = max_upl_size; | |
4152 | max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); | |
4153 | max_vector_size = MAX_VECTOR_UPL_SIZE; | |
4154 | } | |
4155 | io_start = io_size = io_req_size; | |
4156 | ||
4157 | /* | |
4158 | * First look for pages already in the cache | |
4159 | * and move them to user space. But only do this | |
4160 | * check if we are not retrieving encrypted data directly | |
4161 | * from the filesystem; those blocks should never | |
4162 | * be in the UBC. | |
4163 | * | |
4164 | * cluster_copy_ubc_data returns the resid | |
4165 | * in io_size | |
4166 | */ | |
4167 | if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { | |
4168 | retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); | |
4169 | } | |
4170 | /* | |
4171 | * calculate the number of bytes actually copied | |
4172 | * starting size - residual | |
4173 | */ | |
4174 | xsize = io_start - io_size; | |
4175 | ||
4176 | io_req_size -= xsize; | |
4177 | ||
4178 | if(useVectorUPL && (xsize || (iov_base & PAGE_MASK))) { | |
4179 | /* | |
4180 | * We found something in the cache or we have an iov_base that's not | |
4181 | * page-aligned. | |
4182 | * | |
4183 | * Issue all I/O's that have been collected within this Vectored UPL. | |
4184 | */ | |
4185 | if(vector_upl_index) { | |
4186 | retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
4187 | reset_vector_run_state(); | |
4188 | } | |
4189 | ||
4190 | if(xsize) | |
4191 | useVectorUPL = 0; | |
4192 | ||
4193 | /* | |
4194 | * After this point, if we are using the Vector UPL path and the base is | |
4195 | * not page-aligned then the UPL with that base will be the first in the vector UPL. | |
4196 | */ | |
4197 | } | |
4198 | ||
4199 | /* | |
4200 | * check to see if we are finished with this request. | |
4201 | * | |
4202 | * If we satisfied this IO already, then io_req_size will be 0. | |
4203 | * Otherwise, see if the IO was mis-aligned and needs to go through | |
4204 | * the UBC to deal with the 'tail'. | |
4205 | * | |
4206 | */ | |
4207 | if (io_req_size == 0 || (misaligned)) { | |
4208 | /* | |
4209 | * see if there's another uio vector to | |
4210 | * process that's of type IO_DIRECT | |
4211 | * | |
4212 | * break out of while loop to get there | |
4213 | */ | |
4214 | break; | |
4215 | } | |
4216 | /* | |
4217 | * assume the request ends on a device block boundary | |
4218 | */ | |
4219 | io_min = devblocksize; | |
4220 | ||
4221 | /* | |
4222 | * we can handle I/O's in multiples of the device block size | |
4223 | * however, if io_size isn't a multiple of devblocksize we | |
4224 | * want to clip it back to the nearest page boundary since | |
4225 | * we are going to have to go through cluster_read_copy to | |
4226 | * deal with the 'overhang'... by clipping it to a PAGE_SIZE | |
4227 | * multiple, we avoid asking the drive for the same physical | |
4228 | * blocks twice... once for the partial page at the end of the | |
4229 | * request and a 2nd time for the page we read into the cache | |
4230 | * (which overlaps the end of the direct read) in order to | |
4231 | * get at the overhang bytes | |
4232 | */ | |
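/*
 * illustrative example (assuming 4K pages): with devblocksize == 4096
 * and io_size == 10000, the encrypted case rounds io_size up to 12288
 * while the normal case clips it down to 8192 and leaves the tail for
 * cluster_read_copy to finish
 */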
4233 | if (io_size & (devblocksize - 1)) { | |
4234 | if (flags & IO_ENCRYPTED) { | |
4235 | /* | |
4236 | * Normally, we'd round down to the previous page boundary to | |
4237 | * let the UBC manage the zero-filling of the file past the EOF. | |
4238 | * But if we're doing encrypted IO, we can't let any of | |
4239 | * the data hit the UBC. This means we have to do the full | |
4240 | * IO to the upper block boundary of the device block that | |
4241 | * contains the EOF. The user will be responsible for not | |
4242 | * interpreting data PAST the EOF in its buffer. | |
4243 | * | |
4244 | * So just bump the IO back up to a multiple of devblocksize | |
4245 | */ | |
4246 | io_size = ((io_size + devblocksize) & ~(devblocksize - 1)); | |
4247 | io_min = io_size; | |
4248 | } | |
4249 | else { | |
4250 | /* | |
4251 | * Clip the request to the previous page size boundary | |
4252 | * since request does NOT end on a device block boundary | |
4253 | */ | |
4254 | io_size &= ~PAGE_MASK; | |
4255 | io_min = PAGE_SIZE; | |
4256 | } | |
4257 | ||
4258 | } | |
4259 | if (retval || io_size < io_min) { | |
4260 | /* | |
4261 | * either an error or we only have the tail left to | |
4262 | * complete via the copy path... | |
4263 | * we may have already spun some portion of this request | |
4264 | * off as async requests... we need to wait for the I/O | |
4265 | * to complete before returning | |
4266 | */ | |
4267 | goto wait_for_dreads; | |
4268 | } | |
4269 | ||
4270 | /* | |
4271 | * Don't re-check the UBC data if we are looking for uncached IO | |
4272 | * or asking for encrypted blocks. | |
4273 | */ | |
4274 | if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) { | |
4275 | ||
4276 | if ((xsize = io_size) > max_rd_size) | |
4277 | xsize = max_rd_size; | |
4278 | ||
4279 | io_size = 0; | |
4280 | ||
4281 | ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); | |
4282 | ||
4283 | if (io_size == 0) { | |
4284 | /* | |
4285 | * a page must have just come into the cache | |
4286 | * since the first page in this range is no | |
4287 | * longer absent, go back and re-evaluate | |
4288 | */ | |
4289 | continue; | |
4290 | } | |
4291 | } | |
4292 | if ( (flags & IO_RETURN_ON_THROTTLE) ) { | |
4293 | if (cluster_is_throttled(vp) == THROTTLE_NOW) { | |
4294 | if ( !cluster_io_present_in_BC(vp, uio->uio_offset)) { | |
4295 | /* | |
4296 | * we're in the throttle window and at least 1 I/O | |
4297 | * has already been issued by a throttleable thread | |
4298 | * in this window, so return with EAGAIN to indicate | |
4299 | * to the FS issuing the cluster_read call that it | |
4300 | * should now throttle after dropping any locks | |
4301 | */ | |
4302 | throttle_info_update_by_mount(vp->v_mount); | |
4303 | ||
4304 | io_throttled = TRUE; | |
4305 | goto wait_for_dreads; | |
4306 | } | |
4307 | } | |
4308 | } | |
4309 | if (io_size > max_rd_size) | |
4310 | io_size = max_rd_size; | |
4311 | ||
4312 | iov_base = uio_curriovbase(uio); | |
4313 | ||
4314 | upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); | |
4315 | upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK; | |
4316 | ||
4317 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START, | |
4318 | (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0); | |
4319 | ||
4320 | if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) | |
4321 | no_zero_fill = 1; | |
4322 | else | |
4323 | no_zero_fill = 0; | |
4324 | ||
4325 | for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) { | |
4326 | pages_in_pl = 0; | |
4327 | upl_size = upl_needed_size; | |
4328 | upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; | |
4329 | ||
4330 | if (no_zero_fill) | |
4331 | upl_flags |= UPL_NOZEROFILL; | |
4332 | if (force_data_sync) | |
4333 | upl_flags |= UPL_FORCE_DATA_SYNC; | |
4334 | ||
4335 | kret = vm_map_create_upl(current_map(), | |
4336 | (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), | |
4337 | &upl_size, &upl, NULL, &pages_in_pl, &upl_flags); | |
4338 | ||
4339 | if (kret != KERN_SUCCESS) { | |
4340 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, | |
4341 | (int)upl_offset, upl_size, io_size, kret, 0); | |
4342 | /* | |
4343 | * failed to get pagelist | |
4344 | * | |
4345 | * we may have already spun some portion of this request | |
4346 | * off as async requests... we need to wait for the I/O | |
4347 | * to complete before returning | |
4348 | */ | |
4349 | goto wait_for_dreads; | |
4350 | } | |
4351 | pages_in_pl = upl_size / PAGE_SIZE; | |
4352 | pl = UPL_GET_INTERNAL_PAGE_LIST(upl); | |
4353 | ||
4354 | for (i = 0; i < pages_in_pl; i++) { | |
4355 | if (!upl_page_present(pl, i)) | |
4356 | break; | |
4357 | } | |
4358 | if (i == pages_in_pl) | |
4359 | break; | |
4360 | ||
4361 | ubc_upl_abort(upl, 0); | |
4362 | } | |
4363 | if (force_data_sync >= 3) { | |
4364 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, | |
4365 | (int)upl_offset, upl_size, io_size, kret, 0); | |
4366 | ||
4367 | goto wait_for_dreads; | |
4368 | } | |
4369 | /* | |
4370 | * Consider the possibility that upl_size wasn't satisfied. | |
4371 | */ | |
4372 | if (upl_size < upl_needed_size) { | |
4373 | if (upl_size && upl_offset == 0) | |
4374 | io_size = upl_size; | |
4375 | else | |
4376 | io_size = 0; | |
4377 | } | |
4378 | if (io_size == 0) { | |
4379 | ubc_upl_abort(upl, 0); | |
4380 | goto wait_for_dreads; | |
4381 | } | |
4382 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END, | |
4383 | (int)upl_offset, upl_size, io_size, kret, 0); | |
4384 | ||
4385 | if(useVectorUPL) { | |
4386 | vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK); | |
4387 | if(end_off) | |
4388 | issueVectorUPL = 1; | |
4389 | /* | |
4390 | * After this point, if we are using a vector UPL, then | |
4391 | * either all the UPL elements end on a page boundary OR | |
4392 | * this UPL is the last element because it does not end | |
4393 | * on a page boundary. | |
4394 | */ | |
4395 | } | |
4396 | ||
4397 | /* | |
4398 | * request asynchronously so that we can overlap | |
4399 | * the preparation of the next I/O | |
4400 | * if there are already too many outstanding reads | |
4401 | * wait until some have completed before issuing the next read | |
4402 | */ | |
4403 | cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); | |
4404 | ||
4405 | if (iostate.io_error) { | |
4406 | /* | |
4407 | * one of the earlier reads we issued ran into a hard error | |
4408 | * don't issue any more reads, cleanup the UPL | |
4409 | * that was just created but not used, then | |
4410 | * go wait for any other reads to complete before | |
4411 | * returning the error to the caller | |
4412 | */ | |
4413 | ubc_upl_abort(upl, 0); | |
4414 | ||
4415 | goto wait_for_dreads; | |
4416 | } | |
4417 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START, | |
4418 | upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0); | |
4419 | ||
4420 | ||
4421 | if(!useVectorUPL) { | |
4422 | if (no_zero_fill) | |
4423 | io_flag &= ~CL_PRESERVE; | |
4424 | else | |
4425 | io_flag |= CL_PRESERVE; | |
4426 | ||
4427 | retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
4428 | ||
4429 | } else { | |
4430 | ||
4431 | if(!vector_upl_index) { | |
4432 | vector_upl = vector_upl_create(upl_offset); | |
4433 | v_upl_uio_offset = uio->uio_offset; | |
4434 | vector_upl_offset = upl_offset; | |
4435 | } | |
4436 | ||
4437 | vector_upl_set_subupl(vector_upl,upl, upl_size); | |
4438 | vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size); | |
4439 | vector_upl_index++; | |
4440 | vector_upl_size += upl_size; | |
4441 | vector_upl_iosize += io_size; | |
4442 | ||
4443 | if(issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) { | |
4444 | retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
4445 | reset_vector_run_state(); | |
4446 | } | |
4447 | } | |
4448 | last_iov_base = iov_base + io_size; | |
4449 | ||
4450 | /* | |
4451 | * update the uio structure | |
4452 | */ | |
4453 | if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) { | |
4454 | uio_update(uio, (user_size_t)max_io_size); | |
4455 | } | |
4456 | else { | |
4457 | uio_update(uio, (user_size_t)io_size); | |
4458 | } | |
4459 | /* | |
4460 | * Under normal circumstances, the io_size should not be | |
4461 | * bigger than the io_req_size, but we may have had to round up | |
4462 | * to the end of the page in the encrypted IO case. In that case only, | |
4463 | * ensure that we only decrement io_req_size to 0. | |
4464 | */ | |
4465 | if ((flags & IO_ENCRYPTED) && (io_size > io_req_size)) { | |
4466 | io_req_size = 0; | |
4467 | } | |
4468 | else { | |
4469 | io_req_size -= io_size; | |
4470 | } | |
4471 | ||
4472 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END, | |
4473 | upl, (int)uio->uio_offset, io_req_size, retval, 0); | |
4474 | ||
4475 | } /* end while */ | |
4476 | ||
4477 | if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) { | |
4478 | ||
4479 | retval = cluster_io_type(uio, read_type, read_length, 0); | |
4480 | ||
4481 | if (retval == 0 && *read_type == IO_DIRECT) { | |
4482 | ||
4483 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE, | |
4484 | (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0); | |
4485 | ||
4486 | goto next_dread; | |
4487 | } | |
4488 | } | |
4489 | ||
4490 | wait_for_dreads: | |
4491 | ||
4492 | if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { | |
4493 | retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); | |
4494 | reset_vector_run_state(); | |
4495 | } | |
4496 | /* | |
4497 | * make sure all async reads that are part of this stream | |
4498 | * have completed before we return | |
4499 | */ | |
4500 | cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); | |
4501 | ||
4502 | if (iostate.io_error) | |
4503 | retval = iostate.io_error; | |
4504 | ||
4505 | lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); | |
4506 | ||
4507 | if (io_throttled == TRUE && retval == 0) | |
4508 | retval = EAGAIN; | |
4509 | ||
4510 | for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) { | |
4511 | /* | |
4512 | * This is specifically done for pmap accounting purposes. | |
4513 | * vm_pre_fault() will call vm_fault() to enter the page into | |
4514 | * the pmap if there isn't _a_ physical page for that VA already. | |
4515 | */ | |
4516 | vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK)); | |
4517 | } | |
4518 | ||
4519 | if (io_req_size && retval == 0) { | |
4520 | /* | |
4521 | * we couldn't handle the tail of this request in DIRECT mode | |
4522 | * so fire it through the copy path | |
4523 | */ | |
4524 | retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg); | |
4525 | ||
4526 | *read_type = IO_UNKNOWN; | |
4527 | } | |
4528 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END, | |
4529 | (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0); | |
4530 | ||
4531 | return (retval); | |
4532 | } | |
4533 | ||
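The pre-fault loop near the end of cluster_read_direct() above walks the user buffer one page stride at a time, truncating each address to its page boundary before entering it into the pmap. A minimal user-space sketch of that walk, assuming 4 KiB pages; vm_map_trunc_page() and vm_pre_fault() themselves are not modeled, and prefault_range() is a hypothetical name:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096ull                                  /* assumed page size */

/* step through [start, end) one page stride at a time, as the loop above does */
static void prefault_range(uint64_t start, uint64_t end)
{
        uint64_t addr;

        for (addr = start; addr < end; addr += PAGE_SZ) {
                uint64_t page = addr & ~(PAGE_SZ - 1);   /* vm_map_trunc_page() analogue */
                printf("would pre-fault page 0x%llx\n", (unsigned long long)page);
        }
}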
4534 | ||
4535 | static int | |
4536 | cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length, | |
4537 | int (*callback)(buf_t, void *), void *callback_arg, int flags) | |
4538 | { | |
4539 | upl_page_info_t *pl; | |
4540 | upl_t upl[MAX_VECTS]; | |
4541 | vm_offset_t upl_offset; | |
4542 | addr64_t dst_paddr = 0; | |
4543 | user_addr_t iov_base; | |
4544 | off_t max_size; | |
4545 | upl_size_t upl_size; | |
4546 | vm_size_t upl_needed_size; | |
4547 | mach_msg_type_number_t pages_in_pl; | |
4548 | int upl_flags; | |
4549 | kern_return_t kret; | |
4550 | struct clios iostate; | |
4551 | int error= 0; | |
4552 | int cur_upl = 0; | |
4553 | int num_upl = 0; | |
4554 | int n; | |
4555 | u_int32_t xsize; | |
4556 | u_int32_t io_size; | |
4557 | u_int32_t devblocksize; | |
4558 | u_int32_t mem_alignment_mask; | |
4559 | u_int32_t tail_size = 0; | |
4560 | int bflag; | |
4561 | ||
4562 | if (flags & IO_PASSIVE) | |
4563 | bflag = CL_PASSIVE; | |
4564 | else | |
4565 | bflag = 0; | |
4566 | ||
4567 | if (flags & IO_NOCACHE) | |
4568 | bflag |= CL_NOCACHE; | |
4569 | ||
4570 | /* | |
4571 | * When we enter this routine, we know | |
4572 | * -- the read_length will not exceed the current iov_len | |
4573 | * -- the target address is physically contiguous for read_length | |
4574 | */ | |
4575 | cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC); | |
4576 | ||
4577 | devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; | |
4578 | mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; | |
4579 | ||
4580 | iostate.io_completed = 0; | |
4581 | iostate.io_issued = 0; | |
4582 | iostate.io_error = 0; | |
4583 | iostate.io_wanted = 0; | |
4584 | ||
4585 | lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); | |
4586 | ||
4587 | next_cread: | |
4588 | io_size = *read_length; | |
4589 | ||
4590 | max_size = filesize - uio->uio_offset; | |
4591 | ||
4592 | if (io_size > max_size) | |
4593 | io_size = max_size; | |
4594 | ||
4595 | iov_base = uio_curriovbase(uio); | |
4596 | ||
4597 | upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); | |
4598 | upl_needed_size = upl_offset + io_size; | |
4599 | ||
4600 | pages_in_pl = 0; | |
4601 | upl_size = upl_needed_size; | |
4602 | upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE; | |
4603 | ||
4604 | ||
4605 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START, | |
4606 | (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0); | |
4607 | ||
4608 | kret = vm_map_get_upl(current_map(), | |
4609 | (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), | |
4610 | &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, 0); | |
4611 | ||
4612 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END, | |
4613 | (int)upl_offset, upl_size, io_size, kret, 0); | |
4614 | ||
4615 | if (kret != KERN_SUCCESS) { | |
4616 | /* | |
4617 | * failed to get pagelist | |
4618 | */ | |
4619 | error = EINVAL; | |
4620 | goto wait_for_creads; | |
4621 | } | |
4622 | num_upl++; | |
4623 | ||
4624 | if (upl_size < upl_needed_size) { | |
4625 | /* | |
4626 | * The upl_size wasn't satisfied. | |
4627 | */ | |
4628 | error = EINVAL; | |
4629 | goto wait_for_creads; | |
4630 | } | |
4631 | pl = ubc_upl_pageinfo(upl[cur_upl]); | |
4632 | ||
4633 | dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset; | |
4634 | ||
4635 | while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { | |
4636 | u_int32_t head_size; | |
4637 | ||
4638 | head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1)); | |
4639 | ||
4640 | if (head_size > io_size) | |
4641 | head_size = io_size; | |
4642 | ||
4643 | error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg); | |
4644 | ||
4645 | if (error) | |
4646 | goto wait_for_creads; | |
4647 | ||
4648 | upl_offset += head_size; | |
4649 | dst_paddr += head_size; | |
4650 | io_size -= head_size; | |
4651 | ||
4652 | iov_base += head_size; | |
4653 | } | |
4654 | if ((u_int32_t)iov_base & mem_alignment_mask) { | |
4655 | /* | |
4656 | * request isn't aligned to a memory boundary | |
4657 | * the underlying DMA engine can handle... | |
4658 | * return an error instead of going through | |
4659 | * the slow copy path since the intent of this | |
4660 | * path is direct I/O to device memory | |
4661 | */ | |
4662 | error = EINVAL; | |
4663 | goto wait_for_creads; | |
4664 | } | |
4665 | ||
4666 | tail_size = io_size & (devblocksize - 1); | |
4667 | ||
4668 | io_size -= tail_size; | |
4669 | ||
4670 | while (io_size && error == 0) { | |
4671 | ||
4672 | if (io_size > MAX_IO_CONTIG_SIZE) | |
4673 | xsize = MAX_IO_CONTIG_SIZE; | |
4674 | else | |
4675 | xsize = io_size; | |
4676 | /* | |
4677 | * request asynchronously so that we can overlap | |
4678 | * the preparation of the next I/O... we'll do | |
4679 | * the commit after all the I/O has completed | |
4680 | * since it's all issued against the same UPL... | |
4681 | * if there are already too many outstanding reads, | |
4682 | * wait until some have completed before issuing the next | |
4683 | */ | |
4684 | cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); | |
4685 | ||
4686 | if (iostate.io_error) { | |
4687 | /* | |
4688 | * one of the earlier reads we issued ran into a hard error | |
4689 | * don't issue any more reads... | |
4690 | * go wait for any other reads to complete before | |
4691 | * returning the error to the caller | |
4692 | */ | |
4693 | goto wait_for_creads; | |
4694 | } | |
4695 | error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize, | |
4696 | CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag, | |
4697 | (buf_t)NULL, &iostate, callback, callback_arg); | |
4698 | /* | |
4699 | * The cluster_io read was issued successfully, | |
4700 | * update the uio structure | |
4701 | */ | |
4702 | if (error == 0) { | |
4703 | uio_update(uio, (user_size_t)xsize); | |
4704 | ||
4705 | dst_paddr += xsize; | |
4706 | upl_offset += xsize; | |
4707 | io_size -= xsize; | |
4708 | } | |
4709 | } | |
4710 | if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) { | |
4711 | ||
4712 | error = cluster_io_type(uio, read_type, read_length, 0); | |
4713 | ||
4714 | if (error == 0 && *read_type == IO_CONTIG) { | |
4715 | cur_upl++; | |
4716 | goto next_cread; | |
4717 | } | |
4718 | } else | |
4719 | *read_type = IO_UNKNOWN; | |
4720 | ||
4721 | wait_for_creads: | |
4722 | /* | |
4723 | * make sure all async reads that are part of this stream | |
4724 | * have completed before we proceed | |
4725 | */ | |
4726 | cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); | |
4727 | ||
4728 | if (iostate.io_error) | |
4729 | error = iostate.io_error; | |
4730 | ||
4731 | lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); | |
4732 | ||
4733 | if (error == 0 && tail_size) | |
4734 | error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); | |
4735 | ||
4736 | for (n = 0; n < num_upl; n++) | |
4737 | /* | |
4738 | * just release our hold on each physically contiguous | |
4739 | * region without changing any state | |
4740 | */ | |
4741 | ubc_upl_abort(upl[n], 0); | |
4742 | ||
4743 | return (error); | |
4744 | } | |
4745 | ||
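cluster_read_contig() above peels a misaligned head off the request and holds back a misaligned tail so that the main loop only issues transfers aligned to the device block size. A small sketch of that split, assuming a power-of-two devblocksize and ignoring the corner case where the whole request is smaller than one device block; split_for_devblocks() is a hypothetical helper, not a kernel routine:

#include <stdint.h>

/* split [offset, offset + size) into an unaligned head, an aligned middle,
 * and an unaligned tail, relative to the device block size */
static void
split_for_devblocks(uint64_t offset, uint32_t size, uint32_t devblocksize,
                    uint32_t *head, uint32_t *middle, uint32_t *tail)
{
        *head = 0;
        if ((offset & (devblocksize - 1)) && size) {
                *head = devblocksize - (uint32_t)(offset & (devblocksize - 1));
                if (*head > size)
                        *head = size;
        }
        size -= *head;
        *tail = size & (devblocksize - 1);       /* leftover after the last full block */
        *middle = size - *tail;
}
/* e.g. offset=1000, size=4096, devblocksize=512 -> head=24, middle=3584, tail=488 */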
4746 | ||
4747 | static int | |
4748 | cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length) | |
4749 | { | |
4750 | user_size_t iov_len; | |
4751 | user_addr_t iov_base = 0; | |
4752 | upl_t upl; | |
4753 | upl_size_t upl_size; | |
4754 | int upl_flags; | |
4755 | int retval = 0; | |
4756 | ||
4757 | /* | |
4758 | * skip over any empty vectors | |
4759 | */ | |
4760 | uio_update(uio, (user_size_t)0); | |
4761 | ||
4762 | iov_len = uio_curriovlen(uio); | |
4763 | ||
4764 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0); | |
4765 | ||
4766 | if (iov_len) { | |
4767 | iov_base = uio_curriovbase(uio); | |
4768 | /* | |
4769 | * make sure the size of the vector isn't too big... | |
4770 | * internally, we want to handle all of the I/O in | |
4771 | * chunk sizes that fit in a 32 bit int | |
4772 | */ | |
4773 | if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) | |
4774 | upl_size = MAX_IO_REQUEST_SIZE; | |
4775 | else | |
4776 | upl_size = (u_int32_t)iov_len; | |
4777 | ||
4778 | upl_flags = UPL_QUERY_OBJECT_TYPE; | |
4779 | ||
4780 | if ((vm_map_get_upl(current_map(), | |
4781 | (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)), | |
4782 | &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) { | |
4783 | /* | |
4784 | * the user app must have passed in an invalid address | |
4785 | */ | |
4786 | retval = EFAULT; | |
4787 | } | |
4788 | if (upl_size == 0) | |
4789 | retval = EFAULT; | |
4790 | ||
4791 | *io_length = upl_size; | |
4792 | ||
4793 | if (upl_flags & UPL_PHYS_CONTIG) | |
4794 | *io_type = IO_CONTIG; | |
4795 | else if (iov_len >= min_length) | |
4796 | *io_type = IO_DIRECT; | |
4797 | else | |
4798 | *io_type = IO_COPY; | |
4799 | } else { | |
4800 | /* | |
4801 | * nothing left to do for this uio | |
4802 | */ | |
4803 | *io_length = 0; | |
4804 | *io_type = IO_UNKNOWN; | |
4805 | } | |
4806 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0); | |
4807 | ||
4808 | return (retval); | |
4809 | } | |
4810 | ||
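cluster_io_type() reduces the current uio vector to one of three strategies: IO_CONTIG when the backing memory is physically contiguous, IO_DIRECT when the vector is at least min_length bytes, and IO_COPY otherwise. A condensed sketch of just that decision, with the vm_map_get_upl()/UPL_PHYS_CONTIG query replaced by a hypothetical is_contig flag and locally renamed enumerators:

enum io_strategy { IO_COPY_X, IO_CONTIG_X, IO_DIRECT_X, IO_UNKNOWN_X };

static enum io_strategy
classify(unsigned long iov_len, int is_contig, unsigned long min_length)
{
        if (iov_len == 0)
                return IO_UNKNOWN_X;      /* nothing left in this uio */
        if (is_contig)
                return IO_CONTIG_X;       /* physically contiguous buffer */
        if (iov_len >= min_length)
                return IO_DIRECT_X;       /* big enough for the direct path */
        return IO_COPY_X;                 /* fall back to the cached copy path */
}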
4811 | ||
4812 | /* | |
4813 | * generate advisory I/O's in the largest chunks possible | |
4814 | * the completed pages will be released into the VM cache | |
4815 | */ | |
4816 | int | |
4817 | advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid) | |
4818 | { | |
4819 | return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE); | |
4820 | } | |
4821 | ||
4822 | int | |
4823 | advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag) | |
4824 | { | |
4825 | upl_page_info_t *pl; | |
4826 | upl_t upl; | |
4827 | vm_offset_t upl_offset; | |
4828 | int upl_size; | |
4829 | off_t upl_f_offset; | |
4830 | int start_offset; | |
4831 | int start_pg; | |
4832 | int last_pg; | |
4833 | int pages_in_upl; | |
4834 | off_t max_size; | |
4835 | int io_size; | |
4836 | kern_return_t kret; | |
4837 | int retval = 0; | |
4838 | int issued_io; | |
4839 | int skip_range; | |
4840 | uint32_t max_io_size; | |
4841 | ||
4842 | ||
4843 | if ( !UBCINFOEXISTS(vp)) | |
4844 | return(EINVAL); | |
4845 | ||
4846 | if (resid < 0) | |
4847 | return(EINVAL); | |
4848 | ||
4849 | max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); | |
4850 | ||
4851 | if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) { | |
4852 | if (max_io_size > speculative_prefetch_max_iosize) | |
4853 | max_io_size = speculative_prefetch_max_iosize; | |
4854 | } | |
4855 | ||
4856 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START, | |
4857 | (int)f_offset, resid, (int)filesize, 0, 0); | |
4858 | ||
4859 | while (resid && f_offset < filesize && retval == 0) { | |
4860 | /* | |
4861 | * compute the size of the upl needed to encompass | |
4862 | * the requested read... limit each call to cluster_io | |
4863 | * to the maximum UPL size... cluster_io will clip if | |
4864 | * this exceeds the maximum io_size for the device... | |
4865 | * make sure to account for | |
4866 | * a starting offset that's not page aligned | |
4867 | */ | |
4868 | start_offset = (int)(f_offset & PAGE_MASK_64); | |
4869 | upl_f_offset = f_offset - (off_t)start_offset; | |
4870 | max_size = filesize - f_offset; | |
4871 | ||
4872 | if (resid < max_size) | |
4873 | io_size = resid; | |
4874 | else | |
4875 | io_size = max_size; | |
4876 | ||
4877 | upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK; | |
4878 | if ((uint32_t)upl_size > max_io_size) | |
4879 | upl_size = max_io_size; | |
4880 | ||
4881 | skip_range = 0; | |
4882 | /* | |
4883 | * return the number of contiguously present pages in the cache | |
4884 | * starting at upl_f_offset within the file | |
4885 | */ | |
4886 | ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range); | |
4887 | ||
4888 | if (skip_range) { | |
4889 | /* | |
4890 | * skip over pages already present in the cache | |
4891 | */ | |
4892 | io_size = skip_range - start_offset; | |
4893 | ||
4894 | f_offset += io_size; | |
4895 | resid -= io_size; | |
4896 | ||
4897 | if (skip_range == upl_size) | |
4898 | continue; | |
4899 | /* | |
4900 | * have to issue some real I/O | |
4901 | * at this point, we know it's starting on a page boundary | |
4902 | * because we've skipped over at least the first page in the request | |
4903 | */ | |
4904 | start_offset = 0; | |
4905 | upl_f_offset += skip_range; | |
4906 | upl_size -= skip_range; | |
4907 | } | |
4908 | pages_in_upl = upl_size / PAGE_SIZE; | |
4909 | ||
4910 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START, | |
4911 | upl, (int)upl_f_offset, upl_size, start_offset, 0); | |
4912 | ||
4913 | kret = ubc_create_upl(vp, | |
4914 | upl_f_offset, | |
4915 | upl_size, | |
4916 | &upl, | |
4917 | &pl, | |
4918 | UPL_RET_ONLY_ABSENT | UPL_SET_LITE); | |
4919 | if (kret != KERN_SUCCESS) | |
4920 | return(retval); | |
4921 | issued_io = 0; | |
4922 | ||
4923 | /* | |
4924 | * before we start marching forward, we must make sure we end on | |
4925 | * a present page, otherwise we will be working with a freed | |
4926 | * upl | |
4927 | */ | |
4928 | for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { | |
4929 | if (upl_page_present(pl, last_pg)) | |
4930 | break; | |
4931 | } | |
4932 | pages_in_upl = last_pg + 1; | |
4933 | ||
4934 | ||
4935 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END, | |
4936 | upl, (int)upl_f_offset, upl_size, start_offset, 0); | |
4937 | ||
4938 | ||
4939 | for (last_pg = 0; last_pg < pages_in_upl; ) { | |
4940 | /* | |
4941 | * scan from the beginning of the upl looking for the first | |
4942 | * page that is present.... this will become the first page in | |
4943 | * the request we're going to make to 'cluster_io'... if all | |
4944 | * of the pages are absent, we won't call through to 'cluster_io' | |
4945 | */ | |
4946 | for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { | |
4947 | if (upl_page_present(pl, start_pg)) | |
4948 | break; | |
4949 | } | |
4950 | ||
4951 | /* | |
4952 | * scan from the starting present page looking for an absent | |
4953 | * page before the end of the upl is reached, if we | |
4954 | * find one, then it will terminate the range of pages being | |
4955 | * presented to 'cluster_io' | |
4956 | */ | |
4957 | for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { | |
4958 | if (!upl_page_present(pl, last_pg)) | |
4959 | break; | |
4960 | } | |
4961 | ||
4962 | if (last_pg > start_pg) { | |
4963 | /* | |
4964 | * we found a range of pages that must be filled | |
4965 | * if the last page in this range is the last page of the file | |
4966 | * we may have to clip the size of it to keep from reading past | |
4967 | * the end of the last physical block associated with the file | |
4968 | */ | |
4969 | upl_offset = start_pg * PAGE_SIZE; | |
4970 | io_size = (last_pg - start_pg) * PAGE_SIZE; | |
4971 | ||
4972 | if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) | |
4973 | io_size = filesize - (upl_f_offset + upl_offset); | |
4974 | ||
4975 | /* | |
4976 | * issue an asynchronous read to cluster_io | |
4977 | */ | |
4978 | retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, | |
4979 | CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
4980 | ||
4981 | issued_io = 1; | |
4982 | } | |
4983 | } | |
4984 | if (issued_io == 0) | |
4985 | ubc_upl_abort(upl, 0); | |
4986 | ||
4987 | io_size = upl_size - start_offset; | |
4988 | ||
4989 | if (io_size > resid) | |
4990 | io_size = resid; | |
4991 | f_offset += io_size; | |
4992 | resid -= io_size; | |
4993 | } | |
4994 | ||
4995 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END, | |
4996 | (int)f_offset, resid, retval, 0, 0); | |
4997 | ||
4998 | return(retval); | |
4999 | } | |
5000 | ||
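advisory_read_ext() sizes each UPL by backing the file offset down to a page boundary, rounding the request up to whole pages, and clipping to the per-device maximum. A sketch of that arithmetic, assuming 4 KiB pages; upl_size_for() is a hypothetical helper:

#include <stdint.h>

#define PAGE_SZ 4096u                     /* assumed page size */

/* compute the UPL byte size for io_size bytes starting at f_offset, clipped
 * to max_io_size; *start_offset receives the offset into the first page */
static uint32_t
upl_size_for(uint64_t f_offset, uint32_t io_size, uint32_t max_io_size,
             uint32_t *start_offset)
{
        uint32_t upl_size;

        *start_offset = (uint32_t)(f_offset & (PAGE_SZ - 1));
        upl_size = (*start_offset + io_size + (PAGE_SZ - 1)) & ~(PAGE_SZ - 1);

        if (upl_size > max_io_size)
                upl_size = max_io_size;
        return upl_size;
}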
5001 | ||
5002 | int | |
5003 | cluster_push(vnode_t vp, int flags) | |
5004 | { | |
5005 | return cluster_push_ext(vp, flags, NULL, NULL); | |
5006 | } | |
5007 | ||
5008 | ||
5009 | int | |
5010 | cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
5011 | { | |
5012 | int retval; | |
5013 | int my_sparse_wait = 0; | |
5014 | struct cl_writebehind *wbp; | |
5015 | ||
5016 | if ( !UBCINFOEXISTS(vp)) { | |
5017 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -1, 0); | |
5018 | return (0); | |
5019 | } | |
5020 | /* return if deferred write is set */ | |
5021 | if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) { | |
5022 | return (0); | |
5023 | } | |
5024 | if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) { | |
5025 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -2, 0); | |
5026 | return (0); | |
5027 | } | |
5028 | if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) { | |
5029 | lck_mtx_unlock(&wbp->cl_lockw); | |
5030 | ||
5031 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, vp, flags, 0, -3, 0); | |
5032 | return(0); | |
5033 | } | |
5034 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START, | |
5035 | wbp->cl_scmap, wbp->cl_number, flags, 0, 0); | |
5036 | ||
5037 | /* | |
5038 | * if we have an fsync in progress, we don't want to allow any additional | |
5039 | * sync/fsync/close(s) to occur until it finishes. | |
5040 | * note that it's possible for writes to continue to occur to this file | |
5041 | * while we're waiting and also once the fsync starts to clean if we're | |
5042 | * in the sparse map case | |
5043 | */ | |
5044 | while (wbp->cl_sparse_wait) { | |
5045 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, vp, 0, 0, 0, 0); | |
5046 | ||
5047 | msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); | |
5048 | ||
5049 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, vp, 0, 0, 0, 0); | |
5050 | } | |
5051 | if (flags & IO_SYNC) { | |
5052 | my_sparse_wait = 1; | |
5053 | wbp->cl_sparse_wait = 1; | |
5054 | ||
5055 | /* | |
5056 | * this is an fsync (or equivalent)... we must wait for any existing async | |
5057 | * cleaning operations to complete before we evaluate the current state | |
5058 | * and finish cleaning... this ensures that all writes issued before this | |
5059 | * fsync actually get cleaned to the disk before this fsync returns | |
5060 | */ | |
5061 | while (wbp->cl_sparse_pushes) { | |
5062 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, vp, 0, 0, 0, 0); | |
5063 | ||
5064 | msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL); | |
5065 | ||
5066 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, vp, 0, 0, 0, 0); | |
5067 | } | |
5068 | } | |
5069 | if (wbp->cl_scmap) { | |
5070 | void *scmap; | |
5071 | ||
5072 | if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) { | |
5073 | ||
5074 | scmap = wbp->cl_scmap; | |
5075 | wbp->cl_scmap = NULL; | |
5076 | ||
5077 | wbp->cl_sparse_pushes++; | |
5078 | ||
5079 | lck_mtx_unlock(&wbp->cl_lockw); | |
5080 | ||
5081 | sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); | |
5082 | ||
5083 | lck_mtx_lock(&wbp->cl_lockw); | |
5084 | ||
5085 | wbp->cl_sparse_pushes--; | |
5086 | ||
5087 | if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) | |
5088 | wakeup((caddr_t)&wbp->cl_sparse_pushes); | |
5089 | } else { | |
5090 | sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); | |
5091 | } | |
5092 | retval = 1; | |
5093 | } else { | |
5094 | retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg); | |
5095 | } | |
5096 | lck_mtx_unlock(&wbp->cl_lockw); | |
5097 | ||
5098 | if (flags & IO_SYNC) | |
5099 | (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push"); | |
5100 | ||
5101 | if (my_sparse_wait) { | |
5102 | /* | |
5103 | * I'm the owner of the serialization token | |
5104 | * clear it and wakeup anyone that is waiting | |
5105 | * for me to finish | |
5106 | */ | |
5107 | lck_mtx_lock(&wbp->cl_lockw); | |
5108 | ||
5109 | wbp->cl_sparse_wait = 0; | |
5110 | wakeup((caddr_t)&wbp->cl_sparse_wait); | |
5111 | ||
5112 | lck_mtx_unlock(&wbp->cl_lockw); | |
5113 | } | |
5114 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END, | |
5115 | wbp->cl_scmap, wbp->cl_number, retval, 0, 0); | |
5116 | ||
5117 | return (retval); | |
5118 | } | |
5119 | ||
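The IO_SYNC path in cluster_push_ext() uses a simple ownership token: wait until any earlier fsync releases cl_sparse_wait, claim it, then drain the count of in-flight sparse pushes before evaluating the clusters. The same handshake expressed with pthreads, purely as an illustration (the kernel uses msleep()/wakeup() on the write-behind lock, not pthreads; the names below are hypothetical):

#include <pthread.h>

static pthread_mutex_t wb_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wait_cv   = PTHREAD_COND_INITIALIZER;
static pthread_cond_t  pushes_cv = PTHREAD_COND_INITIALIZER;
static int sparse_wait;                  /* mirrors wbp->cl_sparse_wait */
static int sparse_pushes;                /* mirrors wbp->cl_sparse_pushes */

static void fsync_begin(void)
{
        pthread_mutex_lock(&wb_lock);
        while (sparse_wait)              /* an earlier fsync still owns the token */
                pthread_cond_wait(&wait_cv, &wb_lock);
        sparse_wait = 1;                 /* claim ownership */
        while (sparse_pushes)            /* drain async pushes already in flight */
                pthread_cond_wait(&pushes_cv, &wb_lock);
        pthread_mutex_unlock(&wb_lock);
}

static void fsync_end(void)
{
        pthread_mutex_lock(&wb_lock);
        sparse_wait = 0;                 /* release the token */
        pthread_cond_broadcast(&wait_cv);
        pthread_mutex_unlock(&wb_lock);
}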
5120 | ||
5121 | __private_extern__ void | |
5122 | cluster_release(struct ubc_info *ubc) | |
5123 | { | |
5124 | struct cl_writebehind *wbp; | |
5125 | struct cl_readahead *rap; | |
5126 | ||
5127 | if ((wbp = ubc->cl_wbehind)) { | |
5128 | ||
5129 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0); | |
5130 | ||
5131 | if (wbp->cl_scmap) | |
5132 | vfs_drt_control(&(wbp->cl_scmap), 0); | |
5133 | } else { | |
5134 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0); | |
5135 | } | |
5136 | ||
5137 | rap = ubc->cl_rahead; | |
5138 | ||
5139 | if (wbp != NULL) { | |
5140 | lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp); | |
5141 | FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND); | |
5142 | } | |
5143 | if ((rap = ubc->cl_rahead)) { | |
5144 | lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp); | |
5145 | FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD); | |
5146 | } | |
5147 | ubc->cl_rahead = NULL; | |
5148 | ubc->cl_wbehind = NULL; | |
5149 | ||
5150 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0); | |
5151 | } | |
5152 | ||
5153 | ||
5154 | static int | |
5155 | cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) | |
5156 | { | |
5157 | int cl_index; | |
5158 | int cl_index1; | |
5159 | int min_index; | |
5160 | int cl_len; | |
5161 | int cl_pushed = 0; | |
5162 | struct cl_wextent l_clusters[MAX_CLUSTERS]; | |
5163 | u_int max_cluster_pgcount; | |
5164 | ||
5165 | ||
5166 | max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE; | |
5167 | /* | |
5168 | * the write behind context exists and has | |
5169 | * already been locked... | |
5170 | */ | |
5171 | if (wbp->cl_number == 0) | |
5172 | /* | |
5173 | * no clusters to push | |
5174 | * return number of empty slots | |
5175 | */ | |
5176 | return (MAX_CLUSTERS); | |
5177 | ||
5178 | /* | |
5179 | * make a local 'sorted' copy of the clusters | |
5180 | * and clear wbp->cl_number so that new clusters can | |
5181 | * be developed | |
5182 | */ | |
5183 | for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { | |
5184 | for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) { | |
5185 | if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) | |
5186 | continue; | |
5187 | if (min_index == -1) | |
5188 | min_index = cl_index1; | |
5189 | else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) | |
5190 | min_index = cl_index1; | |
5191 | } | |
5192 | if (min_index == -1) | |
5193 | break; | |
5194 | ||
5195 | l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr; | |
5196 | l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr; | |
5197 | l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags; | |
5198 | ||
5199 | wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr; | |
5200 | } | |
5201 | wbp->cl_number = 0; | |
5202 | ||
5203 | cl_len = cl_index; | |
5204 | ||
5205 | if ( (push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS ) { | |
5206 | int i; | |
5207 | ||
5208 | /* | |
5209 | * determine if we appear to be writing the file sequentially | |
5210 | * if not, by returning without having pushed any clusters | |
5211 | * we will cause this vnode to be pushed into the sparse cluster mechanism | |
5212 | * used for managing more random I/O patterns | |
5213 | * | |
5214 | * we know that we've got all clusters currently in use and the next write doesn't fit into one of them... | |
5215 | * that's why we're in try_push with PUSH_DELAY... | |
5216 | * | |
5217 | * check to make sure that all the clusters except the last one are 'full'... and that each cluster | |
5218 | * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above | |
5219 | * so we can just make a simple pass through, up to, but not including the last one... | |
5220 | * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they | |
5221 | * are sequential | |
5222 | * | |
5223 | * we let the last one be partial as long as it was adjacent to the previous one... | |
5224 | * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out | |
5225 | * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world... | |
5226 | */ | |
5227 | for (i = 0; i < MAX_CLUSTERS - 1; i++) { | |
5228 | if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) | |
5229 | goto dont_try; | |
5230 | if (l_clusters[i].e_addr != l_clusters[i+1].b_addr) | |
5231 | goto dont_try; | |
5232 | } | |
5233 | } | |
5234 | for (cl_index = 0; cl_index < cl_len; cl_index++) { | |
5235 | int flags; | |
5236 | struct cl_extent cl; | |
5237 | ||
5238 | flags = io_flags & (IO_PASSIVE|IO_CLOSE); | |
5239 | ||
5240 | /* | |
5241 | * try to push each cluster in turn... | |
5242 | */ | |
5243 | if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) | |
5244 | flags |= IO_NOCACHE; | |
5245 | ||
5246 | if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) | |
5247 | flags |= IO_PASSIVE; | |
5248 | ||
5249 | if (push_flag & PUSH_SYNC) | |
5250 | flags |= IO_SYNC; | |
5251 | ||
5252 | cl.b_addr = l_clusters[cl_index].b_addr; | |
5253 | cl.e_addr = l_clusters[cl_index].e_addr; | |
5254 | ||
5255 | cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg); | |
5256 | ||
5257 | l_clusters[cl_index].b_addr = 0; | |
5258 | l_clusters[cl_index].e_addr = 0; | |
5259 | ||
5260 | cl_pushed++; | |
5261 | ||
5262 | if ( !(push_flag & PUSH_ALL) ) | |
5263 | break; | |
5264 | } | |
5265 | dont_try: | |
5266 | if (cl_len > cl_pushed) { | |
5267 | /* | |
5268 | * we didn't push all of the clusters, so | |
5269 | * let's try to merge them back into the vnode | |
5270 | */ | |
5271 | if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) { | |
5272 | /* | |
5273 | * we picked up some new clusters while we were trying to | |
5274 | * push the old ones... this can happen because I've dropped | |
5275 | * the vnode lock... the sum of the | |
5276 | * leftovers plus the new cluster count exceeds our ability | |
5277 | * to represent them, so switch to the sparse cluster mechanism | |
5278 | * | |
5279 | * collect the active public clusters... | |
5280 | */ | |
5281 | sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); | |
5282 | ||
5283 | for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) { | |
5284 | if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) | |
5285 | continue; | |
5286 | wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; | |
5287 | wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; | |
5288 | wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; | |
5289 | ||
5290 | cl_index1++; | |
5291 | } | |
5292 | /* | |
5293 | * update the cluster count | |
5294 | */ | |
5295 | wbp->cl_number = cl_index1; | |
5296 | ||
5297 | /* | |
5298 | * and collect the original clusters that were moved into the | |
5299 | * local storage for sorting purposes | |
5300 | */ | |
5301 | sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg); | |
5302 | ||
5303 | } else { | |
5304 | /* | |
5305 | * we've got room to merge the leftovers back in | |
5306 | * just append them starting at the next 'hole' | |
5307 | * represented by wbp->cl_number | |
5308 | */ | |
5309 | for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) { | |
5310 | if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) | |
5311 | continue; | |
5312 | ||
5313 | wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr; | |
5314 | wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr; | |
5315 | wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags; | |
5316 | ||
5317 | cl_index1++; | |
5318 | } | |
5319 | /* | |
5320 | * update the cluster count | |
5321 | */ | |
5322 | wbp->cl_number = cl_index1; | |
5323 | } | |
5324 | } | |
5325 | return (MAX_CLUSTERS - wbp->cl_number); | |
5326 | } | |
5327 | ||
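The PUSH_DELAY check in cluster_try_push() only lets the push proceed when the file looks sequentially written: every sorted cluster except the last must be full and must end exactly where its successor begins. A sketch of that test over a sorted extent array; struct extent and looks_sequential() are hypothetical names:

#include <stdint.h>

struct extent { uint64_t b_addr, e_addr; };      /* page numbers, e_addr exclusive */

/* returns 1 when every extent but the last is exactly max_pgcount pages long
 * and abuts its successor, i.e. the n extents form one sequential run */
static int
looks_sequential(const struct extent *cl, int n, uint64_t max_pgcount)
{
        int i;

        for (i = 0; i < n - 1; i++) {
                if (cl[i].e_addr - cl[i].b_addr != max_pgcount)
                        return 0;                /* not a full cluster */
                if (cl[i].e_addr != cl[i + 1].b_addr)
                        return 0;                /* gap or overlap */
        }
        return 1;
}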
5328 | ||
5329 | ||
5330 | static int | |
5331 | cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
5332 | { | |
5333 | upl_page_info_t *pl; | |
5334 | upl_t upl; | |
5335 | vm_offset_t upl_offset; | |
5336 | int upl_size; | |
5337 | off_t upl_f_offset; | |
5338 | int pages_in_upl; | |
5339 | int start_pg; | |
5340 | int last_pg; | |
5341 | int io_size; | |
5342 | int io_flags; | |
5343 | int upl_flags; | |
5344 | int bflag; | |
5345 | int size; | |
5346 | int error = 0; | |
5347 | int retval; | |
5348 | kern_return_t kret; | |
5349 | ||
5350 | if (flags & IO_PASSIVE) | |
5351 | bflag = CL_PASSIVE; | |
5352 | else | |
5353 | bflag = 0; | |
5354 | ||
5355 | if (flags & IO_SKIP_ENCRYPTION) | |
5356 | bflag |= CL_ENCRYPTED; | |
5357 | ||
5358 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, | |
5359 | (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); | |
5360 | ||
5361 | if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) { | |
5362 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0); | |
5363 | ||
5364 | return (0); | |
5365 | } | |
5366 | upl_size = pages_in_upl * PAGE_SIZE; | |
5367 | upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64); | |
5368 | ||
5369 | if (upl_f_offset + upl_size >= EOF) { | |
5370 | ||
5371 | if (upl_f_offset >= EOF) { | |
5372 | /* | |
5373 | * must have truncated the file and missed | |
5374 | * clearing a dangling cluster (i.e. it's completely | |
5375 | * beyond the new EOF) | |
5376 | */ | |
5377 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0); | |
5378 | ||
5379 | return(0); | |
5380 | } | |
5381 | size = EOF - upl_f_offset; | |
5382 | ||
5383 | upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK; | |
5384 | pages_in_upl = upl_size / PAGE_SIZE; | |
5385 | } else | |
5386 | size = upl_size; | |
5387 | ||
5388 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0); | |
5389 | ||
5390 | /* | |
5391 | * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior | |
5392 | * | |
5393 | * - only pages that are currently dirty are returned... these are the ones we need to clean | |
5394 | * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set | |
5395 | * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page | |
5396 | * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if | |
5397 | * someone dirties this page while the I/O is in progress, we don't lose track of the new state | |
5398 | * | |
5399 | * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard) | |
5400 | */ | |
5401 | ||
5402 | if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) | |
5403 | upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED; | |
5404 | else | |
5405 | upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE; | |
5406 | ||
5407 | kret = ubc_create_upl(vp, | |
5408 | upl_f_offset, | |
5409 | upl_size, | |
5410 | &upl, | |
5411 | &pl, | |
5412 | upl_flags); | |
5413 | if (kret != KERN_SUCCESS) | |
5414 | panic("cluster_push: failed to get pagelist"); | |
5415 | ||
5416 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0); | |
5417 | ||
5418 | /* | |
5419 | * since we only asked for the dirty pages back | |
5420 | * it's possible that we may only get a few or even none, so... | |
5421 | * before we start marching forward, we must make sure we know | |
5422 | * where the last present page is in the UPL, otherwise we could | |
5423 | * end up working with a freed upl due to the FREE_ON_EMPTY semantics | |
5424 | * employed by commit_range and abort_range. | |
5425 | */ | |
5426 | for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) { | |
5427 | if (upl_page_present(pl, last_pg)) | |
5428 | break; | |
5429 | } | |
5430 | pages_in_upl = last_pg + 1; | |
5431 | ||
5432 | if (pages_in_upl == 0) { | |
5433 | ubc_upl_abort(upl, 0); | |
5434 | ||
5435 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0); | |
5436 | return(0); | |
5437 | } | |
5438 | ||
5439 | for (last_pg = 0; last_pg < pages_in_upl; ) { | |
5440 | /* | |
5441 | * find the next dirty page in the UPL | |
5442 | * this will become the first page in the | |
5443 | * next I/O to generate | |
5444 | */ | |
5445 | for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) { | |
5446 | if (upl_dirty_page(pl, start_pg)) | |
5447 | break; | |
5448 | if (upl_page_present(pl, start_pg)) | |
5449 | /* | |
5450 | * RET_ONLY_DIRTY will return non-dirty 'precious' pages | |
5451 | * just release these unchanged since we're not going | |
5452 | * to steal them or change their state | |
5453 | */ | |
5454 | ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY); | |
5455 | } | |
5456 | if (start_pg >= pages_in_upl) | |
5457 | /* | |
5458 | * done... no more dirty pages to push | |
5459 | */ | |
5460 | break; | |
5461 | if (start_pg > last_pg) | |
5462 | /* | |
5463 | * skipped over some non-dirty pages | |
5464 | */ | |
5465 | size -= ((start_pg - last_pg) * PAGE_SIZE); | |
5466 | ||
5467 | /* | |
5468 | * find a range of dirty pages to write | |
5469 | */ | |
5470 | for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) { | |
5471 | if (!upl_dirty_page(pl, last_pg)) | |
5472 | break; | |
5473 | } | |
5474 | upl_offset = start_pg * PAGE_SIZE; | |
5475 | ||
5476 | io_size = min(size, (last_pg - start_pg) * PAGE_SIZE); | |
5477 | ||
5478 | io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag; | |
5479 | ||
5480 | if ( !(flags & IO_SYNC)) | |
5481 | io_flags |= CL_ASYNC; | |
5482 | ||
5483 | if (flags & IO_CLOSE) | |
5484 | io_flags |= CL_CLOSE; | |
5485 | ||
5486 | if (flags & IO_NOCACHE) | |
5487 | io_flags |= CL_NOCACHE; | |
5488 | ||
5489 | retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, | |
5490 | io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
5491 | ||
5492 | if (error == 0 && retval) | |
5493 | error = retval; | |
5494 | ||
5495 | size -= io_size; | |
5496 | } | |
5497 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0); | |
5498 | ||
5499 | return(error); | |
5500 | } | |
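The main loop in cluster_push_now() repeatedly finds the next dirty page in the UPL and extends the run until the first clean page, turning each run into a single cluster_io() call. A compact sketch of that run detection over a plain dirty bitmap; next_dirty_run() is a hypothetical helper, not the kernel's upl_dirty_page() interface:

/* find the next run of consecutive dirty pages among pages_in_upl entries,
 * starting the search at *cursor; returns 1 and sets [*run_start, *run_end)
 * on success, 0 once no dirty page remains */
static int
next_dirty_run(const unsigned char *dirty, int pages_in_upl,
               int *cursor, int *run_start, int *run_end)
{
        int pg = *cursor;

        while (pg < pages_in_upl && !dirty[pg])     /* skip clean pages */
                pg++;
        if (pg >= pages_in_upl)
                return 0;
        *run_start = pg;
        while (pg < pages_in_upl && dirty[pg])      /* extend the dirty run */
                pg++;
        *run_end = pg;
        *cursor = pg;                               /* resume here next time */
        return 1;
}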
5501 | ||
5502 | ||
5503 | /* | |
5504 | * sparse_cluster_switch is called with the write behind lock held | |
5505 | */ | |
5506 | static void | |
5507 | sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) | |
5508 | { | |
5509 | int cl_index; | |
5510 | ||
5511 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, vp, wbp->cl_scmap, 0, 0, 0); | |
5512 | ||
5513 | for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) { | |
5514 | int flags; | |
5515 | struct cl_extent cl; | |
5516 | ||
5517 | for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) { | |
5518 | ||
5519 | if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) { | |
5520 | if (flags & UPL_POP_DIRTY) { | |
5521 | cl.e_addr = cl.b_addr + 1; | |
5522 | ||
5523 | sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg); | |
5524 | } | |
5525 | } | |
5526 | } | |
5527 | } | |
5528 | wbp->cl_number = 0; | |
5529 | ||
5530 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, vp, wbp->cl_scmap, 0, 0, 0); | |
5531 | } | |
5532 | ||
5533 | ||
5534 | /* | |
5535 | * sparse_cluster_push must be called with the write-behind lock held if the scmap is | |
5536 | * still associated with the write-behind context... however, if the scmap has been disassociated | |
5537 | * from the write-behind context (the cluster_push case), the wb lock is not held | |
5538 | */ | |
5539 | static void | |
5540 | sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) | |
5541 | { | |
5542 | struct cl_extent cl; | |
5543 | off_t offset; | |
5544 | u_int length; | |
5545 | ||
5546 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, vp, (*scmap), 0, push_flag, 0); | |
5547 | ||
5548 | if (push_flag & PUSH_ALL) | |
5549 | vfs_drt_control(scmap, 1); | |
5550 | ||
5551 | for (;;) { | |
5552 | if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) | |
5553 | break; | |
5554 | ||
5555 | cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); | |
5556 | cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); | |
5557 | ||
5558 | cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); | |
5559 | ||
5560 | if ( !(push_flag & PUSH_ALL) ) | |
5561 | break; | |
5562 | } | |
5563 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); | |
5564 | } | |
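vfs_drt_get_cluster() hands back each dirty region as a byte offset and length; sparse_cluster_push() converts that into a page extent before calling cluster_push_now(). The conversion, assuming 4 KiB pages and page-aligned regions from the map; bytes_to_extent() is a hypothetical helper:

#include <stdint.h>

#define PAGE_SZ 4096ull                             /* assumed page size */

struct page_extent { uint64_t b_addr, e_addr; };    /* e_addr is exclusive */

static struct page_extent
bytes_to_extent(uint64_t offset, uint32_t length)
{
        struct page_extent cl;

        cl.b_addr = offset / PAGE_SZ;               /* first page of the region */
        cl.e_addr = (offset + length) / PAGE_SZ;    /* one past the last page */
        return cl;
}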
5565 | ||
5566 | ||
5567 | /* | |
5568 | * sparse_cluster_add is called with the write behind lock held | |
5569 | */ | |
5570 | static void | |
5571 | sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg) | |
5572 | { | |
5573 | u_int new_dirty; | |
5574 | u_int length; | |
5575 | off_t offset; | |
5576 | ||
5577 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0); | |
5578 | ||
5579 | offset = (off_t)(cl->b_addr * PAGE_SIZE_64); | |
5580 | length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE; | |
5581 | ||
5582 | while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) { | |
5583 | /* | |
5584 | * no room left in the map | |
5585 | * only a partial update was done | |
5586 | * push out some pages and try again | |
5587 | */ | |
5588 | sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); | |
5589 | ||
5590 | offset += (new_dirty * PAGE_SIZE_64); | |
5591 | length -= (new_dirty * PAGE_SIZE); | |
5592 | } | |
5593 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, vp, (*scmap), 0, 0, 0); | |
5594 | } | |
5595 | ||
5596 | ||
5597 | static int | |
5598 | cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg) | |
5599 | { | |
5600 | upl_page_info_t *pl; | |
5601 | upl_t upl; | |
5602 | addr64_t ubc_paddr; | |
5603 | kern_return_t kret; | |
5604 | int error = 0; | |
5605 | int did_read = 0; | |
5606 | int abort_flags; | |
5607 | int upl_flags; | |
5608 | int bflag; | |
5609 | ||
5610 | if (flags & IO_PASSIVE) | |
5611 | bflag = CL_PASSIVE; | |
5612 | else | |
5613 | bflag = 0; | |
5614 | ||
5615 | if (flags & IO_NOCACHE) | |
5616 | bflag |= CL_NOCACHE; | |
5617 | ||
5618 | upl_flags = UPL_SET_LITE; | |
5619 | ||
5620 | if ( !(flags & CL_READ) ) { | |
5621 | /* | |
5622 | * "write" operation: let the UPL subsystem know | |
5623 | * that we intend to modify the buffer cache pages | |
5624 | * we're gathering. | |
5625 | */ | |
5626 | upl_flags |= UPL_WILL_MODIFY; | |
5627 | } else { | |
5628 | /* | |
5629 | * indicate that there is no need to pull the | |
5630 | * mapping for this page... we're only going | |
5631 | * to read from it, not modify it. | |
5632 | */ | |
5633 | upl_flags |= UPL_FILE_IO; | |
5634 | } | |
5635 | kret = ubc_create_upl(vp, | |
5636 | uio->uio_offset & ~PAGE_MASK_64, | |
5637 | PAGE_SIZE, | |
5638 | &upl, | |
5639 | &pl, | |
5640 | upl_flags); | |
5641 | ||
5642 | if (kret != KERN_SUCCESS) | |
5643 | return(EINVAL); | |
5644 | ||
5645 | if (!upl_valid_page(pl, 0)) { | |
5646 | /* | |
5647 | * issue a synchronous read to cluster_io | |
5648 | */ | |
5649 | error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, | |
5650 | CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
5651 | if (error) { | |
5652 | ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); | |
5653 | ||
5654 | return(error); | |
5655 | } | |
5656 | did_read = 1; | |
5657 | } | |
5658 | ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64); | |
5659 | ||
5660 | /* | |
5661 | * NOTE: There is no prototype for the following in BSD. It, and the definitions | |
5662 | * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in | |
5663 | * osfmk/ppc/mappings.h. They are not included here because there appears to be no | |
5664 | * way to do so without exporting them to kexts as well. | |
5665 | */ | |
5666 | if (flags & CL_READ) | |
5667 | // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */ | |
5668 | copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */ | |
5669 | else | |
5670 | // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */ | |
5671 | copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */ | |
5672 | ||
5673 | if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) { | |
5674 | /* | |
5675 | * issue a synchronous write to cluster_io | |
5676 | */ | |
5677 | error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, | |
5678 | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); | |
5679 | } | |
5680 | if (error == 0) | |
5681 | uio_update(uio, (user_size_t)xsize); | |
5682 | ||
5683 | if (did_read) | |
5684 | abort_flags = UPL_ABORT_FREE_ON_EMPTY; | |
5685 | else | |
5686 | abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; | |
5687 | ||
5688 | ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags); | |
5689 | ||
5690 | return (error); | |
5691 | } | |
5692 | ||
5693 | ||
5694 | ||
5695 | int | |
5696 | cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid) | |
5697 | { | |
5698 | int pg_offset; | |
5699 | int pg_index; | |
5700 | int csize; | |
5701 | int segflg; | |
5702 | int retval = 0; | |
5703 | int xsize; | |
5704 | upl_page_info_t *pl; | |
5705 | ||
5706 | xsize = *io_resid; | |
5707 | ||
5708 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, | |
5709 | (int)uio->uio_offset, upl_offset, xsize, 0, 0); | |
5710 | ||
5711 | segflg = uio->uio_segflg; | |
5712 | ||
5713 | switch(segflg) { | |
5714 | ||
5715 | case UIO_USERSPACE32: | |
5716 | case UIO_USERISPACE32: | |
5717 | uio->uio_segflg = UIO_PHYS_USERSPACE32; | |
5718 | break; | |
5719 | ||
5720 | case UIO_USERSPACE: | |
5721 | case UIO_USERISPACE: | |
5722 | uio->uio_segflg = UIO_PHYS_USERSPACE; | |
5723 | break; | |
5724 | ||
5725 | case UIO_USERSPACE64: | |
5726 | case UIO_USERISPACE64: | |
5727 | uio->uio_segflg = UIO_PHYS_USERSPACE64; | |
5728 | break; | |
5729 | ||
5730 | case UIO_SYSSPACE: | |
5731 | uio->uio_segflg = UIO_PHYS_SYSSPACE; | |
5732 | break; | |
5733 | ||
5734 | } | |
5735 | pl = ubc_upl_pageinfo(upl); | |
5736 | ||
5737 | pg_index = upl_offset / PAGE_SIZE; | |
5738 | pg_offset = upl_offset & PAGE_MASK; | |
5739 | csize = min(PAGE_SIZE - pg_offset, xsize); | |
5740 | ||
5741 | while (xsize && retval == 0) { | |
5742 | addr64_t paddr; | |
5743 | ||
5744 | paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset; | |
5745 | ||
5746 | retval = uiomove64(paddr, csize, uio); | |
5747 | ||
5748 | pg_index += 1; | |
5749 | pg_offset = 0; | |
5750 | xsize -= csize; | |
5751 | csize = min(PAGE_SIZE, xsize); | |
5752 | } | |
5753 | *io_resid = xsize; | |
5754 | ||
5755 | uio->uio_segflg = segflg; | |
5756 | ||
5757 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, | |
5758 | (int)uio->uio_offset, xsize, retval, segflg, 0); | |
5759 | ||
5760 | return (retval); | |
5761 | } | |
5762 | ||
5763 | ||
5764 | int | |
5765 | cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty) | |
5766 | { | |
5767 | ||
5768 | return (cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1)); | |
5769 | } | |
5770 | ||
5771 | ||
5772 | static int | |
5773 | cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference) | |
5774 | { | |
5775 | int segflg; | |
5776 | int io_size; | |
5777 | int xsize; | |
5778 | int start_offset; | |
5779 | int retval = 0; | |
5780 | memory_object_control_t control; | |
5781 | ||
5782 | io_size = *io_resid; | |
5783 | ||
5784 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, | |
5785 | (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); | |
5786 | ||
5787 | control = ubc_getobject(vp, UBC_FLAGS_NONE); | |
5788 | ||
5789 | if (control == MEMORY_OBJECT_CONTROL_NULL) { | |
5790 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, | |
5791 | (int)uio->uio_offset, io_size, retval, 3, 0); | |
5792 | ||
5793 | return(0); | |
5794 | } | |
5795 | segflg = uio->uio_segflg; | |
5796 | ||
5797 | switch(segflg) { | |
5798 | ||
5799 | case UIO_USERSPACE32: | |
5800 | case UIO_USERISPACE32: | |
5801 | uio->uio_segflg = UIO_PHYS_USERSPACE32; | |
5802 | break; | |
5803 | ||
5804 | case UIO_USERSPACE64: | |
5805 | case UIO_USERISPACE64: | |
5806 | uio->uio_segflg = UIO_PHYS_USERSPACE64; | |
5807 | break; | |
5808 | ||
5809 | case UIO_USERSPACE: | |
5810 | case UIO_USERISPACE: | |
5811 | uio->uio_segflg = UIO_PHYS_USERSPACE; | |
5812 | break; | |
5813 | ||
5814 | case UIO_SYSSPACE: | |
5815 | uio->uio_segflg = UIO_PHYS_SYSSPACE; | |
5816 | break; | |
5817 | } | |
5818 | ||
5819 | if ( (io_size = *io_resid) ) { | |
5820 | start_offset = (int)(uio->uio_offset & PAGE_MASK_64); | |
5821 | xsize = uio_resid(uio); | |
5822 | ||
5823 | retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio, | |
5824 | start_offset, io_size, mark_dirty, take_reference); | |
5825 | xsize -= uio_resid(uio); | |
5826 | io_size -= xsize; | |
5827 | } | |
5828 | uio->uio_segflg = segflg; | |
5829 | *io_resid = io_size; | |
5830 | ||
5831 | KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END, | |
5832 | (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0); | |
5833 | ||
5834 | return(retval); | |
5835 | } | |
5836 | ||
5837 | ||
5838 | int | |
5839 | is_file_clean(vnode_t vp, off_t filesize) | |
5840 | { | |
5841 | off_t f_offset; | |
5842 | int flags; | |
5843 | int total_dirty = 0; | |
5844 | ||
5845 | for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) { | |
5846 | if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) { | |
5847 | if (flags & UPL_POP_DIRTY) { | |
5848 | total_dirty++; | |
5849 | } | |
5850 | } | |
5851 | } | |
5852 | if (total_dirty) | |
5853 | return(EINVAL); | |
5854 | ||
5855 | return (0); | |
5856 | } | |
5857 | ||
5858 | ||
5859 | ||
5860 | /* | |
5861 | * Dirty region tracking/clustering mechanism. | |
5862 | * | |
5863 | * This code (vfs_drt_*) provides a mechanism for tracking and clustering | |
5864 | * dirty regions within a larger space (file). It is primarily intended to | |
5865 | * support clustering in large files with many dirty areas. | |
5866 | * | |
5867 | * The implementation assumes that the dirty regions are pages. | |
5868 | * | |
5869 | * To represent dirty pages within the file, we store bit vectors in a | |
5870 | * variable-size circular hash. | |
5871 | */ | |
5872 | ||
5873 | /* | |
5874 | * Bitvector size. This determines the number of pages we group in a | |
5875 | * single hashtable entry. Each hashtable entry is aligned to this | |
5876 | * size within the file. | |
5877 | */ | |
5878 | #define DRT_BITVECTOR_PAGES 256 | |
5879 | ||
5880 | /* | |
5881 | * File offset handling. | |
5882 | * | |
5883 | * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES; | |
5884 | * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1) | |
5885 | */ | |
5886 | #define DRT_ADDRESS_MASK (~((1 << 20) - 1)) | |
5887 | #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK) | |
5888 | ||
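With DRT_BITVECTOR_PAGES = 256 and 4 KiB pages, each hash entry covers 256 * 4096 = 1 MiB of the file, which is why the mask clears the low 20 bits. A quick user-space check of that arithmetic, assuming the 4 KiB page size; check_drt_mask() is only an illustration:

#include <assert.h>
#include <stdint.h>

static void check_drt_mask(void)
{
        uint64_t span = 256ull * 4096ull;            /* bytes covered per entry */
        uint64_t mask = ~(span - 1);                 /* same value as ~((1 << 20) - 1) */

        assert(span == (1ull << 20));                /* 1 MiB per hash entry */
        assert((0x123456789aULL & mask) == 0x1234500000ULL);   /* aligned down to 1 MiB */
}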
5889 | /* | |
5890 | * Hashtable address field handling. | |
5891 | * | |
5892 | * The low-order bits of the hashtable address are used to conserve | |
5893 | * space. | |
5894 | * | |
5895 | * DRT_HASH_COUNT_MASK must be large enough to store the range | |
5896 | * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value | |
5897 | * to indicate that the bucket is actually unoccupied. | |
5898 | */ | |
5899 | #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK) | |
5900 | #define DRT_HASH_SET_ADDRESS(scm, i, a) \ | |
5901 | do { \ | |
5902 | (scm)->scm_hashtable[(i)].dhe_control = \ | |
5903 | ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \ | |
5904 | } while (0) | |
5905 | #define DRT_HASH_COUNT_MASK 0x1ff | |
5906 | #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK) | |
5907 | #define DRT_HASH_SET_COUNT(scm, i, c) \ | |
5908 | do { \ | |
5909 | (scm)->scm_hashtable[(i)].dhe_control = \ | |
5910 | ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \ | |
5911 | } while (0) | |
5912 | #define DRT_HASH_CLEAR(scm, i) \ | |
5913 | do { \ | |
5914 | (scm)->scm_hashtable[(i)].dhe_control = 0; \ | |
5915 | } while (0) | |
5916 | #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK) | |
5917 | #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK) | |
5918 | #define DRT_HASH_COPY(oscm, oi, scm, i) \ | |
5919 | do { \ | |
5920 | (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \ | |
5921 | DRT_BITVECTOR_COPY(oscm, oi, scm, i); \ | |
5922 | } while (0) | |
5923 | ||
5924 | ||
5925 | /* | |
5926 | * Hash table moduli. | |
5927 | * | |
5928 | * Since the hashtable entry's size is dependent on the size of | |
5929 | * the bitvector, and since the hashtable size is constrained to | |
5930 | * both being prime and fitting within the desired allocation | |
5931 | * size, these values need to be manually determined. | |
5932 | * | |
5933 | * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes. | |
5934 | * | |
5935 | * The small hashtable allocation is 1024 bytes, so the modulus is 23. | |
5936 | * The large hashtable allocation is 16384 bytes, so the modulus is 401. | |
5937 | */ | |
5938 | #define DRT_HASH_SMALL_MODULUS 23 | |
5939 | #define DRT_HASH_LARGE_MODULUS 401 | |
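The moduli follow from the entry size: 8 bytes of dhe_control plus a 256-bit vector held in eight 32-bit words is 40 bytes per entry, so 23 entries fit in the 1024-byte allocation and 401 entries fit in the 16384-byte allocation. A quick check of that arithmetic (illustrative only; check_drt_moduli() is a hypothetical name):

#include <assert.h>

static void check_drt_moduli(void)
{
        unsigned entry = 8 + (256 / 32) * 4;               /* dhe_control + bitvector = 40 */

        assert(entry == 40);
        assert(23  * entry == 920   && 1024  - 920   == 104);   /* small table, 104 spare */
        assert(401 * entry == 16040 && 16384 - 16040 == 344);   /* large table, 344 spare */
}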
5940 | ||
5941 | /* | |
5942 | * Physical memory required before the large hash modulus is permitted. | |
5943 | * | |
5944 | * On small memory systems, the large hash modulus can lead to physical | |
5945 | * memory starvation, so we avoid using it there. | |
5946 | */ | |
5947 | #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */ | |
5948 | ||
5949 | #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */ | |
5950 | #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */ | |
5951 | ||
5952 | /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */ | |
5953 | ||
5954 | /* | |
5955 | * Hashtable bitvector handling. | |
5956 | * | |
5957 | * Bitvector fields are 32 bits long. | |
5958 | */ | |
5959 | ||
5960 | #define DRT_HASH_SET_BIT(scm, i, bit) \ | |
5961 | (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32)) | |
5962 | ||
5963 | #define DRT_HASH_CLEAR_BIT(scm, i, bit) \ | |
5964 | (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32)) | |
5965 | ||
5966 | #define DRT_HASH_TEST_BIT(scm, i, bit) \ | |
5967 | ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32))) | |
5968 | ||
5969 | #define DRT_BITVECTOR_CLEAR(scm, i) \ | |
5970 | bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) | |
5971 | ||
5972 | #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \ | |
5973 | bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \ | |
5974 | &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \ | |
5975 | (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t)) | |
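Marking page 'bit' within an entry means touching word bit/32 at bit position bit%32 of the bitvector. A sketch of the indexing behind the SET/CLEAR/TEST macros, using standalone names (bv_set()/bv_test() are hypothetical):

#include <stdint.h>

#define BV_WORDS (256 / 32)                      /* DRT_BITVECTOR_PAGES / 32 */

static void bv_set(uint32_t bv[BV_WORDS], int bit)
{
        bv[bit / 32] |= (uint32_t)1 << (bit % 32);
}

static int bv_test(const uint32_t bv[BV_WORDS], int bit)
{
        return (bv[bit / 32] >> (bit % 32)) & 1;
}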
5976 | ||
5977 | ||
5978 | ||
5979 | /* | |
5980 | * Hashtable entry. | |
5981 | */ | |
5982 | struct vfs_drt_hashentry { | |
5983 | u_int64_t dhe_control; | |
5984 | u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32]; | |
5985 | }; | |
5986 | ||
5987 | /* | |
5988 | * Dirty Region Tracking structure. | |
5989 | * | |
5990 | * The hashtable is allocated entirely inside the DRT structure. | |
5991 | * | |
5992 | * The hash is a simple circular prime modulus arrangement, the structure | |
5993 | * is resized from small to large if it overflows. | |
5994 | */ | |
5995 | ||
5996 | struct vfs_drt_clustermap { | |
5997 | u_int32_t scm_magic; /* sanity/detection */ | |
5998 | #define DRT_SCM_MAGIC 0x12020003 | |
5999 | u_int32_t scm_modulus; /* current ring size */ | |
6000 | u_int32_t scm_buckets; /* number of occupied buckets */ | |
6001 | u_int32_t scm_lastclean; /* last entry we cleaned */ | |
6002 | u_int32_t scm_iskips; /* number of slot skips */ | |
6003 | ||
6004 | struct vfs_drt_hashentry scm_hashtable[0]; | |
6005 | }; | |
6006 | ||
6007 | ||
6008 | #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus) | |
6009 | #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus) | |
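The hash is keyed on the entry-aligned offset modulo the ring size and, on collision, probes linearly around the ring with DRT_HASH_NEXT. A user-space sketch of that probe, with a vacancy array standing in for DRT_HASH_VACANT; probe_ring() and its parameters are hypothetical:

#include <stdint.h>

/* probe the ring starting at the home slot for aligned_offset; returns the
 * index of the first slot that is vacant or already holds that offset,
 * or -1 after a full trip around the ring */
static int
probe_ring(const uint64_t *slot_addr, const int *slot_vacant,
           uint32_t modulus, uint64_t aligned_offset)
{
        uint32_t index = (uint32_t)(aligned_offset % modulus);   /* DRT_HASH() */
        uint32_t tries;

        for (tries = 0; tries < modulus; tries++) {
                if (slot_vacant[index] || slot_addr[index] == aligned_offset)
                        return (int)index;
                index = (index + 1) % modulus;                    /* DRT_HASH_NEXT() */
        }
        return -1;                                                /* ring is full */
}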
6010 | ||
6011 | /* | |
6012 | * Debugging codes and arguments. | |
6013 | */ | |
6014 | #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */ | |
6015 | #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */ | |
6016 | #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */ | |
6017 | #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */ | |
6018 | #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length, | |
6019 | * dirty */ | |
6020 | /* 0, setcount */ | |
6021 | /* 1 (clean, no map) */ | |
6022 | /* 2 (map alloc fail) */ | |
6023 | /* 3, resid (partial) */ | |
6024 | #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87)) | |
6025 | #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets, | |
6026 | * lastclean, iskips */ | |
6027 | ||
6028 | ||
6029 | static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp); | |
6030 | static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap); | |
6031 | static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap, | |
6032 | u_int64_t offset, int *indexp); | |
6033 | static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, | |
6034 | u_int64_t offset, | |
6035 | int *indexp, | |
6036 | int recursed); | |
6037 | static kern_return_t vfs_drt_do_mark_pages( | |
6038 | void **cmapp, | |
6039 | u_int64_t offset, | |
6040 | u_int length, | |
6041 | u_int *setcountp, | |
6042 | int dirty); | |
6043 | static void vfs_drt_trace( | |
6044 | struct vfs_drt_clustermap *cmap, | |
6045 | int code, | |
6046 | int arg1, | |
6047 | int arg2, | |
6048 | int arg3, | |
6049 | int arg4); | |
6050 | ||
6051 | ||
6052 | /* | |
6053 | * Allocate and initialise a sparse cluster map. | |
6054 | * | |
6055 | * Will allocate a new map, or resize/compact an existing map. | |
6056 | * | |
6057 | * XXX we should probably have at least one intermediate map size, | |
6058 | * as the 1:16 ratio seems a bit drastic. | |
6059 | */ | |
6060 | static kern_return_t | |
6061 | vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp) | |
6062 | { | |
6063 | struct vfs_drt_clustermap *cmap, *ocmap; | |
6064 | kern_return_t kret; | |
6065 | u_int64_t offset; | |
6066 | u_int32_t i; | |
6067 | int nsize, active_buckets, index, copycount; | |
6068 | ||
6069 | ocmap = NULL; | |
6070 | if (cmapp != NULL) | |
6071 | ocmap = *cmapp; | |
6072 | ||
6073 | /* | |
6074 | * Decide on the size of the new map. | |
6075 | */ | |
6076 | if (ocmap == NULL) { | |
6077 | nsize = DRT_HASH_SMALL_MODULUS; | |
6078 | } else { | |
6079 | /* count the number of active buckets in the old map */ | |
6080 | active_buckets = 0; | |
6081 | for (i = 0; i < ocmap->scm_modulus; i++) { | |
6082 | if (!DRT_HASH_VACANT(ocmap, i) && | |
6083 | (DRT_HASH_GET_COUNT(ocmap, i) != 0)) | |
6084 | active_buckets++; | |
6085 | } | |
6086 | /* | |
6087 | * If we're currently using the small allocation, check to | |
6088 | * see whether we should grow to the large one. | |
6089 | */ | |
6090 | if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) { | |
6091 | /* | |
6092 | * If the ring is nearly full and we are allowed to | |
6093 | * use the large modulus, upgrade. | |
6094 | */ | |
6095 | if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) && | |
6096 | (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) { | |
6097 | nsize = DRT_HASH_LARGE_MODULUS; | |
6098 | } else { | |
6099 | nsize = DRT_HASH_SMALL_MODULUS; | |
6100 | } | |
6101 | } else { | |
6102 | /* already using the large modulus */ | |
6103 | nsize = DRT_HASH_LARGE_MODULUS; | |
6104 | /* | |
6105 | * If the ring is completely full, there's | |
6106 | * nothing useful for us to do. Behave as | |
6107 | * though we had compacted into the new | |
6108 | * array and return. | |
6109 | */ | |
6110 | if (active_buckets >= DRT_HASH_LARGE_MODULUS) | |
6111 | return(KERN_SUCCESS); | |
6112 | } | |
6113 | } | |
6114 | ||
6115 | /* | |
6116 | * Allocate and initialise the new map. | |
6117 | */ | |
6118 | ||
6119 | kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, | |
6120 | (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); | |
6121 | if (kret != KERN_SUCCESS) | |
6122 | return(kret); | |
6123 | cmap->scm_magic = DRT_SCM_MAGIC; | |
6124 | cmap->scm_modulus = nsize; | |
6125 | cmap->scm_buckets = 0; | |
6126 | cmap->scm_lastclean = 0; | |
6127 | cmap->scm_iskips = 0; | |
6128 | for (i = 0; i < cmap->scm_modulus; i++) { | |
6129 | DRT_HASH_CLEAR(cmap, i); | |
6130 | DRT_HASH_VACATE(cmap, i); | |
6131 | DRT_BITVECTOR_CLEAR(cmap, i); | |
6132 | } | |
6133 | ||
6134 | /* | |
6135 | * If there's an old map, re-hash entries from it into the new map. | |
6136 | */ | |
6137 | copycount = 0; | |
6138 | if (ocmap != NULL) { | |
6139 | for (i = 0; i < ocmap->scm_modulus; i++) { | |
6140 | /* skip empty buckets */ | |
6141 | if (DRT_HASH_VACANT(ocmap, i) || | |
6142 | (DRT_HASH_GET_COUNT(ocmap, i) == 0)) | |
6143 | continue; | |
6144 | /* get new index */ | |
6145 | offset = DRT_HASH_GET_ADDRESS(ocmap, i); | |
6146 | kret = vfs_drt_get_index(&cmap, offset, &index, 1); | |
6147 | if (kret != KERN_SUCCESS) { | |
6148 | /* XXX need to bail out gracefully here */ | |
6149 | panic("vfs_drt: new cluster map mysteriously too small"); | |
6150 | index = 0; | |
6151 | } | |
6152 | /* copy */ | |
6153 | DRT_HASH_COPY(ocmap, i, cmap, index); | |
6154 | copycount++; | |
6155 | } | |
6156 | } | |
6157 | ||
6158 | /* log what we've done */ | |
6159 | vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0); | |
6160 | ||
6161 | /* | |
6162 | * It's important to ensure that *cmapp always points to | |
6163 | * a valid map, so we must overwrite it before freeing | |
6164 | * the old map. | |
6165 | */ | |
6166 | *cmapp = cmap; | |
6167 | if (ocmap != NULL) { | |
6168 | /* emit stats into trace buffer */ | |
6169 | vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA, | |
6170 | ocmap->scm_modulus, | |
6171 | ocmap->scm_buckets, | |
6172 | ocmap->scm_lastclean, | |
6173 | ocmap->scm_iskips); | |
6174 | ||
6175 | vfs_drt_free_map(ocmap); | |
6176 | } | |
6177 | return(KERN_SUCCESS); | |
6178 | } | |
6179 | ||
6180 | ||
6181 | /* | |
6182 | * Free a sparse cluster map. | |
6183 | */ | |
6184 | static kern_return_t | |
6185 | vfs_drt_free_map(struct vfs_drt_clustermap *cmap) | |
6186 | { | |
6187 | kmem_free(kernel_map, (vm_offset_t)cmap, | |
6188 | (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION); | |
6189 | return(KERN_SUCCESS); | |
6190 | } | |
6191 | ||
6192 | ||
6193 | /* | |
6194 | * Find the hashtable slot currently occupied by an entry for the supplied offset. | |
6195 | */ | |
6196 | static kern_return_t | |
6197 | vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp) | |
6198 | { | |
6199 | int index; | |
6200 | u_int32_t i; | |
6201 | ||
6202 | offset = DRT_ALIGN_ADDRESS(offset); | |
6203 | index = DRT_HASH(cmap, offset); | |
6204 | ||
6205 | /* traverse the hashtable */ | |
6206 | for (i = 0; i < cmap->scm_modulus; i++) { | |
6207 | ||
6208 | /* | |
6209 | * If the slot is vacant, we can stop. | |
6210 | */ | |
6211 | if (DRT_HASH_VACANT(cmap, index)) | |
6212 | break; | |
6213 | ||
6214 | /* | |
6215 | * If the address matches our offset, we have success. | |
6216 | */ | |
6217 | if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) { | |
6218 | *indexp = index; | |
6219 | return(KERN_SUCCESS); | |
6220 | } | |
6221 | ||
6222 | /* | |
6223 | * Move to the next slot, try again. | |
6224 | */ | |
6225 | index = DRT_HASH_NEXT(cmap, index); | |
6226 | } | |
6227 | /* | |
6228 | * It's not there. | |
6229 | */ | |
6230 | return(KERN_FAILURE); | |
6231 | } | |
6232 | ||
6233 | /* | |
6234 | * Find the hashtable slot for the supplied offset. If we haven't allocated | |
6235 | * one yet, allocate one and populate the address field. Note that the new | |
6236 | * entry will have a zero page count and thus will still technically be free, | |
6237 | * so if we are called to clean pages, the slot remains free. | |
6238 | */ | |
6239 | static kern_return_t | |
6240 | vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed) | |
6241 | { | |
6242 | struct vfs_drt_clustermap *cmap; | |
6243 | kern_return_t kret; | |
6244 | u_int32_t index; | |
6245 | u_int32_t i; | |
6246 | ||
6247 | cmap = *cmapp; | |
6248 | ||
6249 | /* look for an existing entry */ | |
6250 | kret = vfs_drt_search_index(cmap, offset, indexp); | |
6251 | if (kret == KERN_SUCCESS) | |
6252 | return(kret); | |
6253 | ||
6254 | /* need to allocate an entry */ | |
6255 | offset = DRT_ALIGN_ADDRESS(offset); | |
6256 | index = DRT_HASH(cmap, offset); | |
6257 | ||
6258 | /* scan from the index forwards looking for a vacant slot */ | |
6259 | for (i = 0; i < cmap->scm_modulus; i++) { | |
6260 | /* slot vacant? */ | |
6261 | if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) { | |
6262 | cmap->scm_buckets++; | |
6263 | if (index < cmap->scm_lastclean) | |
6264 | cmap->scm_lastclean = index; | |
6265 | DRT_HASH_SET_ADDRESS(cmap, index, offset); | |
6266 | DRT_HASH_SET_COUNT(cmap, index, 0); | |
6267 | DRT_BITVECTOR_CLEAR(cmap, index); | |
6268 | *indexp = index; | |
6269 | vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0); | |
6270 | return(KERN_SUCCESS); | |
6271 | } | |
6272 | cmap->scm_iskips += i; | |
6273 | index = DRT_HASH_NEXT(cmap, index); | |
6274 | } | |
6275 | ||
6276 | /* | |
6277 | * We haven't found a vacant slot, so the map is full. If we're not | |
6278 | * already recursed, try reallocating/compacting it. | |
6279 | */ | |
6280 | if (recursed) | |
6281 | return(KERN_FAILURE); | |
6282 | kret = vfs_drt_alloc_map(cmapp); | |
6283 | if (kret == KERN_SUCCESS) { | |
6284 | /* now try to insert again */ | |
6285 | kret = vfs_drt_get_index(cmapp, offset, indexp, 1); | |
6286 | } | |
6287 | return(kret); | |
6288 | } | |
6289 | ||
6290 | /* | |
6291 | * Implementation of set dirty/clean. | |
6292 | * | |
6293 | * In the 'clean' case, not finding a map is OK. | |
6294 | */ | |
6295 | static kern_return_t | |
6296 | vfs_drt_do_mark_pages( | |
6297 | void **private, | |
6298 | u_int64_t offset, | |
6299 | u_int length, | |
6300 | u_int *setcountp, | |
6301 | int dirty) | |
6302 | { | |
6303 | struct vfs_drt_clustermap *cmap, **cmapp; | |
6304 | kern_return_t kret; | |
6305 | int i, index, pgoff, pgcount, setcount, ecount; | |
6306 | ||
6307 | cmapp = (struct vfs_drt_clustermap **)private; | |
6308 | cmap = *cmapp; | |
6309 | ||
6310 | vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0); | |
6311 | ||
6312 | if (setcountp != NULL) | |
6313 | *setcountp = 0; | |
6314 | ||
6315 | /* allocate a cluster map if we don't already have one */ | |
6316 | if (cmap == NULL) { | |
6317 | /* no cluster map, nothing to clean */ | |
6318 | if (!dirty) { | |
6319 | vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0); | |
6320 | return(KERN_SUCCESS); | |
6321 | } | |
6322 | kret = vfs_drt_alloc_map(cmapp); | |
6323 | if (kret != KERN_SUCCESS) { | |
6324 | vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0); | |
6325 | return(kret); | |
6326 | } | |
6327 | } | |
6328 | setcount = 0; | |
6329 | ||
6330 | /* | |
6331 | * Iterate over the length of the region. | |
6332 | */ | |
6333 | while (length > 0) { | |
6334 | /* | |
6335 | * Get the hashtable index for this offset. | |
6336 | * | |
6337 | * XXX this will add blank entries if we are clearing a range | |
6338 | * that hasn't been dirtied. | |
6339 | */ | |
6340 | kret = vfs_drt_get_index(cmapp, offset, &index, 0); | |
6341 | cmap = *cmapp; /* may have changed! */ | |
6342 | /* this may be a partial-success return */ | |
6343 | if (kret != KERN_SUCCESS) { | |
6344 | if (setcountp != NULL) | |
6345 | *setcountp = setcount; | |
6346 | vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0); | |
6347 | ||
6348 | return(kret); | |
6349 | } | |
6350 | ||
6351 | /* | |
6352 | * Work out how many pages we're modifying in this | |
6353 | * hashtable entry. | |
6354 | */ | |
6355 | pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE; | |
6356 | pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff)); | |
6357 | ||
6358 | /* | |
6359 | * Iterate over pages, dirty/clearing as we go. | |
6360 | */ | |
6361 | ecount = DRT_HASH_GET_COUNT(cmap, index); | |
6362 | for (i = 0; i < pgcount; i++) { | |
6363 | if (dirty) { | |
6364 | if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { | |
6365 | DRT_HASH_SET_BIT(cmap, index, pgoff + i); | |
6366 | ecount++; | |
6367 | setcount++; | |
6368 | } | |
6369 | } else { | |
6370 | if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) { | |
6371 | DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i); | |
6372 | ecount--; | |
6373 | setcount++; | |
6374 | } | |
6375 | } | |
6376 | } | |
6377 | DRT_HASH_SET_COUNT(cmap, index, ecount); | |
6378 | ||
6379 | offset += pgcount * PAGE_SIZE; | |
6380 | length -= pgcount * PAGE_SIZE; | |
6381 | } | |
6382 | if (setcountp != NULL) | |
6383 | *setcountp = setcount; | |
6384 | ||
6385 | vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0); | |
6386 | ||
6387 | return(KERN_SUCCESS); | |
6388 | } | |
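/*
 * Worked example (illustrative, not compiled; assumes 4KiB pages): how the
 * loop above splits a byte range across hashtable entries.  The offsets and
 * helper name are hypothetical.
 */
#if 0
static void
drt_mark_split_example(void)
{
	u_int64_t	offset = 0x3000;	/* 3 pages into a cluster */
	u_int		length = 0x4000;	/* 4 pages */
	int		pgoff, pgcount;

	/* first entry: pgoff == 3, pgcount == 4, so bits 3..6 get set */
	pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
	pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));

	/* a longer range carries the remainder into the next entry */
	offset += pgcount * PAGE_SIZE;
	length -= pgcount * PAGE_SIZE;
}
#endif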
6389 | ||
6390 | /* | |
6391 | * Mark a set of pages as dirty/clean. | |
6392 | * | |
6393 | * This is a public interface. | |
6394 | * | |
6395 | * cmapp | |
6396 | * Pointer to storage suitable for holding a pointer. Note that | |
6397 | * this must either be NULL or a value set by this function. | |
6398 | * | |
6402 | * offset | |
6403 | * Offset of the first page to be marked as dirty, in bytes. Must be | |
6404 | * page-aligned. | |
6405 | * | |
6406 | * length | |
6407 | * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE. | |
6408 | * | |
6409 | * setcountp | |
6410 | * Number of pages newly marked dirty by this call (optional). | |
6411 | * | |
6412 | * Returns KERN_SUCCESS if all the pages were successfully marked. | |
6413 | */ | |
6414 | static kern_return_t | |
6415 | vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp) | |
6416 | { | |
6417 | /* XXX size unused, drop from interface */ | |
6418 | return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1)); | |
6419 | } | |
6420 | ||
6421 | #if 0 | |
6422 | static kern_return_t | |
6423 | vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length) | |
6424 | { | |
6425 | return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0)); | |
6426 | } | |
6427 | #endif | |
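/*
 * Illustrative usage sketch (not compiled): a hypothetical caller marking a
 * page-aligned range dirty through the public interface above.  The variable
 * and function names here are made up for exposition.
 */
#if 0
static void
drt_mark_example(void **scmapp, off_t offset)
{
	u_int	new_dirty = 0;

	/* mark two pages dirty starting at 'offset' (must be page-aligned) */
	(void) vfs_drt_mark_pages(scmapp, offset, 2 * PAGE_SIZE, &new_dirty);
	/* new_dirty now holds the number of pages not previously dirty */
}
#endif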
6428 | ||
6429 | /* | |
6430 | * Get a cluster of dirty pages. | |
6431 | * | |
6432 | * This is a public interface. | |
6433 | * | |
6434 | * cmapp | |
6435 | * Pointer to storage managed by vfs_drt_mark_pages. Note that this must | |
6436 | * be NULL or a value set by vfs_drt_mark_pages. | |
6437 | * | |
6438 | * offsetp | |
6439 | * Returns the byte offset into the file of the first page in the cluster. | |
6440 | * | |
6441 | * lengthp | |
6442 | * Returns the length in bytes of the cluster of dirty pages. | |
6443 | * | |
6444 | * Returns KERN_SUCCESS if a cluster was found. If KERN_FAILURE is returned, | |
6445 | * there are no dirty pages meeting the minimum size criteria. Private storage | |
6446 | * is released if there are no more dirty pages left in the map. | |
6447 | * | |
6448 | */ | |
6449 | static kern_return_t | |
6450 | vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp) | |
6451 | { | |
6452 | struct vfs_drt_clustermap *cmap; | |
6453 | u_int64_t offset; | |
6454 | u_int length; | |
6455 | u_int32_t j; | |
6456 | int index, i, fs, ls; | |
6457 | ||
6458 | /* sanity */ | |
6459 | if ((cmapp == NULL) || (*cmapp == NULL)) | |
6460 | return(KERN_FAILURE); | |
6461 | cmap = *cmapp; | |
6462 | ||
6463 | /* walk the hashtable */ | |
6464 | for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) { | |
6465 | index = DRT_HASH(cmap, offset); | |
6466 | ||
6467 | if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) | |
6468 | continue; | |
6469 | ||
6470 | /* scan the bitfield for a string of bits */ | |
6471 | fs = -1; | |
6472 | ||
6473 | for (i = 0; i < DRT_BITVECTOR_PAGES; i++) { | |
6474 | if (DRT_HASH_TEST_BIT(cmap, index, i)) { | |
6475 | fs = i; | |
6476 | break; | |
6477 | } | |
6478 | } | |
6479 | if (fs == -1) { | |
6480 | /* didn't find any bits set */ | |
6481 | panic("vfs_drt: entry summary count > 0 but no bits set in map"); | |
6482 | } | |
6483 | for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) { | |
6484 | if (!DRT_HASH_TEST_BIT(cmap, index, i)) | |
6485 | break; | |
6486 | } | |
6487 | ||
6488 | /* compute offset and length, mark pages clean */ | |
6489 | offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs); | |
6490 | length = ls * PAGE_SIZE; | |
6491 | vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0); | |
6492 | cmap->scm_lastclean = index; | |
6493 | ||
6494 | /* return successful */ | |
6495 | *offsetp = (off_t)offset; | |
6496 | *lengthp = length; | |
6497 | ||
6498 | vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0); | |
6499 | return(KERN_SUCCESS); | |
6500 | } | |
6501 | /* | |
6502 | * We didn't find anything: the hashtable is empty. | |
6503 | * Emit stats into the trace buffer and | |
6504 | * then free it. | |
6505 | */ | |
6506 | vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, | |
6507 | cmap->scm_modulus, | |
6508 | cmap->scm_buckets, | |
6509 | cmap->scm_lastclean, | |
6510 | cmap->scm_iskips); | |
6511 | ||
6512 | vfs_drt_free_map(cmap); | |
6513 | *cmapp = NULL; | |
6514 | ||
6515 | return(KERN_FAILURE); | |
6516 | } | |
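/*
 * Illustrative usage sketch (not compiled): draining every dirty cluster
 * from a map.  vfs_drt_get_cluster() hands back one run of dirty pages at a
 * time, marks it clean, and frees the map (setting *cmapp to NULL) once
 * nothing dirty remains.  push_dirty_range() is a hypothetical stand-in for
 * the caller's writeback code.
 */
#if 0
static void
drt_drain_example(void **scmapp)
{
	off_t	offset;
	u_int	length;

	while (vfs_drt_get_cluster(scmapp, &offset, &length) == KERN_SUCCESS)
		push_dirty_range(offset, length);

	/* KERN_FAILURE: the map was empty and has been freed; *scmapp is NULL */
}
#endif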
6517 | ||
6518 | ||
6519 | static kern_return_t | |
6520 | vfs_drt_control(void **cmapp, int op_type) | |
6521 | { | |
6522 | struct vfs_drt_clustermap *cmap; | |
6523 | ||
6524 | /* sanity */ | |
6525 | if ((cmapp == NULL) || (*cmapp == NULL)) | |
6526 | return(KERN_FAILURE); | |
6527 | cmap = *cmapp; | |
6528 | ||
6529 | switch (op_type) { | |
6530 | case 0: /* release the map */ | |
6531 | /* emit stats into trace buffer */ | |
6532 | vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA, | |
6533 | cmap->scm_modulus, | |
6534 | cmap->scm_buckets, | |
6535 | cmap->scm_lastclean, | |
6536 | cmap->scm_iskips); | |
6537 | ||
6538 | vfs_drt_free_map(cmap); | |
6539 | *cmapp = NULL; | |
6540 | break; | |
6541 | ||
6542 | case 1: /* reset the last-cleaned index */ | |
6543 | cmap->scm_lastclean = 0; | |
6544 | break; | |
6545 | } | |
6546 | return(KERN_SUCCESS); | |
6547 | } | |
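/*
 * Illustrative usage sketch (not compiled): vfs_drt_control() takes bare
 * numeric op codes; 0 emits stats and frees the map, 1 resets the
 * last-cleaned index.  The helper name is hypothetical.
 */
#if 0
static void
drt_control_example(void **scmapp)
{
	(void) vfs_drt_control(scmapp, 1);	/* reset scm_lastclean to 0 */
}
#endif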
6548 | ||
6549 | ||
6550 | ||
6551 | /* | |
6552 | * Emit a summary of the state of the clustermap into the trace buffer | |
6553 | * along with some caller-provided data. | |
6554 | */ | |
6555 | #if KDEBUG | |
6556 | static void | |
6557 | vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4) | |
6558 | { | |
6559 | KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0); | |
6560 | } | |
6561 | #else | |
6562 | static void | |
6563 | vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code, | |
6564 | __unused int arg1, __unused int arg2, __unused int arg3, | |
6565 | __unused int arg4) | |
6566 | { | |
6567 | } | |
6568 | #endif | |
6569 | ||
6570 | #if 0 | |
6571 | /* | |
6572 | * Perform basic sanity check on the hash entry summary count | |
6573 | * vs. the actual bits set in the entry. | |
6574 | */ | |
6575 | static void | |
6576 | vfs_drt_sanity(struct vfs_drt_clustermap *cmap) | |
6577 | { | |
6578 | int index, i; | |
6579 | int bits_on; | |
6580 | ||
6581 | for (index = 0; index < cmap->scm_modulus; index++) { | |
6582 | if (DRT_HASH_VACANT(cmap, index)) | |
6583 | continue; | |
6584 | ||
6585 | for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) { | |
6586 | if (DRT_HASH_TEST_BIT(cmap, index, i)) | |
6587 | bits_on++; | |
6588 | } | |
6589 | if (bits_on != DRT_HASH_GET_COUNT(cmap, index)) | |
6590 | panic("bits_on = %d, index = %d\n", bits_on, index); | |
6591 | } | |
6592 | } | |
6593 | #endif |