]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2000-2020 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | /* | |
29 | * @OSF_COPYRIGHT@ | |
30 | */ | |
31 | /* | |
32 | * Mach Operating System | |
33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University | |
34 | * All Rights Reserved. | |
35 | * | |
36 | * Permission to use, copy, modify and distribute this software and its | |
37 | * documentation is hereby granted, provided that both the copyright | |
38 | * notice and this permission notice appear in all copies of the | |
39 | * software, derivative works or modified versions, and any portions | |
40 | * thereof, and that both notices appear in supporting documentation. | |
41 | * | |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" | |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR | |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. | |
45 | * | |
46 | * Carnegie Mellon requests users of this software to return to | |
47 | * | |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU | |
49 | * School of Computer Science | |
50 | * Carnegie Mellon University | |
51 | * Pittsburgh PA 15213-3890 | |
52 | * | |
53 | * any improvements or extensions that they make and grant Carnegie Mellon | |
54 | * the rights to redistribute these changes. | |
55 | */ | |
56 | /* | |
57 | */ | |
58 | /* | |
59 | * File: vm_fault.c | |
60 | * Author: Avadis Tevanian, Jr., Michael Wayne Young | |
61 | * | |
62 | * Page fault handling module. | |
63 | */ | |
64 | ||
65 | #include <mach_cluster_stats.h> | |
66 | #include <mach_pagemap.h> | |
67 | #include <libkern/OSAtomic.h> | |
68 | ||
69 | #include <mach/mach_types.h> | |
70 | #include <mach/kern_return.h> | |
71 | #include <mach/message.h> /* for error codes */ | |
72 | #include <mach/vm_param.h> | |
73 | #include <mach/vm_behavior.h> | |
74 | #include <mach/memory_object.h> | |
75 | /* For memory_object_data_{request,unlock} */ | |
76 | #include <mach/sdt.h> | |
77 | ||
78 | #include <kern/kern_types.h> | |
79 | #include <kern/host_statistics.h> | |
80 | #include <kern/counters.h> | |
81 | #include <kern/task.h> | |
82 | #include <kern/thread.h> | |
83 | #include <kern/sched_prim.h> | |
84 | #include <kern/host.h> | |
85 | #include <kern/mach_param.h> | |
86 | #include <kern/macro_help.h> | |
87 | #include <kern/zalloc.h> | |
88 | #include <kern/misc_protos.h> | |
89 | #include <kern/policy_internal.h> | |
90 | ||
91 | #include <vm/vm_compressor.h> | |
92 | #include <vm/vm_compressor_pager.h> | |
93 | #include <vm/vm_fault.h> | |
94 | #include <vm/vm_map.h> | |
95 | #include <vm/vm_object.h> | |
96 | #include <vm/vm_page.h> | |
97 | #include <vm/vm_kern.h> | |
98 | #include <vm/pmap.h> | |
99 | #include <vm/vm_pageout.h> | |
100 | #include <vm/vm_protos.h> | |
101 | #include <vm/vm_external.h> | |
102 | #include <vm/memory_object.h> | |
103 | #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */ | |
104 | #include <vm/vm_shared_region.h> | |
105 | ||
106 | #include <sys/codesign.h> | |
107 | #include <sys/reason.h> | |
108 | #include <sys/signalvar.h> | |
109 | ||
110 | #include <san/kasan.h> | |
111 | ||
112 | #define VM_FAULT_CLASSIFY 0 | |
113 | ||
114 | #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ | |
115 | ||
116 | int vm_protect_privileged_from_untrusted = 1; | |
117 | ||
118 | unsigned int vm_object_pagein_throttle = 16; | |
119 | ||
120 | /* | |
121 | * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which | |
122 | * kicks in when swap space runs out. 64-bit programs have massive address spaces and can leak enormous amounts | |
123 | * of memory if they're buggy and can run the system completely out of swap space. If this happens, we | |
124 | * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps | |
125 | * keep the UI active so that the user has a chance to kill the offending task before the system | |
126 | * completely hangs. | |
127 | * | |
128 | * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied | |
129 | * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold | |
130 | * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a | |
131 | * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again. | |
132 | */ | |
133 | ||
134 | extern void throttle_lowpri_io(int); | |
135 | ||
136 | extern struct vnode *vnode_pager_lookup_vnode(memory_object_t); | |
137 | ||
138 | uint64_t vm_hard_throttle_threshold; | |
139 | ||
140 | ||
141 | OS_ALWAYS_INLINE | |
142 | boolean_t | |
143 | NEED_TO_HARD_THROTTLE_THIS_TASK(void) | |
144 | { | |
145 | return vm_wants_task_throttled(current_task()) || | |
146 | ((vm_page_free_count < vm_page_throttle_limit || | |
147 | HARD_THROTTLE_LIMIT_REACHED()) && | |
148 | proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED); | |
149 | } | |
150 | ||
151 | #define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ | |
152 | #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */ | |
153 | ||
154 | #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6 | |
155 | #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000 | |
156 | ||
157 | ||
158 | #define VM_STAT_DECOMPRESSIONS() \ | |
159 | MACRO_BEGIN \ | |
160 | VM_STAT_INCR(decompressions); \ | |
161 | current_thread()->decompressions++; \ | |
162 | MACRO_END | |
163 | ||
164 | boolean_t current_thread_aborted(void); | |
165 | ||
166 | /* Forward declarations of internal routines. */ | |
167 | static kern_return_t vm_fault_wire_fast( | |
168 | vm_map_t map, | |
169 | vm_map_offset_t va, | |
170 | vm_prot_t prot, | |
171 | vm_tag_t wire_tag, | |
172 | vm_map_entry_t entry, | |
173 | pmap_t pmap, | |
174 | vm_map_offset_t pmap_addr, | |
175 | ppnum_t *physpage_p); | |
176 | ||
177 | static kern_return_t vm_fault_internal( | |
178 | vm_map_t map, | |
179 | vm_map_offset_t vaddr, | |
180 | vm_prot_t caller_prot, | |
181 | boolean_t change_wiring, | |
182 | vm_tag_t wire_tag, | |
183 | int interruptible, | |
184 | pmap_t pmap, | |
185 | vm_map_offset_t pmap_addr, | |
186 | ppnum_t *physpage_p); | |
187 | ||
188 | static void vm_fault_copy_cleanup( | |
189 | vm_page_t page, | |
190 | vm_page_t top_page); | |
191 | ||
192 | static void vm_fault_copy_dst_cleanup( | |
193 | vm_page_t page); | |
194 | ||
195 | #if VM_FAULT_CLASSIFY | |
196 | extern void vm_fault_classify(vm_object_t object, | |
197 | vm_object_offset_t offset, | |
198 | vm_prot_t fault_type); | |
199 | ||
200 | extern void vm_fault_classify_init(void); | |
201 | #endif | |
202 | ||
203 | unsigned long vm_pmap_enter_blocked = 0; | |
204 | unsigned long vm_pmap_enter_retried = 0; | |
205 | ||
206 | unsigned long vm_cs_validates = 0; | |
207 | unsigned long vm_cs_revalidates = 0; | |
208 | unsigned long vm_cs_query_modified = 0; | |
209 | unsigned long vm_cs_validated_dirtied = 0; | |
210 | unsigned long vm_cs_bitmap_validated = 0; | |
211 | ||
212 | void vm_pre_fault(vm_map_offset_t, vm_prot_t); | |
213 | ||
214 | extern char *kdp_compressor_decompressed_page; | |
215 | extern addr64_t kdp_compressor_decompressed_page_paddr; | |
216 | extern ppnum_t kdp_compressor_decompressed_page_ppnum; | |
217 | ||
218 | struct vmrtfr { | |
219 | int vmrtfr_maxi; | |
220 | int vmrtfr_curi; | |
221 | int64_t vmrtf_total; | |
222 | vm_rtfault_record_t *vm_rtf_records; | |
223 | } vmrtfrs; | |
224 | #define VMRTF_DEFAULT_BUFSIZE (4096) | |
225 | #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t)) | |
226 | TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT); | |
227 | ||
228 | static void vm_rtfrecord_lock(void); | |
229 | static void vm_rtfrecord_unlock(void); | |
230 | static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int); | |
231 | ||
232 | extern lck_grp_t vm_page_lck_grp_bucket; | |
233 | extern lck_attr_t vm_page_lck_attr; | |
234 | LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); | |
235 | ||
236 | /* | |
237 | * Routine: vm_fault_init | |
238 | * Purpose: | |
239 | * Initialize our private data structures. | |
240 | */ | |
241 | __startup_func | |
242 | void | |
243 | vm_fault_init(void) | |
244 | { | |
245 | int i, vm_compressor_temp; | |
246 | boolean_t need_default_val = TRUE; | |
247 | /* | |
248 | * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is | |
249 | * computed as a percentage of available memory, and the percentage used is scaled inversely with | |
250 | * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems | |
251 | * and reduce the value down to 10% for very large memory configurations. This helps give us a | |
252 | * definition of a memory hog that makes more sense relative to the amount of ram in the machine. | |
253 | * The formula here simply uses the number of gigabytes of ram to adjust the percentage. | |
254 | */ | |
255 | ||
256 | vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100; | |
257 | ||
258 | /* | |
259 | * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry. | |
260 | */ | |
261 | ||
262 | if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) { | |
263 | for (i = 0; i < VM_PAGER_MAX_MODES; i++) { | |
264 | if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) { | |
265 | need_default_val = FALSE; | |
266 | vm_compressor_mode = vm_compressor_temp; | |
267 | break; | |
268 | } | |
269 | } | |
270 | if (need_default_val) { | |
271 | printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp); | |
272 | } | |
273 | } | |
274 | if (need_default_val) { | |
275 | /* If no boot arg or incorrect boot arg, try device tree. */ | |
276 | PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode)); | |
277 | } | |
278 | printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode); | |
279 | ||
280 | PE_parse_boot_argn("vm_protect_privileged_from_untrusted", | |
281 | &vm_protect_privileged_from_untrusted, | |
282 | sizeof(vm_protect_privileged_from_untrusted)); | |
283 | } | |
284 | ||
285 | __startup_func | |
286 | static void | |
287 | vm_rtfault_record_init(void) | |
288 | { | |
289 | size_t size; | |
290 | ||
291 | vmrtf_num_records = MAX(vmrtf_num_records, 1); | |
292 | size = vmrtf_num_records * sizeof(vm_rtfault_record_t); | |
293 | vmrtfrs.vm_rtf_records = zalloc_permanent(size, | |
294 | ZALIGN(vm_rtfault_record_t)); | |
295 | vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1; | |
296 | } | |
297 | STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init); | |
298 | ||
299 | /* | |
300 | * Routine: vm_fault_cleanup | |
301 | * Purpose: | |
302 | * Clean up the result of vm_fault_page. | |
303 | * Results: | |
304 | * The paging reference for "object" is released. | |
305 | * "object" is unlocked. | |
306 | * If "top_page" is not null, "top_page" is | |
307 | * freed and the paging reference for the object | |
308 | * containing it is released. | |
309 | * | |
310 | * In/out conditions: | |
311 | * "object" must be locked. | |
312 | */ | |
313 | void | |
314 | vm_fault_cleanup( | |
315 | vm_object_t object, | |
316 | vm_page_t top_page) | |
317 | { | |
318 | vm_object_paging_end(object); | |
319 | vm_object_unlock(object); | |
320 | ||
321 | if (top_page != VM_PAGE_NULL) { | |
322 | object = VM_PAGE_OBJECT(top_page); | |
323 | ||
324 | vm_object_lock(object); | |
325 | VM_PAGE_FREE(top_page); | |
326 | vm_object_paging_end(object); | |
327 | vm_object_unlock(object); | |
328 | } | |
329 | } | |
330 | ||
331 | #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) | |
332 | ||
333 | ||
334 | boolean_t vm_page_deactivate_behind = TRUE; | |
335 | /* | |
336 | * default sizes given VM_BEHAVIOR_DEFAULT reference behavior | |
337 | */ | |
338 | #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128 | |
339 | #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */ | |
340 | /* we use it to size an array on the stack */ | |
341 | ||
342 | int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW; | |
343 | ||
344 | #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) | |
345 | ||
346 | /* | |
347 | * vm_page_is_sequential | |
348 | * | |
349 | * Determine if sequential access is in progress | |
350 | * in accordance with the behavior specified. | |
351 | * Update state to indicate current access pattern. | |
352 | * | |
353 | * object must have at least the shared lock held | |
354 | */ | |
355 | static | |
356 | void | |
357 | vm_fault_is_sequential( | |
358 | vm_object_t object, | |
359 | vm_object_offset_t offset, | |
360 | vm_behavior_t behavior) | |
361 | { | |
362 | vm_object_offset_t last_alloc; | |
363 | int sequential; | |
364 | int orig_sequential; | |
365 | ||
366 | last_alloc = object->last_alloc; | |
367 | sequential = object->sequential; | |
368 | orig_sequential = sequential; | |
369 | ||
370 | offset = vm_object_trunc_page(offset); | |
371 | if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) { | |
372 | /* re-faulting in the same page: no change in behavior */ | |
373 | return; | |
374 | } | |
375 | ||
376 | switch (behavior) { | |
377 | case VM_BEHAVIOR_RANDOM: | |
378 | /* | |
379 | * reset indicator of sequential behavior | |
380 | */ | |
381 | sequential = 0; | |
382 | break; | |
383 | ||
384 | case VM_BEHAVIOR_SEQUENTIAL: | |
385 | if (offset && last_alloc == offset - PAGE_SIZE_64) { | |
386 | /* | |
387 | * advance indicator of sequential behavior | |
388 | */ | |
389 | if (sequential < MAX_SEQUENTIAL_RUN) { | |
390 | sequential += PAGE_SIZE; | |
391 | } | |
392 | } else { | |
393 | /* | |
394 | * reset indicator of sequential behavior | |
395 | */ | |
396 | sequential = 0; | |
397 | } | |
398 | break; | |
399 | ||
400 | case VM_BEHAVIOR_RSEQNTL: | |
401 | if (last_alloc && last_alloc == offset + PAGE_SIZE_64) { | |
402 | /* | |
403 | * advance indicator of sequential behavior | |
404 | */ | |
405 | if (sequential > -MAX_SEQUENTIAL_RUN) { | |
406 | sequential -= PAGE_SIZE; | |
407 | } | |
408 | } else { | |
409 | /* | |
410 | * reset indicator of sequential behavior | |
411 | */ | |
412 | sequential = 0; | |
413 | } | |
414 | break; | |
415 | ||
416 | case VM_BEHAVIOR_DEFAULT: | |
417 | default: | |
418 | if (offset && last_alloc == (offset - PAGE_SIZE_64)) { | |
419 | /* | |
420 | * advance indicator of sequential behavior | |
421 | */ | |
422 | if (sequential < 0) { | |
423 | sequential = 0; | |
424 | } | |
425 | if (sequential < MAX_SEQUENTIAL_RUN) { | |
426 | sequential += PAGE_SIZE; | |
427 | } | |
428 | } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) { | |
429 | /* | |
430 | * advance indicator of sequential behavior | |
431 | */ | |
432 | if (sequential > 0) { | |
433 | sequential = 0; | |
434 | } | |
435 | if (sequential > -MAX_SEQUENTIAL_RUN) { | |
436 | sequential -= PAGE_SIZE; | |
437 | } | |
438 | } else { | |
439 | /* | |
440 | * reset indicator of sequential behavior | |
441 | */ | |
442 | sequential = 0; | |
443 | } | |
444 | break; | |
445 | } | |
446 | if (sequential != orig_sequential) { | |
447 | if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) { | |
448 | /* | |
449 | * if someone else has already updated object->sequential | |
450 | * don't bother trying to update it or object->last_alloc | |
451 | */ | |
452 | return; | |
453 | } | |
454 | } | |
455 | /* | |
456 | * I'd like to do this with a OSCompareAndSwap64, but that | |
457 | * doesn't exist for PPC... however, it shouldn't matter | |
458 | * that much... last_alloc is maintained so that we can determine | |
459 | * if a sequential access pattern is taking place... if only | |
460 | * one thread is banging on this object, no problem with the unprotected | |
461 | * update... if 2 or more threads are banging away, we run the risk of | |
462 | * someone seeing a mangled update... however, in the face of multiple | |
463 | * accesses, no sequential access pattern can develop anyway, so we | |
464 | * haven't lost any real info. | |
465 | */ | |
466 | object->last_alloc = offset; | |
467 | } | |
468 | ||
469 | ||
470 | int vm_page_deactivate_behind_count = 0; | |
471 | ||
472 | /* | |
473 | * vm_page_deactivate_behind | |
474 | * | |
475 | * Determine if sequential access is in progress | |
476 | * in accordance with the behavior specified. If | |
477 | * so, compute a potential page to deactivate and | |
478 | * deactivate it. | |
479 | * | |
480 | * object must be locked. | |
481 | * | |
482 | * return TRUE if we actually deactivate a page | |
483 | */ | |
484 | static | |
485 | boolean_t | |
486 | vm_fault_deactivate_behind( | |
487 | vm_object_t object, | |
488 | vm_object_offset_t offset, | |
489 | vm_behavior_t behavior) | |
490 | { | |
491 | int n; | |
492 | int pages_in_run = 0; | |
493 | int max_pages_in_run = 0; | |
494 | int sequential_run; | |
495 | int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; | |
496 | vm_object_offset_t run_offset = 0; | |
497 | vm_object_offset_t pg_offset = 0; | |
498 | vm_page_t m; | |
499 | vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; | |
500 | ||
501 | pages_in_run = 0; | |
502 | #if TRACEFAULTPAGE | |
503 | dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ | |
504 | #endif | |
505 | if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) { | |
506 | /* | |
507 | * Do not deactivate pages from the kernel object: they | |
508 | * are not intended to become pageable. | |
509 | * or we've disabled the deactivate behind mechanism | |
510 | * or we are dealing with an offset that is not aligned to | |
511 | * the system's PAGE_SIZE because in that case we will | |
512 | * handle the deactivation on the aligned offset and, thus, | |
513 | * the full PAGE_SIZE page once. This helps us avoid the redundant | |
514 | * deactivates and the extra faults. | |
515 | */ | |
516 | return FALSE; | |
517 | } | |
518 | if ((sequential_run = object->sequential)) { | |
519 | if (sequential_run < 0) { | |
520 | sequential_behavior = VM_BEHAVIOR_RSEQNTL; | |
521 | sequential_run = 0 - sequential_run; | |
522 | } else { | |
523 | sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; | |
524 | } | |
525 | } | |
526 | switch (behavior) { | |
527 | case VM_BEHAVIOR_RANDOM: | |
528 | break; | |
529 | case VM_BEHAVIOR_SEQUENTIAL: | |
530 | if (sequential_run >= (int)PAGE_SIZE) { | |
531 | run_offset = 0 - PAGE_SIZE_64; | |
532 | max_pages_in_run = 1; | |
533 | } | |
534 | break; | |
535 | case VM_BEHAVIOR_RSEQNTL: | |
536 | if (sequential_run >= (int)PAGE_SIZE) { | |
537 | run_offset = PAGE_SIZE_64; | |
538 | max_pages_in_run = 1; | |
539 | } | |
540 | break; | |
541 | case VM_BEHAVIOR_DEFAULT: | |
542 | default: | |
543 | { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64; | |
544 | ||
545 | /* | |
546 | * determine if the run of sequential accesss has been | |
547 | * long enough on an object with default access behavior | |
548 | * to consider it for deactivation | |
549 | */ | |
550 | if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) { | |
551 | /* | |
552 | * the comparisons between offset and behind are done | |
553 | * in this kind of odd fashion in order to prevent wrap around | |
554 | * at the end points | |
555 | */ | |
556 | if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { | |
557 | if (offset >= behind) { | |
558 | run_offset = 0 - behind; | |
559 | pg_offset = PAGE_SIZE_64; | |
560 | max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; | |
561 | } | |
562 | } else { | |
563 | if (offset < -behind) { | |
564 | run_offset = behind; | |
565 | pg_offset = 0 - PAGE_SIZE_64; | |
566 | max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; | |
567 | } | |
568 | } | |
569 | } | |
570 | break;} | |
571 | } | |
572 | for (n = 0; n < max_pages_in_run; n++) { | |
573 | m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); | |
574 | ||
575 | if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) { | |
576 | page_run[pages_in_run++] = m; | |
577 | ||
578 | /* | |
579 | * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... | |
580 | * | |
581 | * a TLB flush isn't really needed here since at worst we'll miss the reference bit being | |
582 | * updated in the PTE if a remote processor still has this mapping cached in its TLB when the | |
583 | * new reference happens. If no futher references happen on the page after that remote TLB flushes | |
584 | * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue | |
585 | * by pageout_scan, which is just fine since the last reference would have happened quite far | |
586 | * in the past (TLB caches don't hang around for very long), and of course could just as easily | |
587 | * have happened before we did the deactivate_behind. | |
588 | */ | |
589 | pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); | |
590 | } | |
591 | } | |
592 | if (pages_in_run) { | |
593 | vm_page_lockspin_queues(); | |
594 | ||
595 | for (n = 0; n < pages_in_run; n++) { | |
596 | m = page_run[n]; | |
597 | ||
598 | vm_page_deactivate_internal(m, FALSE); | |
599 | ||
600 | vm_page_deactivate_behind_count++; | |
601 | #if TRACEFAULTPAGE | |
602 | dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ | |
603 | #endif | |
604 | } | |
605 | vm_page_unlock_queues(); | |
606 | ||
607 | return TRUE; | |
608 | } | |
609 | return FALSE; | |
610 | } | |
611 | ||
612 | ||
613 | #if (DEVELOPMENT || DEBUG) | |
614 | uint32_t vm_page_creation_throttled_hard = 0; | |
615 | uint32_t vm_page_creation_throttled_soft = 0; | |
616 | uint64_t vm_page_creation_throttle_avoided = 0; | |
617 | #endif /* DEVELOPMENT || DEBUG */ | |
618 | ||
619 | static int | |
620 | vm_page_throttled(boolean_t page_kept) | |
621 | { | |
622 | clock_sec_t elapsed_sec; | |
623 | clock_sec_t tv_sec; | |
624 | clock_usec_t tv_usec; | |
625 | ||
626 | thread_t thread = current_thread(); | |
627 | ||
628 | if (thread->options & TH_OPT_VMPRIV) { | |
629 | return 0; | |
630 | } | |
631 | ||
632 | if (thread->t_page_creation_throttled) { | |
633 | thread->t_page_creation_throttled = 0; | |
634 | ||
635 | if (page_kept == FALSE) { | |
636 | goto no_throttle; | |
637 | } | |
638 | } | |
639 | if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { | |
640 | #if (DEVELOPMENT || DEBUG) | |
641 | thread->t_page_creation_throttled_hard++; | |
642 | OSAddAtomic(1, &vm_page_creation_throttled_hard); | |
643 | #endif /* DEVELOPMENT || DEBUG */ | |
644 | return HARD_THROTTLE_DELAY; | |
645 | } | |
646 | ||
647 | if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) && | |
648 | thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) { | |
649 | if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) { | |
650 | #if (DEVELOPMENT || DEBUG) | |
651 | OSAddAtomic64(1, &vm_page_creation_throttle_avoided); | |
652 | #endif | |
653 | goto no_throttle; | |
654 | } | |
655 | clock_get_system_microtime(&tv_sec, &tv_usec); | |
656 | ||
657 | elapsed_sec = tv_sec - thread->t_page_creation_time; | |
658 | ||
659 | if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS || | |
660 | (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) { | |
661 | if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) { | |
662 | /* | |
663 | * we'll reset our stats to give a well behaved app | |
664 | * that was unlucky enough to accumulate a bunch of pages | |
665 | * over a long period of time a chance to get out of | |
666 | * the throttled state... we reset the counter and timestamp | |
667 | * so that if it stays under the rate limit for the next second | |
668 | * it will be back in our good graces... if it exceeds it, it | |
669 | * will remain in the throttled state | |
670 | */ | |
671 | thread->t_page_creation_time = tv_sec; | |
672 | thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1); | |
673 | } | |
674 | VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1); | |
675 | ||
676 | thread->t_page_creation_throttled = 1; | |
677 | ||
678 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) { | |
679 | #if (DEVELOPMENT || DEBUG) | |
680 | thread->t_page_creation_throttled_hard++; | |
681 | OSAddAtomic(1, &vm_page_creation_throttled_hard); | |
682 | #endif /* DEVELOPMENT || DEBUG */ | |
683 | return HARD_THROTTLE_DELAY; | |
684 | } else { | |
685 | #if (DEVELOPMENT || DEBUG) | |
686 | thread->t_page_creation_throttled_soft++; | |
687 | OSAddAtomic(1, &vm_page_creation_throttled_soft); | |
688 | #endif /* DEVELOPMENT || DEBUG */ | |
689 | return SOFT_THROTTLE_DELAY; | |
690 | } | |
691 | } | |
692 | thread->t_page_creation_time = tv_sec; | |
693 | thread->t_page_creation_count = 0; | |
694 | } | |
695 | no_throttle: | |
696 | thread->t_page_creation_count++; | |
697 | ||
698 | return 0; | |
699 | } | |
700 | ||
701 | ||
702 | /* | |
703 | * check for various conditions that would | |
704 | * prevent us from creating a ZF page... | |
705 | * cleanup is based on being called from vm_fault_page | |
706 | * | |
707 | * object must be locked | |
708 | * object == m->vmp_object | |
709 | */ | |
710 | static vm_fault_return_t | |
711 | vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle) | |
712 | { | |
713 | int throttle_delay; | |
714 | ||
715 | if (object->shadow_severed || | |
716 | VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { | |
717 | /* | |
718 | * Either: | |
719 | * 1. the shadow chain was severed, | |
720 | * 2. the purgeable object is volatile or empty and is marked | |
721 | * to fault on access while volatile. | |
722 | * Just have to return an error at this point | |
723 | */ | |
724 | if (m != VM_PAGE_NULL) { | |
725 | VM_PAGE_FREE(m); | |
726 | } | |
727 | vm_fault_cleanup(object, first_m); | |
728 | ||
729 | thread_interrupt_level(interruptible_state); | |
730 | ||
731 | return VM_FAULT_MEMORY_ERROR; | |
732 | } | |
733 | if (page_throttle == TRUE) { | |
734 | if ((throttle_delay = vm_page_throttled(FALSE))) { | |
735 | /* | |
736 | * we're throttling zero-fills... | |
737 | * treat this as if we couldn't grab a page | |
738 | */ | |
739 | if (m != VM_PAGE_NULL) { | |
740 | VM_PAGE_FREE(m); | |
741 | } | |
742 | vm_fault_cleanup(object, first_m); | |
743 | ||
744 | VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
745 | ||
746 | delay(throttle_delay); | |
747 | ||
748 | if (current_thread_aborted()) { | |
749 | thread_interrupt_level(interruptible_state); | |
750 | return VM_FAULT_INTERRUPTED; | |
751 | } | |
752 | thread_interrupt_level(interruptible_state); | |
753 | ||
754 | return VM_FAULT_MEMORY_SHORTAGE; | |
755 | } | |
756 | } | |
757 | return VM_FAULT_SUCCESS; | |
758 | } | |
759 | ||
760 | /* | |
761 | * Clear the code signing bits on the given page_t | |
762 | */ | |
763 | static void | |
764 | vm_fault_cs_clear(vm_page_t m) | |
765 | { | |
766 | m->vmp_cs_validated = VMP_CS_ALL_FALSE; | |
767 | m->vmp_cs_tainted = VMP_CS_ALL_FALSE; | |
768 | m->vmp_cs_nx = VMP_CS_ALL_FALSE; | |
769 | } | |
770 | ||
771 | /* | |
772 | * Enqueues the given page on the throttled queue. | |
773 | * The caller must hold the vm_page_queue_lock and it will be held on return. | |
774 | */ | |
775 | static void | |
776 | vm_fault_enqueue_throttled_locked(vm_page_t m) | |
777 | { | |
778 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); | |
779 | assert(!VM_PAGE_WIRED(m)); | |
780 | ||
781 | /* | |
782 | * can't be on the pageout queue since we don't | |
783 | * have a pager to try and clean to | |
784 | */ | |
785 | vm_page_queues_remove(m, TRUE); | |
786 | vm_page_check_pageable_safe(m); | |
787 | vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq); | |
788 | m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; | |
789 | vm_page_throttled_count++; | |
790 | } | |
791 | ||
792 | /* | |
793 | * do the work to zero fill a page and | |
794 | * inject it into the correct paging queue | |
795 | * | |
796 | * m->vmp_object must be locked | |
797 | * page queue lock must NOT be held | |
798 | */ | |
799 | static int | |
800 | vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) | |
801 | { | |
802 | int my_fault = DBG_ZERO_FILL_FAULT; | |
803 | vm_object_t object; | |
804 | ||
805 | object = VM_PAGE_OBJECT(m); | |
806 | ||
807 | /* | |
808 | * This is is a zero-fill page fault... | |
809 | * | |
810 | * Checking the page lock is a waste of | |
811 | * time; this page was absent, so | |
812 | * it can't be page locked by a pager. | |
813 | * | |
814 | * we also consider it undefined | |
815 | * with respect to instruction | |
816 | * execution. i.e. it is the responsibility | |
817 | * of higher layers to call for an instruction | |
818 | * sync after changing the contents and before | |
819 | * sending a program into this area. We | |
820 | * choose this approach for performance | |
821 | */ | |
822 | vm_fault_cs_clear(m); | |
823 | m->vmp_pmapped = TRUE; | |
824 | ||
825 | if (no_zero_fill == TRUE) { | |
826 | my_fault = DBG_NZF_PAGE_FAULT; | |
827 | ||
828 | if (m->vmp_absent && m->vmp_busy) { | |
829 | return my_fault; | |
830 | } | |
831 | } else { | |
832 | vm_page_zero_fill(m); | |
833 | ||
834 | VM_STAT_INCR(zero_fill_count); | |
835 | DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); | |
836 | } | |
837 | assert(!m->vmp_laundry); | |
838 | assert(object != kernel_object); | |
839 | //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); | |
840 | if (!VM_DYNAMIC_PAGING_ENABLED() && | |
841 | (object->purgable == VM_PURGABLE_DENY || | |
842 | object->purgable == VM_PURGABLE_NONVOLATILE || | |
843 | object->purgable == VM_PURGABLE_VOLATILE)) { | |
844 | vm_page_lockspin_queues(); | |
845 | if (!VM_DYNAMIC_PAGING_ENABLED()) { | |
846 | vm_fault_enqueue_throttled_locked(m); | |
847 | } | |
848 | vm_page_unlock_queues(); | |
849 | } | |
850 | return my_fault; | |
851 | } | |
852 | ||
853 | ||
854 | /* | |
855 | * Routine: vm_fault_page | |
856 | * Purpose: | |
857 | * Find the resident page for the virtual memory | |
858 | * specified by the given virtual memory object | |
859 | * and offset. | |
860 | * Additional arguments: | |
861 | * The required permissions for the page is given | |
862 | * in "fault_type". Desired permissions are included | |
863 | * in "protection". | |
864 | * fault_info is passed along to determine pagein cluster | |
865 | * limits... it contains the expected reference pattern, | |
866 | * cluster size if available, etc... | |
867 | * | |
868 | * If the desired page is known to be resident (for | |
869 | * example, because it was previously wired down), asserting | |
870 | * the "unwiring" parameter will speed the search. | |
871 | * | |
872 | * If the operation can be interrupted (by thread_abort | |
873 | * or thread_terminate), then the "interruptible" | |
874 | * parameter should be asserted. | |
875 | * | |
876 | * Results: | |
877 | * The page containing the proper data is returned | |
878 | * in "result_page". | |
879 | * | |
880 | * In/out conditions: | |
881 | * The source object must be locked and referenced, | |
882 | * and must donate one paging reference. The reference | |
883 | * is not affected. The paging reference and lock are | |
884 | * consumed. | |
885 | * | |
886 | * If the call succeeds, the object in which "result_page" | |
887 | * resides is left locked and holding a paging reference. | |
888 | * If this is not the original object, a busy page in the | |
889 | * original object is returned in "top_page", to prevent other | |
890 | * callers from pursuing this same data, along with a paging | |
891 | * reference for the original object. The "top_page" should | |
892 | * be destroyed when this guarantee is no longer required. | |
893 | * The "result_page" is also left busy. It is not removed | |
894 | * from the pageout queues. | |
895 | * Special Case: | |
896 | * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the | |
897 | * fault succeeded but there's no VM page (i.e. the VM object | |
898 | * does not actually hold VM pages, but device memory or | |
899 | * large pages). The object is still locked and we still hold a | |
900 | * paging_in_progress reference. | |
901 | */ | |
902 | unsigned int vm_fault_page_blocked_access = 0; | |
903 | unsigned int vm_fault_page_forced_retry = 0; | |
904 | ||
905 | vm_fault_return_t | |
906 | vm_fault_page( | |
907 | /* Arguments: */ | |
908 | vm_object_t first_object, /* Object to begin search */ | |
909 | vm_object_offset_t first_offset, /* Offset into object */ | |
910 | vm_prot_t fault_type, /* What access is requested */ | |
911 | boolean_t must_be_resident,/* Must page be resident? */ | |
912 | boolean_t caller_lookup, /* caller looked up page */ | |
913 | /* Modifies in place: */ | |
914 | vm_prot_t *protection, /* Protection for mapping */ | |
915 | vm_page_t *result_page, /* Page found, if successful */ | |
916 | /* Returns: */ | |
917 | vm_page_t *top_page, /* Page in top object, if | |
918 | * not result_page. */ | |
919 | int *type_of_fault, /* if non-null, fill in with type of fault | |
920 | * COW, zero-fill, etc... returned in trace point */ | |
921 | /* More arguments: */ | |
922 | kern_return_t *error_code, /* code if page is in error */ | |
923 | boolean_t no_zero_fill, /* don't zero fill absent pages */ | |
924 | boolean_t data_supply, /* treat as data_supply if | |
925 | * it is a write fault and a full | |
926 | * page is provided */ | |
927 | vm_object_fault_info_t fault_info) | |
928 | { | |
929 | vm_page_t m; | |
930 | vm_object_t object; | |
931 | vm_object_offset_t offset; | |
932 | vm_page_t first_m; | |
933 | vm_object_t next_object; | |
934 | vm_object_t copy_object; | |
935 | boolean_t look_for_page; | |
936 | boolean_t force_fault_retry = FALSE; | |
937 | vm_prot_t access_required = fault_type; | |
938 | vm_prot_t wants_copy_flag; | |
939 | kern_return_t wait_result; | |
940 | wait_interrupt_t interruptible_state; | |
941 | boolean_t data_already_requested = FALSE; | |
942 | vm_behavior_t orig_behavior; | |
943 | vm_size_t orig_cluster_size; | |
944 | vm_fault_return_t error; | |
945 | int my_fault; | |
946 | uint32_t try_failed_count; | |
947 | int interruptible; /* how may fault be interrupted? */ | |
948 | int external_state = VM_EXTERNAL_STATE_UNKNOWN; | |
949 | memory_object_t pager; | |
950 | vm_fault_return_t retval; | |
951 | int grab_options; | |
952 | ||
953 | /* | |
954 | * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is | |
955 | * marked as paged out in the compressor pager or the pager doesn't exist. | |
956 | * Note also that if the pager for an internal object | |
957 | * has not been created, the pager is not invoked regardless of the value | |
958 | * of MUST_ASK_PAGER(). | |
959 | * | |
960 | * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset | |
961 | * is marked as paged out in the compressor pager. | |
962 | * PAGED_OUT() is used to determine if a page has already been pushed | |
963 | * into a copy object in order to avoid a redundant page out operation. | |
964 | */ | |
965 | #define MUST_ASK_PAGER(o, f, s) \ | |
966 | ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT) | |
967 | ||
968 | #define PAGED_OUT(o, f) \ | |
969 | (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS) | |
970 | ||
971 | /* | |
972 | * Recovery actions | |
973 | */ | |
974 | #define RELEASE_PAGE(m) \ | |
975 | MACRO_BEGIN \ | |
976 | PAGE_WAKEUP_DONE(m); \ | |
977 | if ( !VM_PAGE_PAGEABLE(m)) { \ | |
978 | vm_page_lockspin_queues(); \ | |
979 | if ( !VM_PAGE_PAGEABLE(m)) { \ | |
980 | if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \ | |
981 | vm_page_deactivate(m); \ | |
982 | else \ | |
983 | vm_page_activate(m); \ | |
984 | } \ | |
985 | vm_page_unlock_queues(); \ | |
986 | } \ | |
987 | MACRO_END | |
988 | ||
989 | #if TRACEFAULTPAGE | |
990 | dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */ | |
991 | #endif | |
992 | ||
993 | interruptible = fault_info->interruptible; | |
994 | interruptible_state = thread_interrupt_level(interruptible); | |
995 | ||
996 | /* | |
997 | * INVARIANTS (through entire routine): | |
998 | * | |
999 | * 1) At all times, we must either have the object | |
1000 | * lock or a busy page in some object to prevent | |
1001 | * some other thread from trying to bring in | |
1002 | * the same page. | |
1003 | * | |
1004 | * Note that we cannot hold any locks during the | |
1005 | * pager access or when waiting for memory, so | |
1006 | * we use a busy page then. | |
1007 | * | |
1008 | * 2) To prevent another thread from racing us down the | |
1009 | * shadow chain and entering a new page in the top | |
1010 | * object before we do, we must keep a busy page in | |
1011 | * the top object while following the shadow chain. | |
1012 | * | |
1013 | * 3) We must increment paging_in_progress on any object | |
1014 | * for which we have a busy page before dropping | |
1015 | * the object lock | |
1016 | * | |
1017 | * 4) We leave busy pages on the pageout queues. | |
1018 | * If the pageout daemon comes across a busy page, | |
1019 | * it will remove the page from the pageout queues. | |
1020 | */ | |
1021 | ||
1022 | object = first_object; | |
1023 | offset = first_offset; | |
1024 | first_m = VM_PAGE_NULL; | |
1025 | access_required = fault_type; | |
1026 | ||
1027 | /* | |
1028 | * default type of fault | |
1029 | */ | |
1030 | my_fault = DBG_CACHE_HIT_FAULT; | |
1031 | ||
1032 | while (TRUE) { | |
1033 | #if TRACEFAULTPAGE | |
1034 | dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1035 | #endif | |
1036 | ||
1037 | grab_options = 0; | |
1038 | #if CONFIG_SECLUDED_MEMORY | |
1039 | if (object->can_grab_secluded) { | |
1040 | grab_options |= VM_PAGE_GRAB_SECLUDED; | |
1041 | } | |
1042 | #endif /* CONFIG_SECLUDED_MEMORY */ | |
1043 | ||
1044 | if (!object->alive) { | |
1045 | /* | |
1046 | * object is no longer valid | |
1047 | * clean up and return error | |
1048 | */ | |
1049 | vm_fault_cleanup(object, first_m); | |
1050 | thread_interrupt_level(interruptible_state); | |
1051 | ||
1052 | return VM_FAULT_MEMORY_ERROR; | |
1053 | } | |
1054 | ||
1055 | if (!object->pager_created && object->phys_contiguous) { | |
1056 | /* | |
1057 | * A physically-contiguous object without a pager: | |
1058 | * must be a "large page" object. We do not deal | |
1059 | * with VM pages for this object. | |
1060 | */ | |
1061 | caller_lookup = FALSE; | |
1062 | m = VM_PAGE_NULL; | |
1063 | goto phys_contig_object; | |
1064 | } | |
1065 | ||
1066 | if (object->blocked_access) { | |
1067 | /* | |
1068 | * Access to this VM object has been blocked. | |
1069 | * Replace our "paging_in_progress" reference with | |
1070 | * a "activity_in_progress" reference and wait for | |
1071 | * access to be unblocked. | |
1072 | */ | |
1073 | caller_lookup = FALSE; /* no longer valid after sleep */ | |
1074 | vm_object_activity_begin(object); | |
1075 | vm_object_paging_end(object); | |
1076 | while (object->blocked_access) { | |
1077 | vm_object_sleep(object, | |
1078 | VM_OBJECT_EVENT_UNBLOCKED, | |
1079 | THREAD_UNINT); | |
1080 | } | |
1081 | vm_fault_page_blocked_access++; | |
1082 | vm_object_paging_begin(object); | |
1083 | vm_object_activity_end(object); | |
1084 | } | |
1085 | ||
1086 | /* | |
1087 | * See whether the page at 'offset' is resident | |
1088 | */ | |
1089 | if (caller_lookup == TRUE) { | |
1090 | /* | |
1091 | * The caller has already looked up the page | |
1092 | * and gave us the result in "result_page". | |
1093 | * We can use this for the first lookup but | |
1094 | * it loses its validity as soon as we unlock | |
1095 | * the object. | |
1096 | */ | |
1097 | m = *result_page; | |
1098 | caller_lookup = FALSE; /* no longer valid after that */ | |
1099 | } else { | |
1100 | m = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
1101 | } | |
1102 | #if TRACEFAULTPAGE | |
1103 | dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ | |
1104 | #endif | |
1105 | if (m != VM_PAGE_NULL) { | |
1106 | if (m->vmp_busy) { | |
1107 | /* | |
1108 | * The page is being brought in, | |
1109 | * wait for it and then retry. | |
1110 | */ | |
1111 | #if TRACEFAULTPAGE | |
1112 | dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1113 | #endif | |
1114 | wait_result = PAGE_SLEEP(object, m, interruptible); | |
1115 | ||
1116 | counter(c_vm_fault_page_block_busy_kernel++); | |
1117 | ||
1118 | if (wait_result != THREAD_AWAKENED) { | |
1119 | vm_fault_cleanup(object, first_m); | |
1120 | thread_interrupt_level(interruptible_state); | |
1121 | ||
1122 | if (wait_result == THREAD_RESTART) { | |
1123 | return VM_FAULT_RETRY; | |
1124 | } else { | |
1125 | return VM_FAULT_INTERRUPTED; | |
1126 | } | |
1127 | } | |
1128 | continue; | |
1129 | } | |
1130 | if (m->vmp_laundry) { | |
1131 | m->vmp_free_when_done = FALSE; | |
1132 | ||
1133 | if (!m->vmp_cleaning) { | |
1134 | vm_pageout_steal_laundry(m, FALSE); | |
1135 | } | |
1136 | } | |
1137 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
1138 | /* | |
1139 | * Guard page: off limits ! | |
1140 | */ | |
1141 | if (fault_type == VM_PROT_NONE) { | |
1142 | /* | |
1143 | * The fault is not requesting any | |
1144 | * access to the guard page, so it must | |
1145 | * be just to wire or unwire it. | |
1146 | * Let's pretend it succeeded... | |
1147 | */ | |
1148 | m->vmp_busy = TRUE; | |
1149 | *result_page = m; | |
1150 | assert(first_m == VM_PAGE_NULL); | |
1151 | *top_page = first_m; | |
1152 | if (type_of_fault) { | |
1153 | *type_of_fault = DBG_GUARD_FAULT; | |
1154 | } | |
1155 | thread_interrupt_level(interruptible_state); | |
1156 | return VM_FAULT_SUCCESS; | |
1157 | } else { | |
1158 | /* | |
1159 | * The fault requests access to the | |
1160 | * guard page: let's deny that ! | |
1161 | */ | |
1162 | vm_fault_cleanup(object, first_m); | |
1163 | thread_interrupt_level(interruptible_state); | |
1164 | return VM_FAULT_MEMORY_ERROR; | |
1165 | } | |
1166 | } | |
1167 | ||
1168 | if (m->vmp_error) { | |
1169 | /* | |
1170 | * The page is in error, give up now. | |
1171 | */ | |
1172 | #if TRACEFAULTPAGE | |
1173 | dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */ | |
1174 | #endif | |
1175 | if (error_code) { | |
1176 | *error_code = KERN_MEMORY_ERROR; | |
1177 | } | |
1178 | VM_PAGE_FREE(m); | |
1179 | ||
1180 | vm_fault_cleanup(object, first_m); | |
1181 | thread_interrupt_level(interruptible_state); | |
1182 | ||
1183 | return VM_FAULT_MEMORY_ERROR; | |
1184 | } | |
1185 | if (m->vmp_restart) { | |
1186 | /* | |
1187 | * The pager wants us to restart | |
1188 | * at the top of the chain, | |
1189 | * typically because it has moved the | |
1190 | * page to another pager, then do so. | |
1191 | */ | |
1192 | #if TRACEFAULTPAGE | |
1193 | dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1194 | #endif | |
1195 | VM_PAGE_FREE(m); | |
1196 | ||
1197 | vm_fault_cleanup(object, first_m); | |
1198 | thread_interrupt_level(interruptible_state); | |
1199 | ||
1200 | return VM_FAULT_RETRY; | |
1201 | } | |
1202 | if (m->vmp_absent) { | |
1203 | /* | |
1204 | * The page isn't busy, but is absent, | |
1205 | * therefore it's deemed "unavailable". | |
1206 | * | |
1207 | * Remove the non-existent page (unless it's | |
1208 | * in the top object) and move on down to the | |
1209 | * next object (if there is one). | |
1210 | */ | |
1211 | #if TRACEFAULTPAGE | |
1212 | dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */ | |
1213 | #endif | |
1214 | next_object = object->shadow; | |
1215 | ||
1216 | if (next_object == VM_OBJECT_NULL) { | |
1217 | /* | |
1218 | * Absent page at bottom of shadow | |
1219 | * chain; zero fill the page we left | |
1220 | * busy in the first object, and free | |
1221 | * the absent page. | |
1222 | */ | |
1223 | assert(!must_be_resident); | |
1224 | ||
1225 | /* | |
1226 | * check for any conditions that prevent | |
1227 | * us from creating a new zero-fill page | |
1228 | * vm_fault_check will do all of the | |
1229 | * fault cleanup in the case of an error condition | |
1230 | * including resetting the thread_interrupt_level | |
1231 | */ | |
1232 | error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); | |
1233 | ||
1234 | if (error != VM_FAULT_SUCCESS) { | |
1235 | return error; | |
1236 | } | |
1237 | ||
1238 | if (object != first_object) { | |
1239 | /* | |
1240 | * free the absent page we just found | |
1241 | */ | |
1242 | VM_PAGE_FREE(m); | |
1243 | ||
1244 | /* | |
1245 | * drop reference and lock on current object | |
1246 | */ | |
1247 | vm_object_paging_end(object); | |
1248 | vm_object_unlock(object); | |
1249 | ||
1250 | /* | |
1251 | * grab the original page we | |
1252 | * 'soldered' in place and | |
1253 | * retake lock on 'first_object' | |
1254 | */ | |
1255 | m = first_m; | |
1256 | first_m = VM_PAGE_NULL; | |
1257 | ||
1258 | object = first_object; | |
1259 | offset = first_offset; | |
1260 | ||
1261 | vm_object_lock(object); | |
1262 | } else { | |
1263 | /* | |
1264 | * we're going to use the absent page we just found | |
1265 | * so convert it to a 'busy' page | |
1266 | */ | |
1267 | m->vmp_absent = FALSE; | |
1268 | m->vmp_busy = TRUE; | |
1269 | } | |
1270 | if (fault_info->mark_zf_absent && no_zero_fill == TRUE) { | |
1271 | m->vmp_absent = TRUE; | |
1272 | } | |
1273 | /* | |
1274 | * zero-fill the page and put it on | |
1275 | * the correct paging queue | |
1276 | */ | |
1277 | my_fault = vm_fault_zero_page(m, no_zero_fill); | |
1278 | ||
1279 | break; | |
1280 | } else { | |
1281 | if (must_be_resident) { | |
1282 | vm_object_paging_end(object); | |
1283 | } else if (object != first_object) { | |
1284 | vm_object_paging_end(object); | |
1285 | VM_PAGE_FREE(m); | |
1286 | } else { | |
1287 | first_m = m; | |
1288 | m->vmp_absent = FALSE; | |
1289 | m->vmp_busy = TRUE; | |
1290 | ||
1291 | vm_page_lockspin_queues(); | |
1292 | vm_page_queues_remove(m, FALSE); | |
1293 | vm_page_unlock_queues(); | |
1294 | } | |
1295 | ||
1296 | offset += object->vo_shadow_offset; | |
1297 | fault_info->lo_offset += object->vo_shadow_offset; | |
1298 | fault_info->hi_offset += object->vo_shadow_offset; | |
1299 | access_required = VM_PROT_READ; | |
1300 | ||
1301 | vm_object_lock(next_object); | |
1302 | vm_object_unlock(object); | |
1303 | object = next_object; | |
1304 | vm_object_paging_begin(object); | |
1305 | ||
1306 | /* | |
1307 | * reset to default type of fault | |
1308 | */ | |
1309 | my_fault = DBG_CACHE_HIT_FAULT; | |
1310 | ||
1311 | continue; | |
1312 | } | |
1313 | } | |
1314 | if ((m->vmp_cleaning) | |
1315 | && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) | |
1316 | && (fault_type & VM_PROT_WRITE)) { | |
1317 | /* | |
1318 | * This is a copy-on-write fault that will | |
1319 | * cause us to revoke access to this page, but | |
1320 | * this page is in the process of being cleaned | |
1321 | * in a clustered pageout. We must wait until | |
1322 | * the cleaning operation completes before | |
1323 | * revoking access to the original page, | |
1324 | * otherwise we might attempt to remove a | |
1325 | * wired mapping. | |
1326 | */ | |
1327 | #if TRACEFAULTPAGE | |
1328 | dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */ | |
1329 | #endif | |
1330 | /* | |
1331 | * take an extra ref so that object won't die | |
1332 | */ | |
1333 | vm_object_reference_locked(object); | |
1334 | ||
1335 | vm_fault_cleanup(object, first_m); | |
1336 | ||
1337 | counter(c_vm_fault_page_block_backoff_kernel++); | |
1338 | vm_object_lock(object); | |
1339 | assert(object->ref_count > 0); | |
1340 | ||
1341 | m = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
1342 | ||
1343 | if (m != VM_PAGE_NULL && m->vmp_cleaning) { | |
1344 | PAGE_ASSERT_WAIT(m, interruptible); | |
1345 | ||
1346 | vm_object_unlock(object); | |
1347 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
1348 | vm_object_deallocate(object); | |
1349 | ||
1350 | goto backoff; | |
1351 | } else { | |
1352 | vm_object_unlock(object); | |
1353 | ||
1354 | vm_object_deallocate(object); | |
1355 | thread_interrupt_level(interruptible_state); | |
1356 | ||
1357 | return VM_FAULT_RETRY; | |
1358 | } | |
1359 | } | |
1360 | if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) && | |
1361 | !(fault_info != NULL && fault_info->stealth)) { | |
1362 | /* | |
1363 | * If we were passed a non-NULL pointer for | |
1364 | * "type_of_fault", than we came from | |
1365 | * vm_fault... we'll let it deal with | |
1366 | * this condition, since it | |
1367 | * needs to see m->vmp_speculative to correctly | |
1368 | * account the pageins, otherwise... | |
1369 | * take it off the speculative queue, we'll | |
1370 | * let the caller of vm_fault_page deal | |
1371 | * with getting it onto the correct queue | |
1372 | * | |
1373 | * If the caller specified in fault_info that | |
1374 | * it wants a "stealth" fault, we also leave | |
1375 | * the page in the speculative queue. | |
1376 | */ | |
1377 | vm_page_lockspin_queues(); | |
1378 | if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) { | |
1379 | vm_page_queues_remove(m, FALSE); | |
1380 | } | |
1381 | vm_page_unlock_queues(); | |
1382 | } | |
1383 | assert(object == VM_PAGE_OBJECT(m)); | |
1384 | ||
1385 | if (object->code_signed) { | |
1386 | /* | |
1387 | * CODE SIGNING: | |
1388 | * We just paged in a page from a signed | |
1389 | * memory object but we don't need to | |
1390 | * validate it now. We'll validate it if | |
1391 | * when it gets mapped into a user address | |
1392 | * space for the first time or when the page | |
1393 | * gets copied to another object as a result | |
1394 | * of a copy-on-write. | |
1395 | */ | |
1396 | } | |
1397 | ||
1398 | /* | |
1399 | * We mark the page busy and leave it on | |
1400 | * the pageout queues. If the pageout | |
1401 | * deamon comes across it, then it will | |
1402 | * remove the page from the queue, but not the object | |
1403 | */ | |
1404 | #if TRACEFAULTPAGE | |
1405 | dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1406 | #endif | |
1407 | assert(!m->vmp_busy); | |
1408 | assert(!m->vmp_absent); | |
1409 | ||
1410 | m->vmp_busy = TRUE; | |
1411 | break; | |
1412 | } | |
1413 | ||
1414 | ||
1415 | /* | |
1416 | * we get here when there is no page present in the object at | |
1417 | * the offset we're interested in... we'll allocate a page | |
1418 | * at this point if the pager associated with | |
1419 | * this object can provide the data or we're the top object... | |
1420 | * object is locked; m == NULL | |
1421 | */ | |
1422 | ||
1423 | if (must_be_resident) { | |
1424 | if (fault_type == VM_PROT_NONE && | |
1425 | object == kernel_object) { | |
1426 | /* | |
1427 | * We've been called from vm_fault_unwire() | |
1428 | * while removing a map entry that was allocated | |
1429 | * with KMA_KOBJECT and KMA_VAONLY. This page | |
1430 | * is not present and there's nothing more to | |
1431 | * do here (nothing to unwire). | |
1432 | */ | |
1433 | vm_fault_cleanup(object, first_m); | |
1434 | thread_interrupt_level(interruptible_state); | |
1435 | ||
1436 | return VM_FAULT_MEMORY_ERROR; | |
1437 | } | |
1438 | ||
1439 | goto dont_look_for_page; | |
1440 | } | |
1441 | ||
1442 | /* Don't expect to fault pages into the kernel object. */ | |
1443 | assert(object != kernel_object); | |
1444 | ||
1445 | data_supply = FALSE; | |
1446 | ||
1447 | look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply); | |
1448 | ||
1449 | #if TRACEFAULTPAGE | |
1450 | dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ | |
1451 | #endif | |
1452 | if (!look_for_page && object == first_object && !object->phys_contiguous) { | |
1453 | /* | |
1454 | * Allocate a new page for this object/offset pair as a placeholder | |
1455 | */ | |
1456 | m = vm_page_grab_options(grab_options); | |
1457 | #if TRACEFAULTPAGE | |
1458 | dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ | |
1459 | #endif | |
1460 | if (m == VM_PAGE_NULL) { | |
1461 | vm_fault_cleanup(object, first_m); | |
1462 | thread_interrupt_level(interruptible_state); | |
1463 | ||
1464 | return VM_FAULT_MEMORY_SHORTAGE; | |
1465 | } | |
1466 | ||
1467 | if (fault_info && fault_info->batch_pmap_op == TRUE) { | |
1468 | vm_page_insert_internal(m, object, | |
1469 | vm_object_trunc_page(offset), | |
1470 | VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); | |
1471 | } else { | |
1472 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
1473 | } | |
1474 | } | |
1475 | if (look_for_page) { | |
1476 | kern_return_t rc; | |
1477 | int my_fault_type; | |
1478 | ||
1479 | /* | |
1480 | * If the memory manager is not ready, we | |
1481 | * cannot make requests. | |
1482 | */ | |
1483 | if (!object->pager_ready) { | |
1484 | #if TRACEFAULTPAGE | |
1485 | dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1486 | #endif | |
1487 | if (m != VM_PAGE_NULL) { | |
1488 | VM_PAGE_FREE(m); | |
1489 | } | |
1490 | ||
1491 | /* | |
1492 | * take an extra ref so object won't die | |
1493 | */ | |
1494 | vm_object_reference_locked(object); | |
1495 | vm_fault_cleanup(object, first_m); | |
1496 | counter(c_vm_fault_page_block_backoff_kernel++); | |
1497 | ||
1498 | vm_object_lock(object); | |
1499 | assert(object->ref_count > 0); | |
1500 | ||
1501 | if (!object->pager_ready) { | |
1502 | wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); | |
1503 | ||
1504 | vm_object_unlock(object); | |
1505 | if (wait_result == THREAD_WAITING) { | |
1506 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
1507 | } | |
1508 | vm_object_deallocate(object); | |
1509 | ||
1510 | goto backoff; | |
1511 | } else { | |
1512 | vm_object_unlock(object); | |
1513 | vm_object_deallocate(object); | |
1514 | thread_interrupt_level(interruptible_state); | |
1515 | ||
1516 | return VM_FAULT_RETRY; | |
1517 | } | |
1518 | } | |
1519 | if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) { | |
1520 | /* | |
1521 | * If there are too many outstanding page | |
1522 | * requests pending on this external object, we | |
1523 | * wait for them to be resolved now. | |
1524 | */ | |
1525 | #if TRACEFAULTPAGE | |
1526 | dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1527 | #endif | |
1528 | if (m != VM_PAGE_NULL) { | |
1529 | VM_PAGE_FREE(m); | |
1530 | } | |
1531 | /* | |
1532 | * take an extra ref so object won't die | |
1533 | */ | |
1534 | vm_object_reference_locked(object); | |
1535 | ||
1536 | vm_fault_cleanup(object, first_m); | |
1537 | ||
1538 | counter(c_vm_fault_page_block_backoff_kernel++); | |
1539 | ||
1540 | vm_object_lock(object); | |
1541 | assert(object->ref_count > 0); | |
1542 | ||
1543 | if (object->paging_in_progress >= vm_object_pagein_throttle) { | |
1544 | vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible); | |
1545 | ||
1546 | vm_object_unlock(object); | |
1547 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
1548 | vm_object_deallocate(object); | |
1549 | ||
1550 | goto backoff; | |
1551 | } else { | |
1552 | vm_object_unlock(object); | |
1553 | vm_object_deallocate(object); | |
1554 | thread_interrupt_level(interruptible_state); | |
1555 | ||
1556 | return VM_FAULT_RETRY; | |
1557 | } | |
1558 | } | |
1559 | if (object->internal) { | |
1560 | int compressed_count_delta; | |
1561 | ||
1562 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); | |
1563 | ||
1564 | if (m == VM_PAGE_NULL) { | |
1565 | /* | |
1566 | * Allocate a new page for this object/offset pair as a placeholder | |
1567 | */ | |
1568 | m = vm_page_grab_options(grab_options); | |
1569 | #if TRACEFAULTPAGE | |
1570 | dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ | |
1571 | #endif | |
1572 | if (m == VM_PAGE_NULL) { | |
1573 | vm_fault_cleanup(object, first_m); | |
1574 | thread_interrupt_level(interruptible_state); | |
1575 | ||
1576 | return VM_FAULT_MEMORY_SHORTAGE; | |
1577 | } | |
1578 | ||
1579 | m->vmp_absent = TRUE; | |
1580 | if (fault_info && fault_info->batch_pmap_op == TRUE) { | |
1581 | vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); | |
1582 | } else { | |
1583 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
1584 | } | |
1585 | } | |
1586 | assert(m->vmp_busy); | |
1587 | ||
1588 | m->vmp_absent = TRUE; | |
1589 | pager = object->pager; | |
1590 | ||
1591 | assert(object->paging_in_progress > 0); | |
1592 | vm_object_unlock(object); | |
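| /* | |
|  * Dropping the object lock is safe here: the busy/absent placeholder page | |
|  * keeps other faulters away from this offset, and the paging_in_progress | |
|  * reference (asserted above) keeps the object from being reaped while we | |
|  * call into the compressor pager. | |
|  */ | |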
1593 | ||
1594 | rc = vm_compressor_pager_get( | |
1595 | pager, | |
1596 | offset + object->paging_offset, | |
1597 | VM_PAGE_GET_PHYS_PAGE(m), | |
1598 | &my_fault_type, | |
1599 | 0, | |
1600 | &compressed_count_delta); | |
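| /* | |
|  * vm_compressor_pager_get() decompresses the slot (if any) directly into | |
|  * the physical page grabbed above and reports, via compressed_count_delta, | |
|  * how the pager's count of compressed pages changed; that delta is | |
|  * accounted for under the object lock below. | |
|  */ | |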
1601 | ||
1602 | if (type_of_fault == NULL) { | |
1603 | int throttle_delay; | |
1604 | ||
1605 | /* | |
1606 | * we weren't called from vm_fault, so we | |
1607 | * need to apply page creation throttling; | |
1608 | * do it before we re-acquire any locks | |
1609 | */ | |
1610 | if (my_fault_type == DBG_COMPRESSOR_FAULT) { | |
1611 | if ((throttle_delay = vm_page_throttled(TRUE))) { | |
1612 | VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0); | |
1613 | delay(throttle_delay); | |
1614 | } | |
1615 | } | |
1616 | } | |
1617 | vm_object_lock(object); | |
1618 | assert(object->paging_in_progress > 0); | |
1619 | ||
1620 | vm_compressor_pager_count( | |
1621 | pager, | |
1622 | compressed_count_delta, | |
1623 | FALSE, /* shared_lock */ | |
1624 | object); | |
1625 | ||
1626 | switch (rc) { | |
1627 | case KERN_SUCCESS: | |
1628 | m->vmp_absent = FALSE; | |
1629 | m->vmp_dirty = TRUE; | |
1630 | if ((object->wimg_bits & | |
1631 | VM_WIMG_MASK) != | |
1632 | VM_WIMG_USE_DEFAULT) { | |
1633 | /* | |
1634 | * If the page is not cacheable, | |
1635 | * we can't let its contents | |
1636 | * linger in the data cache | |
1637 | * after the decompression. | |
1638 | */ | |
1639 | pmap_sync_page_attributes_phys( | |
1640 | VM_PAGE_GET_PHYS_PAGE(m)); | |
1641 | } else { | |
1642 | m->vmp_written_by_kernel = TRUE; | |
1643 | } | |
1644 | ||
1645 | /* | |
1646 | * If the object is purgeable, its | |
1647 | * owner's purgeable ledgers have been | |
1648 | * updated in vm_page_insert() but the | |
1649 | * page was also accounted for in a | |
1650 | * "compressed purgeable" ledger, so | |
1651 | * update that now. | |
1652 | */ | |
1653 | if (((object->purgable != | |
1654 | VM_PURGABLE_DENY) || | |
1655 | object->vo_ledger_tag) && | |
1656 | (object->vo_owner != | |
1657 | NULL)) { | |
1658 | /* | |
1659 | * One less compressed | |
1660 | * purgeable/tagged page. | |
1661 | */ | |
1662 | vm_object_owner_compressed_update( | |
1663 | object, | |
1664 | -1); | |
1665 | } | |
1666 | ||
1667 | break; | |
1668 | case KERN_MEMORY_FAILURE: | |
1669 | m->vmp_unusual = TRUE; | |
1670 | m->vmp_error = TRUE; | |
1671 | m->vmp_absent = FALSE; | |
1672 | break; | |
1673 | case KERN_MEMORY_ERROR: | |
1674 | assert(m->vmp_absent); | |
1675 | break; | |
1676 | default: | |
1677 | panic("vm_fault_page(): unexpected " | |
1678 | "error %d from " | |
1679 | "vm_compressor_pager_get()\n", | |
1680 | rc); | |
1681 | } | |
1682 | PAGE_WAKEUP_DONE(m); | |
1683 | ||
1684 | rc = KERN_SUCCESS; | |
1685 | goto data_requested; | |
1686 | } | |
1687 | my_fault_type = DBG_PAGEIN_FAULT; | |
1688 | ||
1689 | if (m != VM_PAGE_NULL) { | |
1690 | VM_PAGE_FREE(m); | |
1691 | m = VM_PAGE_NULL; | |
1692 | } | |
1693 | ||
1694 | #if TRACEFAULTPAGE | |
1695 | dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1696 | #endif | |
1697 | ||
1698 | /* | |
1699 | * It's possible someone called vm_object_destroy while we weren't | |
1700 | * holding the object lock. If that has happened, then bail out | |
1701 | * here. | |
1702 | */ | |
1703 | ||
1704 | pager = object->pager; | |
1705 | ||
1706 | if (pager == MEMORY_OBJECT_NULL) { | |
1707 | vm_fault_cleanup(object, first_m); | |
1708 | thread_interrupt_level(interruptible_state); | |
1709 | return VM_FAULT_MEMORY_ERROR; | |
1710 | } | |
1711 | ||
1712 | /* | |
1713 | * We have an absent page in place for the faulting offset, | |
1714 | * so we can release the object lock. | |
1715 | */ | |
1716 | ||
1717 | if (object->object_is_shared_cache) { | |
1718 | set_thread_rwlock_boost(); | |
1719 | } | |
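| /* | |
|  * For shared-cache objects, boost this thread while the pagein is | |
|  * outstanding (the boost is dropped after the object lock is re-taken | |
|  * below), presumably to limit priority inversions on shared region | |
|  * faults. | |
|  */ | |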
1720 | ||
1721 | vm_object_unlock(object); | |
1722 | ||
1723 | /* | |
1724 | * If this object uses a copy_call strategy, | |
1725 | * and we are interested in a copy of this object | |
1726 | * (having gotten here only by following a | |
1727 | * shadow chain), then tell the memory manager | |
1728 | * via a flag added to the desired_access | |
1729 | * parameter, so that it can detect a race | |
1730 | * between our walking down the shadow chain | |
1731 | * and its pushing pages up into a copy of | |
1732 | * the object that it manages. | |
1733 | */ | |
1734 | if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) { | |
1735 | wants_copy_flag = VM_PROT_WANTS_COPY; | |
1736 | } else { | |
1737 | wants_copy_flag = VM_PROT_NONE; | |
1738 | } | |
1739 | ||
1740 | if (object->copy == first_object) { | |
1741 | /* | |
1742 | * if we issue the memory_object_data_request in | |
1743 | * this state, we are subject to a deadlock with | |
1744 | * the underlying filesystem if it is trying to | |
1745 | * shrink the file resulting in a push of pages | |
1746 | * into the copy object... that push will stall | |
1747 | * on the placeholder page, and if the pushing thread | |
1748 | * is holding a lock that is required on the pagein | |
1749 | * path (such as a truncate lock), we'll deadlock... | |
1750 | * to avoid this potential deadlock, we throw away | |
1751 | * our placeholder page before calling memory_object_data_request | |
1752 | * and force this thread to retry the vm_fault_page after | |
1753 | * we have issued the I/O. the second time through this path | |
1754 | * we will find the page already in the cache (presumably still | |
1755 | * busy waiting for the I/O to complete) and then complete | |
1756 | * the fault w/o having to go through memory_object_data_request again | |
1757 | */ | |
1758 | assert(first_m != VM_PAGE_NULL); | |
1759 | assert(VM_PAGE_OBJECT(first_m) == first_object); | |
1760 | ||
1761 | vm_object_lock(first_object); | |
1762 | VM_PAGE_FREE(first_m); | |
1763 | vm_object_paging_end(first_object); | |
1764 | vm_object_unlock(first_object); | |
1765 | ||
1766 | first_m = VM_PAGE_NULL; | |
1767 | force_fault_retry = TRUE; | |
1768 | ||
1769 | vm_fault_page_forced_retry++; | |
1770 | } | |
1771 | ||
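| /* | |
|  * If a data request was already issued earlier in this fault, shrink the | |
|  * next one to a single page with random behavior so the pager doesn't | |
|  * re-issue the whole cluster; the caller's behavior and cluster_size are | |
|  * restored right after the request. | |
|  */ | |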
1772 | if (data_already_requested == TRUE) { | |
1773 | orig_behavior = fault_info->behavior; | |
1774 | orig_cluster_size = fault_info->cluster_size; | |
1775 | ||
1776 | fault_info->behavior = VM_BEHAVIOR_RANDOM; | |
1777 | fault_info->cluster_size = PAGE_SIZE; | |
1778 | } | |
1779 | /* | |
1780 | * Call the memory manager to retrieve the data. | |
1781 | */ | |
1782 | rc = memory_object_data_request( | |
1783 | pager, | |
1784 | vm_object_trunc_page(offset) + object->paging_offset, | |
1785 | PAGE_SIZE, | |
1786 | access_required | wants_copy_flag, | |
1787 | (memory_object_fault_info_t)fault_info); | |
1788 | ||
1789 | if (data_already_requested == TRUE) { | |
1790 | fault_info->behavior = orig_behavior; | |
1791 | fault_info->cluster_size = orig_cluster_size; | |
1792 | } else { | |
1793 | data_already_requested = TRUE; | |
1794 | } | |
1795 | ||
1796 | DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL); | |
1797 | #if TRACEFAULTPAGE | |
1798 | dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ | |
1799 | #endif | |
1800 | vm_object_lock(object); | |
1801 | ||
1802 | if (object->object_is_shared_cache) { | |
1803 | clear_thread_rwlock_boost(); | |
1804 | } | |
1805 | ||
1806 | data_requested: | |
1807 | if (rc != KERN_SUCCESS) { | |
1808 | vm_fault_cleanup(object, first_m); | |
1809 | thread_interrupt_level(interruptible_state); | |
1810 | ||
1811 | return (rc == MACH_SEND_INTERRUPTED) ? | |
1812 | VM_FAULT_INTERRUPTED : | |
1813 | VM_FAULT_MEMORY_ERROR; | |
1814 | } else { | |
1815 | clock_sec_t tv_sec; | |
1816 | clock_usec_t tv_usec; | |
1817 | ||
1818 | if (my_fault_type == DBG_PAGEIN_FAULT) { | |
1819 | clock_get_system_microtime(&tv_sec, &tv_usec); | |
1820 | current_thread()->t_page_creation_time = tv_sec; | |
1821 | current_thread()->t_page_creation_count = 0; | |
1822 | } | |
1823 | } | |
1824 | if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) { | |
1825 | vm_fault_cleanup(object, first_m); | |
1826 | thread_interrupt_level(interruptible_state); | |
1827 | ||
1828 | return VM_FAULT_INTERRUPTED; | |
1829 | } | |
1830 | if (force_fault_retry == TRUE) { | |
1831 | vm_fault_cleanup(object, first_m); | |
1832 | thread_interrupt_level(interruptible_state); | |
1833 | ||
1834 | return VM_FAULT_RETRY; | |
1835 | } | |
1836 | if (m == VM_PAGE_NULL && object->phys_contiguous) { | |
1837 | /* | |
1838 | * No page here means that the object we | |
1839 | * initially looked up was "physically | |
1840 | * contiguous" (i.e. device memory). However, | |
1841 | * with Virtual VRAM, the object might not | |
1842 | * be backed by that device memory anymore, | |
1843 | * so we're done here only if the object is | |
1844 | * still "phys_contiguous". | |
1845 | * Otherwise, if the object is no longer | |
1846 | * "phys_contiguous", we need to retry the | |
1847 | * page fault against the object's new backing | |
1848 | * store (different memory object). | |
1849 | */ | |
1850 | phys_contig_object: | |
1851 | goto done; | |
1852 | } | |
1853 | /* | |
1854 | * potentially a pagein fault | |
1855 | * if we make it through the state checks | |
1856 | * above, than we'll count it as such | |
1857 | */ | |
1858 | my_fault = my_fault_type; | |
1859 | ||
1860 | /* | |
1861 | * Retry with same object/offset, since new data may | |
1862 | * be in a different page (i.e., m is meaningless at | |
1863 | * this point). | |
1864 | */ | |
1865 | continue; | |
1866 | } | |
1867 | dont_look_for_page: | |
1868 | /* | |
1869 | * We get here if the object has no pager, or if an existence map | |
1870 | * exists and indicates the page isn't present on the pager, | |
1871 | * or if we're unwiring a page. If a pager exists but there | |
1872 | * is no existence map, then the m->vmp_absent case above handles | |
1873 | * the zero-fill (ZF) case when the pager can't provide the page. | |
1874 | */ | |
1875 | #if TRACEFAULTPAGE | |
1876 | dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ | |
1877 | #endif | |
1878 | if (object == first_object) { | |
1879 | first_m = m; | |
1880 | } else { | |
1881 | assert(m == VM_PAGE_NULL); | |
1882 | } | |
1883 | ||
1884 | next_object = object->shadow; | |
1885 | ||
1886 | if (next_object == VM_OBJECT_NULL) { | |
1887 | /* | |
1888 | * we've hit the bottom of the shadow chain, | |
1889 | * fill the page in the top object with zeros. | |
1890 | */ | |
1891 | assert(!must_be_resident); | |
1892 | ||
1893 | if (object != first_object) { | |
1894 | vm_object_paging_end(object); | |
1895 | vm_object_unlock(object); | |
1896 | ||
1897 | object = first_object; | |
1898 | offset = first_offset; | |
1899 | vm_object_lock(object); | |
1900 | } | |
1901 | m = first_m; | |
1902 | assert(VM_PAGE_OBJECT(m) == object); | |
1903 | first_m = VM_PAGE_NULL; | |
1904 | ||
1905 | /* | |
1906 | * check for any conditions that prevent | |
1907 | * us from creating a new zero-fill page. | |
1908 | * vm_fault_check will do all of the | |
1909 | * fault cleanup in the case of an error condition, | |
1910 | * including resetting the thread_interrupt_level | |
1911 | */ | |
1912 | error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); | |
1913 | ||
1914 | if (error != VM_FAULT_SUCCESS) { | |
1915 | return error; | |
1916 | } | |
1917 | ||
1918 | if (m == VM_PAGE_NULL) { | |
1919 | m = vm_page_grab_options(grab_options); | |
1920 | ||
1921 | if (m == VM_PAGE_NULL) { | |
1922 | vm_fault_cleanup(object, VM_PAGE_NULL); | |
1923 | thread_interrupt_level(interruptible_state); | |
1924 | ||
1925 | return VM_FAULT_MEMORY_SHORTAGE; | |
1926 | } | |
1927 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
1928 | } | |
1929 | if (fault_info->mark_zf_absent && no_zero_fill == TRUE) { | |
1930 | m->vmp_absent = TRUE; | |
1931 | } | |
1932 | ||
1933 | my_fault = vm_fault_zero_page(m, no_zero_fill); | |
1934 | ||
1935 | break; | |
1936 | } else { | |
1937 | /* | |
1938 | * Move on to the next object. Lock the next | |
1939 | * object before unlocking the current one. | |
1940 | */ | |
1941 | if ((object != first_object) || must_be_resident) { | |
1942 | vm_object_paging_end(object); | |
1943 | } | |
1944 | ||
1945 | offset += object->vo_shadow_offset; | |
1946 | fault_info->lo_offset += object->vo_shadow_offset; | |
1947 | fault_info->hi_offset += object->vo_shadow_offset; | |
1948 | access_required = VM_PROT_READ; | |
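| /* | |
|  * vo_shadow_offset translates this offset (and the fault_info window) into | |
|  * the backing object's address space. Only read access is needed from | |
|  * backing objects, since a write fault is satisfied by copying the page up | |
|  * into the top object. | |
|  */ | |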
1949 | ||
1950 | vm_object_lock(next_object); | |
1951 | vm_object_unlock(object); | |
1952 | ||
1953 | object = next_object; | |
1954 | vm_object_paging_begin(object); | |
1955 | } | |
1956 | } | |
1957 | ||
1958 | /* | |
1959 | * PAGE HAS BEEN FOUND. | |
1960 | * | |
1961 | * This page (m) is: | |
1962 | * busy, so that we can play with it; | |
1963 | * not absent, so that nobody else will fill it; | |
1964 | * possibly eligible for pageout; | |
1965 | * | |
1966 | * The top-level page (first_m) is: | |
1967 | * VM_PAGE_NULL if the page was found in the | |
1968 | * top-level object; | |
1969 | * busy, not absent, and ineligible for pageout. | |
1970 | * | |
1971 | * The current object (object) is locked. A paging | |
1972 | * reference is held for the current and top-level | |
1973 | * objects. | |
1974 | */ | |
1975 | ||
1976 | #if TRACEFAULTPAGE | |
1977 | dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ | |
1978 | #endif | |
1979 | #if EXTRA_ASSERTIONS | |
1980 | assert(m->vmp_busy && !m->vmp_absent); | |
1981 | assert((first_m == VM_PAGE_NULL) || | |
1982 | (first_m->vmp_busy && !first_m->vmp_absent && | |
1983 | !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded)); | |
1984 | #endif /* EXTRA_ASSERTIONS */ | |
1985 | ||
1986 | /* | |
1987 | * If the page is being written, but isn't | |
1988 | * already owned by the top-level object, | |
1989 | * we have to copy it into a new page owned | |
1990 | * by the top-level object. | |
1991 | */ | |
1992 | if (object != first_object) { | |
1993 | #if TRACEFAULTPAGE | |
1994 | dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ | |
1995 | #endif | |
1996 | if (fault_type & VM_PROT_WRITE) { | |
1997 | vm_page_t copy_m; | |
1998 | ||
1999 | /* | |
2000 | * We only really need to copy if we | |
2001 | * want to write it. | |
2002 | */ | |
2003 | assert(!must_be_resident); | |
2004 | ||
2005 | /* | |
2006 | * If we try to collapse first_object at this | |
2007 | * point, we may deadlock when we try to get | |
2008 | * the lock on an intermediate object (since we | |
2009 | * have the bottom object locked). We can't | |
2010 | * unlock the bottom object, because the page | |
2011 | * we found may move (by collapse) if we do. | |
2012 | * | |
2013 | * Instead, we first copy the page. Then, when | |
2014 | * we have no more use for the bottom object, | |
2015 | * we unlock it and try to collapse. | |
2016 | * | |
2017 | * Note that we copy the page even if we didn't | |
2018 | * need to... that's the breaks. | |
2019 | */ | |
2020 | ||
2021 | /* | |
2022 | * Allocate a page for the copy | |
2023 | */ | |
2024 | copy_m = vm_page_grab_options(grab_options); | |
2025 | ||
2026 | if (copy_m == VM_PAGE_NULL) { | |
2027 | RELEASE_PAGE(m); | |
2028 | ||
2029 | vm_fault_cleanup(object, first_m); | |
2030 | thread_interrupt_level(interruptible_state); | |
2031 | ||
2032 | return VM_FAULT_MEMORY_SHORTAGE; | |
2033 | } | |
2034 | ||
2035 | vm_page_copy(m, copy_m); | |
2036 | ||
2037 | /* | |
2038 | * If another map is truly sharing this | |
2039 | * page with us, we have to flush all | |
2040 | * uses of the original page, since we | |
2041 | * can't distinguish those which want the | |
2042 | * original from those which need the | |
2043 | * new copy. | |
2044 | * | |
2045 | * XXXO If we know that only one map has | |
2046 | * access to this page, then we could | |
2047 | * avoid the pmap_disconnect() call. | |
2048 | */ | |
2049 | if (m->vmp_pmapped) { | |
2050 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); | |
2051 | } | |
2052 | ||
2053 | if (m->vmp_clustered) { | |
2054 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2055 | VM_PAGE_CONSUME_CLUSTERED(m); | |
2056 | } | |
2057 | assert(!m->vmp_cleaning); | |
2058 | ||
2059 | /* | |
2060 | * We no longer need the old page or object. | |
2061 | */ | |
2062 | RELEASE_PAGE(m); | |
2063 | ||
2064 | /* | |
2065 | * This check helps with marking the object as having a sequential pattern. | |
2066 | * Normally we'd miss doing this below because this fault is about COW to | |
2067 | * the first_object, i.e. bring the page in from disk and push it to the object | |
2068 | * above, but don't update the file object's sequential pattern. | |
2069 | */ | |
2070 | if (object->internal == FALSE) { | |
2071 | vm_fault_is_sequential(object, offset, fault_info->behavior); | |
2072 | } | |
2073 | ||
2074 | vm_object_paging_end(object); | |
2075 | vm_object_unlock(object); | |
2076 | ||
2077 | my_fault = DBG_COW_FAULT; | |
2078 | VM_STAT_INCR(cow_faults); | |
2079 | DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); | |
2080 | current_task()->cow_faults++; | |
2081 | ||
2082 | object = first_object; | |
2083 | offset = first_offset; | |
2084 | ||
2085 | vm_object_lock(object); | |
2086 | /* | |
2087 | * get rid of the placeholder | |
2088 | * page that we soldered in earlier | |
2089 | */ | |
2090 | VM_PAGE_FREE(first_m); | |
2091 | first_m = VM_PAGE_NULL; | |
2092 | ||
2093 | /* | |
2094 | * and replace it with the | |
2095 | * page we just copied into | |
2096 | */ | |
2097 | assert(copy_m->vmp_busy); | |
2098 | vm_page_insert(copy_m, object, vm_object_trunc_page(offset)); | |
2099 | SET_PAGE_DIRTY(copy_m, TRUE); | |
2100 | ||
2101 | m = copy_m; | |
2102 | /* | |
2103 | * Now that we've gotten the copy out of the | |
2104 | * way, let's try to collapse the top object. | |
2105 | * But we have to play ugly games with | |
2106 | * paging_in_progress to do that... | |
2107 | */ | |
2108 | vm_object_paging_end(object); | |
2109 | vm_object_collapse(object, vm_object_trunc_page(offset), TRUE); | |
2110 | vm_object_paging_begin(object); | |
2111 | } else { | |
2112 | *protection &= (~VM_PROT_WRITE); | |
2113 | } | |
2114 | } | |
2115 | /* | |
2116 | * Now check whether the page needs to be pushed into the | |
2117 | * copy object. The use of asymmetric copy on write for | |
2118 | * shared temporary objects means that we may do two copies to | |
2119 | * satisfy the fault; one above to get the page from a | |
2120 | * shadowed object, and one here to push it into the copy. | |
2121 | */ | |
2122 | try_failed_count = 0; | |
2123 | ||
2124 | while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { | |
2125 | vm_object_offset_t copy_offset; | |
2126 | vm_page_t copy_m; | |
2127 | ||
2128 | #if TRACEFAULTPAGE | |
2129 | dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */ | |
2130 | #endif | |
2131 | /* | |
2132 | * If the page is being written, but hasn't been | |
2133 | * copied to the copy-object, we have to copy it there. | |
2134 | */ | |
2135 | if ((fault_type & VM_PROT_WRITE) == 0) { | |
2136 | *protection &= ~VM_PROT_WRITE; | |
2137 | break; | |
2138 | } | |
2139 | ||
2140 | /* | |
2141 | * If the page was guaranteed to be resident, | |
2142 | * we must have already performed the copy. | |
2143 | */ | |
2144 | if (must_be_resident) { | |
2145 | break; | |
2146 | } | |
2147 | ||
2148 | /* | |
2149 | * Try to get the lock on the copy_object. | |
2150 | */ | |
2151 | if (!vm_object_lock_try(copy_object)) { | |
2152 | vm_object_unlock(object); | |
2153 | try_failed_count++; | |
2154 | ||
2155 | mutex_pause(try_failed_count); /* wait a bit */ | |
2156 | vm_object_lock(object); | |
2157 | ||
2158 | continue; | |
2159 | } | |
2160 | try_failed_count = 0; | |
2161 | ||
2162 | /* | |
2163 | * Make another reference to the copy-object, | |
2164 | * to keep it from disappearing during the | |
2165 | * copy. | |
2166 | */ | |
2167 | vm_object_reference_locked(copy_object); | |
2168 | ||
2169 | /* | |
2170 | * Does the page exist in the copy? | |
2171 | */ | |
2172 | copy_offset = first_offset - copy_object->vo_shadow_offset; | |
2173 | copy_offset = vm_object_trunc_page(copy_offset); | |
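| /* | |
|  * copy_offset is where this page would live in the copy object: | |
|  * first_offset adjusted by the copy object's vo_shadow_offset and | |
|  * truncated to a page boundary. | |
|  */ | |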
2174 | ||
2175 | if (copy_object->vo_size <= copy_offset) { | |
2176 | /* | |
2177 | * Copy object doesn't cover this page -- do nothing. | |
2178 | */ | |
2179 | ; | |
2180 | } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { | |
2181 | /* | |
2182 | * Page currently exists in the copy object | |
2183 | */ | |
2184 | if (copy_m->vmp_busy) { | |
2185 | /* | |
2186 | * If the page is being brought | |
2187 | * in, wait for it and then retry. | |
2188 | */ | |
2189 | RELEASE_PAGE(m); | |
2190 | ||
2191 | /* | |
2192 | * take an extra ref so object won't die | |
2193 | */ | |
2194 | vm_object_reference_locked(copy_object); | |
2195 | vm_object_unlock(copy_object); | |
2196 | vm_fault_cleanup(object, first_m); | |
2197 | counter(c_vm_fault_page_block_backoff_kernel++); | |
2198 | ||
2199 | vm_object_lock(copy_object); | |
2200 | assert(copy_object->ref_count > 0); | |
2201 | VM_OBJ_RES_DECR(copy_object); | |
2202 | vm_object_lock_assert_exclusive(copy_object); | |
2203 | copy_object->ref_count--; | |
2204 | assert(copy_object->ref_count > 0); | |
2205 | copy_m = vm_page_lookup(copy_object, copy_offset); | |
2206 | ||
2207 | if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) { | |
2208 | PAGE_ASSERT_WAIT(copy_m, interruptible); | |
2209 | ||
2210 | vm_object_unlock(copy_object); | |
2211 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
2212 | vm_object_deallocate(copy_object); | |
2213 | ||
2214 | goto backoff; | |
2215 | } else { | |
2216 | vm_object_unlock(copy_object); | |
2217 | vm_object_deallocate(copy_object); | |
2218 | thread_interrupt_level(interruptible_state); | |
2219 | ||
2220 | return VM_FAULT_RETRY; | |
2221 | } | |
2222 | } | |
2223 | } else if (!PAGED_OUT(copy_object, copy_offset)) { | |
2224 | /* | |
2225 | * If PAGED_OUT is TRUE, then the page used to exist | |
2226 | * in the copy-object, and has already been paged out. | |
2227 | * We don't need to repeat this. If PAGED_OUT is | |
2228 | * FALSE, then either we don't know (!pager_created, | |
2229 | * for example) or it hasn't been paged out. | |
2230 | * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT) | |
2231 | * We must copy the page to the copy object. | |
2232 | * | |
2233 | * Allocate a page for the copy | |
2234 | */ | |
2235 | copy_m = vm_page_alloc(copy_object, copy_offset); | |
2236 | ||
2237 | if (copy_m == VM_PAGE_NULL) { | |
2238 | RELEASE_PAGE(m); | |
2239 | ||
2240 | VM_OBJ_RES_DECR(copy_object); | |
2241 | vm_object_lock_assert_exclusive(copy_object); | |
2242 | copy_object->ref_count--; | |
2243 | assert(copy_object->ref_count > 0); | |
2244 | ||
2245 | vm_object_unlock(copy_object); | |
2246 | vm_fault_cleanup(object, first_m); | |
2247 | thread_interrupt_level(interruptible_state); | |
2248 | ||
2249 | return VM_FAULT_MEMORY_SHORTAGE; | |
2250 | } | |
2251 | /* | |
2252 | * Must copy page into copy-object. | |
2253 | */ | |
2254 | vm_page_copy(m, copy_m); | |
2255 | ||
2256 | /* | |
2257 | * If the old page was in use by any users | |
2258 | * of the copy-object, it must be removed | |
2259 | * from all pmaps. (We can't know which | |
2260 | * pmaps use it.) | |
2261 | */ | |
2262 | if (m->vmp_pmapped) { | |
2263 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); | |
2264 | } | |
2265 | ||
2266 | if (m->vmp_clustered) { | |
2267 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2268 | VM_PAGE_CONSUME_CLUSTERED(m); | |
2269 | } | |
2270 | /* | |
2271 | * If there's a pager, then immediately | |
2272 | * page out this page, using the "initialize" | |
2273 | * option. Else, we use the copy. | |
2274 | */ | |
2275 | if ((!copy_object->pager_ready) | |
2276 | || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT | |
2277 | ) { | |
2278 | vm_page_lockspin_queues(); | |
2279 | assert(!m->vmp_cleaning); | |
2280 | vm_page_activate(copy_m); | |
2281 | vm_page_unlock_queues(); | |
2282 | ||
2283 | SET_PAGE_DIRTY(copy_m, TRUE); | |
2284 | PAGE_WAKEUP_DONE(copy_m); | |
2285 | } else { | |
2286 | assert(copy_m->vmp_busy == TRUE); | |
2287 | assert(!m->vmp_cleaning); | |
2288 | ||
2289 | /* | |
2290 | * dirty is protected by the object lock | |
2291 | */ | |
2292 | SET_PAGE_DIRTY(copy_m, TRUE); | |
2293 | ||
2294 | /* | |
2295 | * The page is already ready for pageout: | |
2296 | * not on pageout queues and busy. | |
2297 | * Unlock everything except the | |
2298 | * copy_object itself. | |
2299 | */ | |
2300 | vm_object_unlock(object); | |
2301 | ||
2302 | /* | |
2303 | * Write the page to the copy-object, | |
2304 | * flushing it from the kernel. | |
2305 | */ | |
2306 | vm_pageout_initialize_page(copy_m); | |
2307 | ||
2308 | /* | |
2309 | * Since the pageout may have | |
2310 | * temporarily dropped the | |
2311 | * copy_object's lock, we | |
2312 | * check whether we'll have | |
2313 | * to deallocate the hard way. | |
2314 | */ | |
2315 | if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { | |
2316 | vm_object_unlock(copy_object); | |
2317 | vm_object_deallocate(copy_object); | |
2318 | vm_object_lock(object); | |
2319 | ||
2320 | continue; | |
2321 | } | |
2322 | /* | |
2323 | * Pick back up the old object's | |
2324 | * lock. [It is safe to do so, | |
2325 | * since it must be deeper in the | |
2326 | * object tree.] | |
2327 | */ | |
2328 | vm_object_lock(object); | |
2329 | } | |
2330 | ||
2331 | /* | |
2332 | * Because we're pushing a page upward | |
2333 | * in the object tree, we must restart | |
2334 | * any faults that are waiting here. | |
2335 | * [Note that this is an expansion of | |
2336 | * PAGE_WAKEUP that uses the THREAD_RESTART | |
2337 | * wait result]. Can't turn off the page's | |
2338 | * busy bit because we're not done with it. | |
2339 | */ | |
2340 | if (m->vmp_wanted) { | |
2341 | m->vmp_wanted = FALSE; | |
2342 | thread_wakeup_with_result((event_t) m, THREAD_RESTART); | |
2343 | } | |
2344 | } | |
2345 | /* | |
2346 | * The reference count on copy_object must be | |
2347 | * at least 2: one for our extra reference, | |
2348 | * and at least one from the outside world | |
2349 | * (we checked that when we last locked | |
2350 | * copy_object). | |
2351 | */ | |
2352 | vm_object_lock_assert_exclusive(copy_object); | |
2353 | copy_object->ref_count--; | |
2354 | assert(copy_object->ref_count > 0); | |
2355 | ||
2356 | VM_OBJ_RES_DECR(copy_object); | |
2357 | vm_object_unlock(copy_object); | |
2358 | ||
2359 | break; | |
2360 | } | |
2361 | ||
2362 | done: | |
2363 | *result_page = m; | |
2364 | *top_page = first_m; | |
2365 | ||
2366 | if (m != VM_PAGE_NULL) { | |
2367 | assert(VM_PAGE_OBJECT(m) == object); | |
2368 | ||
2369 | retval = VM_FAULT_SUCCESS; | |
2370 | ||
2371 | if (my_fault == DBG_PAGEIN_FAULT) { | |
2372 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2373 | ||
2374 | if (object->internal) { | |
2375 | my_fault = DBG_PAGEIND_FAULT; | |
2376 | } else { | |
2377 | my_fault = DBG_PAGEINV_FAULT; | |
2378 | } | |
2379 | ||
2380 | /* | |
2381 | * evaluate access pattern and update state | |
2382 | * vm_fault_deactivate_behind depends on the | |
2383 | * state being up to date | |
2384 | */ | |
2385 | vm_fault_is_sequential(object, offset, fault_info->behavior); | |
2386 | vm_fault_deactivate_behind(object, offset, fault_info->behavior); | |
2387 | } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) { | |
2388 | /* | |
2389 | * we weren't called from vm_fault, so handle the | |
2390 | * accounting here for hits in the cache | |
2391 | */ | |
2392 | if (m->vmp_clustered) { | |
2393 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2394 | VM_PAGE_CONSUME_CLUSTERED(m); | |
2395 | } | |
2396 | vm_fault_is_sequential(object, offset, fault_info->behavior); | |
2397 | vm_fault_deactivate_behind(object, offset, fault_info->behavior); | |
2398 | } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) { | |
2399 | VM_STAT_DECOMPRESSIONS(); | |
2400 | } | |
2401 | if (type_of_fault) { | |
2402 | *type_of_fault = my_fault; | |
2403 | } | |
2404 | } else { | |
2405 | retval = VM_FAULT_SUCCESS_NO_VM_PAGE; | |
2406 | assert(first_m == VM_PAGE_NULL); | |
2407 | assert(object == first_object); | |
2408 | } | |
2409 | ||
2410 | thread_interrupt_level(interruptible_state); | |
2411 | ||
2412 | #if TRACEFAULTPAGE | |
2413 | dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ | |
2414 | #endif | |
2415 | return retval; | |
2416 | ||
2417 | backoff: | |
2418 | thread_interrupt_level(interruptible_state); | |
2419 | ||
2420 | if (wait_result == THREAD_INTERRUPTED) { | |
2421 | return VM_FAULT_INTERRUPTED; | |
2422 | } | |
2423 | return VM_FAULT_RETRY; | |
2424 | ||
2425 | #undef RELEASE_PAGE | |
2426 | } | |
2427 | ||
2428 | ||
2429 | extern int panic_on_cs_killed; | |
2430 | extern int proc_selfpid(void); | |
2431 | extern char *proc_name_address(void *p); | |
2432 | unsigned long cs_enter_tainted_rejected = 0; | |
2433 | unsigned long cs_enter_tainted_accepted = 0; | |
2434 | ||
2435 | /* | |
2436 | * CODE SIGNING: | |
2437 | * When soft faulting a page, we have to validate the page if: | |
2438 | * 1. the page is being mapped in user space | |
2439 | * 2. the page hasn't already been found to be "tainted" | |
2440 | * 3. the page belongs to a code-signed object | |
2441 | * 4. the page has not been validated yet or has been mapped for write. | |
2442 | */ | |
2443 | static bool | |
2444 | vm_fault_cs_need_validation( | |
2445 | pmap_t pmap, | |
2446 | vm_page_t page, | |
2447 | vm_object_t page_obj, | |
2448 | vm_map_size_t fault_page_size, | |
2449 | vm_map_offset_t fault_phys_offset) | |
2450 | { | |
2451 | if (pmap == kernel_pmap) { | |
2452 | /* 1 - not user space */ | |
2453 | return false; | |
2454 | } | |
2455 | if (!page_obj->code_signed) { | |
2456 | /* 3 - page does not belong to a code-signed object */ | |
2457 | return false; | |
2458 | } | |
2459 | if (fault_page_size == PAGE_SIZE) { | |
2460 | /* looking at the whole page */ | |
2461 | assertf(fault_phys_offset == 0, | |
2462 | "fault_page_size 0x%llx fault_phys_offset 0x%llx\n", | |
2463 | (uint64_t)fault_page_size, | |
2464 | (uint64_t)fault_phys_offset); | |
2465 | if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) { | |
2466 | /* 2 - page is all tainted */ | |
2467 | return false; | |
2468 | } | |
2469 | if (page->vmp_cs_validated == VMP_CS_ALL_TRUE && | |
2470 | !page->vmp_wpmapped) { | |
2471 | /* 4 - already fully validated and never mapped writable */ | |
2472 | return false; | |
2473 | } | |
2474 | } else { | |
2475 | /* looking at a specific sub-page */ | |
2476 | if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) { | |
2477 | /* 2 - sub-page was already marked as tainted */ | |
2478 | return false; | |
2479 | } | |
2480 | if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) && | |
2481 | !page->vmp_wpmapped) { | |
2482 | /* 4 - already validated and never mapped writable */ | |
2483 | return false; | |
2484 | } | |
2485 | } | |
2486 | /* page needs to be validated */ | |
2487 | return true; | |
2488 | } | |
2489 | ||
2490 | ||
2491 | static bool | |
2492 | vm_fault_cs_page_immutable( | |
2493 | vm_page_t m, | |
2494 | vm_map_size_t fault_page_size, | |
2495 | vm_map_offset_t fault_phys_offset, | |
2496 | vm_prot_t prot __unused) | |
2497 | { | |
2498 | if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) | |
2499 | /*&& ((prot) & VM_PROT_EXECUTE)*/) { | |
2500 | return true; | |
2501 | } | |
2502 | return false; | |
2503 | } | |
2504 | ||
2505 | static bool | |
2506 | vm_fault_cs_page_nx( | |
2507 | vm_page_t m, | |
2508 | vm_map_size_t fault_page_size, | |
2509 | vm_map_offset_t fault_phys_offset) | |
2510 | { | |
2511 | return VMP_CS_NX(m, fault_page_size, fault_phys_offset); | |
2512 | } | |
2513 | ||
2514 | /* | |
2515 | * Check if the page being entered into the pmap violates code signing. | |
2516 | */ | |
2517 | static kern_return_t | |
2518 | vm_fault_cs_check_violation( | |
2519 | bool cs_bypass, | |
2520 | vm_object_t object, | |
2521 | vm_page_t m, | |
2522 | pmap_t pmap, | |
2523 | vm_prot_t prot, | |
2524 | vm_prot_t caller_prot, | |
2525 | vm_map_size_t fault_page_size, | |
2526 | vm_map_offset_t fault_phys_offset, | |
2527 | vm_object_fault_info_t fault_info, | |
2528 | bool map_is_switched, | |
2529 | bool map_is_switch_protected, | |
2530 | bool *cs_violation) | |
2531 | { | |
2532 | #if !PMAP_CS | |
2533 | #pragma unused(caller_prot) | |
2534 | #pragma unused(fault_info) | |
2535 | #endif /* !PMAP_CS */ | |
2536 | int cs_enforcement_enabled; | |
2537 | if (!cs_bypass && | |
2538 | vm_fault_cs_need_validation(pmap, m, object, | |
2539 | fault_page_size, fault_phys_offset)) { | |
2540 | vm_object_lock_assert_exclusive(object); | |
2541 | ||
2542 | if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) { | |
2543 | vm_cs_revalidates++; | |
2544 | } | |
2545 | ||
2546 | /* VM map is locked, so 1 ref will remain on the VM object - | |
2547 | * no harm if vm_page_validate_cs drops the object lock */ | |
2548 | ||
2549 | vm_page_validate_cs(m, fault_page_size, fault_phys_offset); | |
2550 | } | |
2551 | ||
2552 | /* If the map is switched, and is switch-protected, we must protect | |
2553 | * some pages from being write-faulted: immutable pages because by | |
2554 | * definition they may not be written, and executable pages because that | |
2555 | * would provide a way to inject unsigned code. | |
2556 | * If the page is immutable, we can simply return. However, we can't | |
2557 | * immediately determine whether a page is executable anywhere. But, | |
2558 | * we can disconnect it everywhere and remove the executable protection | |
2559 | * from the current map. We do that below right before we do the | |
2560 | * PMAP_ENTER. | |
2561 | */ | |
2562 | if (pmap == kernel_pmap) { | |
2563 | /* kernel fault: cs_enforcement does not apply */ | |
2564 | cs_enforcement_enabled = 0; | |
2565 | } else { | |
2566 | cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap); | |
2567 | } | |
2568 | ||
2569 | if (cs_enforcement_enabled && map_is_switched && | |
2570 | map_is_switch_protected && | |
2571 | vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) && | |
2572 | (prot & VM_PROT_WRITE)) { | |
2573 | return KERN_CODESIGN_ERROR; | |
2574 | } | |
2575 | ||
2576 | if (cs_enforcement_enabled && | |
2577 | vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) && | |
2578 | (prot & VM_PROT_EXECUTE)) { | |
2579 | if (cs_debug) { | |
2580 | printf("page marked to be NX, not letting it be mapped EXEC\n"); | |
2581 | } | |
2582 | return KERN_CODESIGN_ERROR; | |
2583 | } | |
2584 | ||
2585 | /* A page could be tainted, or pose a risk of being tainted later. | |
2586 | * Check whether the receiving process wants it, and make it feel | |
2587 | * the consequences (that happens in cs_invalid_page()). | |
2588 | * For CS Enforcement, two other conditions will | |
2589 | * cause that page to be tainted as well: | |
2590 | * - pmapping an unsigned page executable - this means unsigned code; | |
2591 | * - writeable mapping of a validated page - the content of that page | |
2592 | * can be changed without the kernel noticing, therefore unsigned | |
2593 | * code can be created | |
2594 | */ | |
2595 | if (cs_bypass) { | |
2596 | /* code-signing is bypassed */ | |
2597 | *cs_violation = FALSE; | |
2598 | } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) { | |
2599 | /* tainted page */ | |
2600 | *cs_violation = TRUE; | |
2601 | } else if (!cs_enforcement_enabled) { | |
2602 | /* no further code-signing enforcement */ | |
2603 | *cs_violation = FALSE; | |
2604 | } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) && | |
2605 | ((prot & VM_PROT_WRITE) || | |
2606 | m->vmp_wpmapped)) { | |
2607 | /* | |
2608 | * The page should be immutable, but is in danger of being | |
2609 | * modified. | |
2610 | * This is the case where we want policy from the code | |
2611 | * directory - is the page immutable or not? For now we have | |
2612 | * to assume that code pages will be immutable, data pages not. | |
2613 | * We'll assume a page is a code page if it has a code directory | |
2614 | * and we fault for execution. | |
2615 | * That is good enough since if we faulted the code page for | |
2616 | * writing in another map before, it is wpmapped; if we fault | |
2617 | * it for writing in this map later it will also be faulted for | |
2618 | * executing at the same time; and if we fault for writing in | |
2619 | * another map later, we will disconnect it from this pmap so | |
2620 | * we'll notice the change. | |
2621 | */ | |
2622 | *cs_violation = TRUE; | |
2623 | } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) && | |
2624 | (prot & VM_PROT_EXECUTE) | |
2625 | ) { | |
2626 | *cs_violation = TRUE; | |
2627 | } else { | |
2628 | *cs_violation = FALSE; | |
2629 | } | |
2630 | return KERN_SUCCESS; | |
2631 | } | |
2632 | ||
2633 | /* | |
2634 | * Handles a code signing violation by either rejecting the page or forcing a disconnect. | |
2635 | * @param must_disconnect This value will be set to true if the caller must disconnect | |
2636 | * this page. | |
2637 | * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault. | |
2638 | */ | |
2639 | static kern_return_t | |
2640 | vm_fault_cs_handle_violation( | |
2641 | vm_object_t object, | |
2642 | vm_page_t m, | |
2643 | pmap_t pmap, | |
2644 | vm_prot_t prot, | |
2645 | vm_map_offset_t vaddr, | |
2646 | vm_map_size_t fault_page_size, | |
2647 | vm_map_offset_t fault_phys_offset, | |
2648 | bool map_is_switched, | |
2649 | bool map_is_switch_protected, | |
2650 | bool *must_disconnect) | |
2651 | { | |
2652 | #if !MACH_ASSERT | |
2653 | #pragma unused(pmap) | |
2654 | #pragma unused(map_is_switch_protected) | |
2655 | #endif /* !MACH_ASSERT */ | |
2656 | /* | |
2657 | * We will have a tainted page. Have to handle the special case | |
2658 | * of a switched map now. If the map is not switched, standard | |
2659 | * procedure applies - call cs_invalid_page(). | |
2660 | * If the map is switched, the real owner is invalid already. | |
2661 | * There is no point in invalidating the switching process since | |
2662 | * it will not be executing from the map. So we don't call | |
2663 | * cs_invalid_page() in that case. | |
2664 | */ | |
2665 | boolean_t reject_page, cs_killed; | |
2666 | kern_return_t kr; | |
2667 | if (map_is_switched) { | |
2668 | assert(pmap == vm_map_pmap(current_thread()->map)); | |
2669 | assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE)); | |
2670 | reject_page = FALSE; | |
2671 | } else { | |
2672 | if (cs_debug > 5) { | |
2673 | printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n", | |
2674 | object->code_signed ? "yes" : "no", | |
2675 | VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no", | |
2676 | VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no", | |
2677 | m->vmp_wpmapped ? "yes" : "no", | |
2678 | (int)prot); | |
2679 | } | |
2680 | reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed); | |
2681 | } | |
2682 | ||
2683 | if (reject_page) { | |
2684 | /* reject the invalid page: abort the page fault */ | |
2685 | int pid; | |
2686 | const char *procname; | |
2687 | task_t task; | |
2688 | vm_object_t file_object, shadow; | |
2689 | vm_object_offset_t file_offset; | |
2690 | char *pathname, *filename; | |
2691 | vm_size_t pathname_len, filename_len; | |
2692 | boolean_t truncated_path; | |
2693 | #define __PATH_MAX 1024 | |
2694 | struct timespec mtime, cs_mtime; | |
2695 | int shadow_depth; | |
2696 | os_reason_t codesigning_exit_reason = OS_REASON_NULL; | |
2697 | ||
2698 | kr = KERN_CODESIGN_ERROR; | |
2699 | cs_enter_tainted_rejected++; | |
2700 | ||
2701 | /* get process name and pid */ | |
2702 | procname = "?"; | |
2703 | task = current_task(); | |
2704 | pid = proc_selfpid(); | |
2705 | if (task->bsd_info != NULL) { | |
2706 | procname = proc_name_address(task->bsd_info); | |
2707 | } | |
2708 | ||
2709 | /* get file's VM object */ | |
2710 | file_object = object; | |
2711 | file_offset = m->vmp_offset; | |
2712 | for (shadow = file_object->shadow, | |
2713 | shadow_depth = 0; | |
2714 | shadow != VM_OBJECT_NULL; | |
2715 | shadow = file_object->shadow, | |
2716 | shadow_depth++) { | |
2717 | vm_object_lock_shared(shadow); | |
2718 | if (file_object != object) { | |
2719 | vm_object_unlock(file_object); | |
2720 | } | |
2721 | file_offset += file_object->vo_shadow_offset; | |
2722 | file_object = shadow; | |
2723 | } | |
2724 | ||
2725 | mtime.tv_sec = 0; | |
2726 | mtime.tv_nsec = 0; | |
2727 | cs_mtime.tv_sec = 0; | |
2728 | cs_mtime.tv_nsec = 0; | |
2729 | ||
2730 | /* get file's pathname and/or filename */ | |
2731 | pathname = NULL; | |
2732 | filename = NULL; | |
2733 | pathname_len = 0; | |
2734 | filename_len = 0; | |
2735 | truncated_path = FALSE; | |
2736 | /* no pager -> no file -> no pathname, use "<nil>" in that case */ | |
2737 | if (file_object->pager != NULL) { | |
2738 | pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK); | |
2739 | if (pathname) { | |
2740 | pathname[0] = '\0'; | |
2741 | pathname_len = __PATH_MAX; | |
2742 | filename = pathname + pathname_len; | |
2743 | filename_len = __PATH_MAX; | |
2744 | ||
2745 | if (vnode_pager_get_object_name(file_object->pager, | |
2746 | pathname, | |
2747 | pathname_len, | |
2748 | filename, | |
2749 | filename_len, | |
2750 | &truncated_path) == KERN_SUCCESS) { | |
2751 | /* safety first... */ | |
2752 | pathname[__PATH_MAX - 1] = '\0'; | |
2753 | filename[__PATH_MAX - 1] = '\0'; | |
2754 | ||
2755 | vnode_pager_get_object_mtime(file_object->pager, | |
2756 | &mtime, | |
2757 | &cs_mtime); | |
2758 | } else { | |
2759 | kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2); | |
2760 | pathname = NULL; | |
2761 | filename = NULL; | |
2762 | pathname_len = 0; | |
2763 | filename_len = 0; | |
2764 | truncated_path = FALSE; | |
2765 | } | |
2766 | } | |
2767 | } | |
2768 | printf("CODE SIGNING: process %d[%s]: " | |
2769 | "rejecting invalid page at address 0x%llx " | |
2770 | "from offset 0x%llx in file \"%s%s%s\" " | |
2771 | "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " | |
2772 | "(signed:%d validated:%d tainted:%d nx:%d " | |
2773 | "wpmapped:%d dirty:%d depth:%d)\n", | |
2774 | pid, procname, (addr64_t) vaddr, | |
2775 | file_offset, | |
2776 | (pathname ? pathname : "<nil>"), | |
2777 | (truncated_path ? "/.../" : ""), | |
2778 | (truncated_path ? filename : ""), | |
2779 | cs_mtime.tv_sec, cs_mtime.tv_nsec, | |
2780 | ((cs_mtime.tv_sec == mtime.tv_sec && | |
2781 | cs_mtime.tv_nsec == mtime.tv_nsec) | |
2782 | ? "==" | |
2783 | : "!="), | |
2784 | mtime.tv_sec, mtime.tv_nsec, | |
2785 | object->code_signed, | |
2786 | VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset), | |
2787 | VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset), | |
2788 | VMP_CS_NX(m, fault_page_size, fault_phys_offset), | |
2789 | m->vmp_wpmapped, | |
2790 | m->vmp_dirty, | |
2791 | shadow_depth); | |
2792 | ||
2793 | /* | |
2794 | * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page | |
2795 | * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the | |
2796 | * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler | |
2797 | * will deal with the segmentation fault. | |
2798 | */ | |
2799 | if (cs_killed) { | |
2800 | KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, | |
2801 | pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0); | |
2802 | ||
2803 | codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE); | |
2804 | if (codesigning_exit_reason == NULL) { | |
2805 | printf("vm_fault_enter: failed to allocate codesigning exit reason\n"); | |
2806 | } else { | |
2807 | mach_vm_address_t data_addr = 0; | |
2808 | struct codesigning_exit_reason_info *ceri = NULL; | |
2809 | uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri)); | |
2810 | ||
2811 | if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) { | |
2812 | printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n"); | |
2813 | } else { | |
2814 | if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor, | |
2815 | EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) { | |
2816 | ceri = (struct codesigning_exit_reason_info *)data_addr; | |
2817 | static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname)); | |
2818 | ||
2819 | ceri->ceri_virt_addr = vaddr; | |
2820 | ceri->ceri_file_offset = file_offset; | |
2821 | if (pathname) { | |
2822 | strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname)); | |
2823 | } else { | |
2824 | ceri->ceri_pathname[0] = '\0'; | |
2825 | } | |
2826 | if (filename) { | |
2827 | strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename)); | |
2828 | } else { | |
2829 | ceri->ceri_filename[0] = '\0'; | |
2830 | } | |
2831 | ceri->ceri_path_truncated = (truncated_path ? 1 : 0); | |
2832 | ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec; | |
2833 | ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec; | |
2834 | ceri->ceri_page_modtime_secs = mtime.tv_sec; | |
2835 | ceri->ceri_page_modtime_nsecs = mtime.tv_nsec; | |
2836 | ceri->ceri_object_codesigned = (object->code_signed); | |
2837 | ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset); | |
2838 | ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset); | |
2839 | ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset); | |
2840 | ceri->ceri_page_wpmapped = (m->vmp_wpmapped); | |
2841 | ceri->ceri_page_slid = 0; | |
2842 | ceri->ceri_page_dirty = (m->vmp_dirty); | |
2843 | ceri->ceri_page_shadow_depth = shadow_depth; | |
2844 | } else { | |
2845 | #if DEBUG || DEVELOPMENT | |
2846 | panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason"); | |
2847 | #else | |
2848 | printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n"); | |
2849 | #endif /* DEBUG || DEVELOPMENT */ | |
2850 | /* Free the buffer */ | |
2851 | os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0); | |
2852 | } | |
2853 | } | |
2854 | } | |
2855 | ||
2856 | set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE); | |
2857 | } | |
2858 | if (panic_on_cs_killed && | |
2859 | object->object_is_shared_cache) { | |
2860 | char *tainted_contents; | |
2861 | vm_map_offset_t src_vaddr; | |
2862 | src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT); | |
2863 | tainted_contents = kalloc(PAGE_SIZE); | |
2864 | bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE); | |
2865 | printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents); | |
2866 | panic("CODE SIGNING: process %d[%s]: " | |
2867 | "rejecting invalid page (phys#0x%x) at address 0x%llx " | |
2868 | "from offset 0x%llx in file \"%s%s%s\" " | |
2869 | "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " | |
2870 | "(signed:%d validated:%d tainted:%d nx:%d" | |
2871 | "wpmapped:%d dirty:%d depth:%d)\n", | |
2872 | pid, procname, | |
2873 | VM_PAGE_GET_PHYS_PAGE(m), | |
2874 | (addr64_t) vaddr, | |
2875 | file_offset, | |
2876 | (pathname ? pathname : "<nil>"), | |
2877 | (truncated_path ? "/.../" : ""), | |
2878 | (truncated_path ? filename : ""), | |
2879 | cs_mtime.tv_sec, cs_mtime.tv_nsec, | |
2880 | ((cs_mtime.tv_sec == mtime.tv_sec && | |
2881 | cs_mtime.tv_nsec == mtime.tv_nsec) | |
2882 | ? "==" | |
2883 | : "!="), | |
2884 | mtime.tv_sec, mtime.tv_nsec, | |
2885 | object->code_signed, | |
2886 | VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset), | |
2887 | VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset), | |
2888 | VMP_CS_NX(m, fault_page_size, fault_phys_offset), | |
2889 | m->vmp_wpmapped, | |
2890 | m->vmp_dirty, | |
2891 | shadow_depth); | |
2892 | } | |
2893 | ||
2894 | if (file_object != object) { | |
2895 | vm_object_unlock(file_object); | |
2896 | } | |
2897 | if (pathname_len != 0) { | |
2898 | kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2); | |
2899 | pathname = NULL; | |
2900 | filename = NULL; | |
2901 | } | |
2902 | } else { | |
2903 | /* proceed with the invalid page */ | |
2904 | kr = KERN_SUCCESS; | |
2905 | if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) && | |
2906 | !object->code_signed) { | |
2907 | /* | |
2908 | * This page has not been (fully) validated but | |
2909 | * does not belong to a code-signed object | |
2910 | * so it should not be forcefully considered | |
2911 | * as tainted. | |
2912 | * We're just concerned about it here because | |
2913 | * we've been asked to "execute" it but that | |
2914 | * does not mean that it should cause other | |
2915 | * accesses to fail. | |
2916 | * This happens when a debugger sets a | |
2917 | * breakpoint and we then execute code in | |
2918 | * that page. Marking the page as "tainted" | |
2919 | * would cause any inspection tool ("leaks", | |
2920 | * "vmmap", "CrashReporter", ...) to get killed | |
2921 | * due to code-signing violation on that page, | |
2922 | * even though they're just reading it and not | |
2923 | * executing from it. | |
2924 | */ | |
2925 | } else { | |
2926 | /* | |
2927 | * Page might have been tainted before or not; | |
2928 | * now it definitively is. If the page wasn't | |
2929 | * tainted, we must disconnect it from all | |
2930 | * pmaps later, to force existing mappings | |
2931 | * through that code path for re-consideration | |
2932 | * of the validity of that page. | |
2933 | */ | |
2934 | if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) { | |
2935 | *must_disconnect = TRUE; | |
2936 | VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE); | |
2937 | } | |
2938 | } | |
2939 | cs_enter_tainted_accepted++; | |
2940 | } | |
2941 | if (kr != KERN_SUCCESS) { | |
2942 | if (cs_debug) { | |
2943 | printf("CODESIGNING: vm_fault_enter(0x%llx): " | |
2944 | "*** INVALID PAGE ***\n", | |
2945 | (long long)vaddr); | |
2946 | } | |
2947 | #if !SECURE_KERNEL | |
2948 | if (cs_enforcement_panic) { | |
2949 | panic("CODESIGNING: panicking on invalid page\n"); | |
2950 | } | |
2951 | #endif | |
2952 | } | |
2953 | return kr; | |
2954 | } | |
2955 | ||
2956 | /* | |
2957 | * Check that the code signature is valid for the given page being inserted into | |
2958 | * the pmap. | |
2959 | * | |
2960 | * @param must_disconnect This value will be set to true if the caller must disconnect | |
2961 | * this page. | |
2962 | * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault. | |
2963 | */ | |
2964 | static kern_return_t | |
2965 | vm_fault_validate_cs( | |
2966 | bool cs_bypass, | |
2967 | vm_object_t object, | |
2968 | vm_page_t m, | |
2969 | pmap_t pmap, | |
2970 | vm_map_offset_t vaddr, | |
2971 | vm_prot_t prot, | |
2972 | vm_prot_t caller_prot, | |
2973 | vm_map_size_t fault_page_size, | |
2974 | vm_map_offset_t fault_phys_offset, | |
2975 | vm_object_fault_info_t fault_info, | |
2976 | bool *must_disconnect) | |
2977 | { | |
2978 | bool map_is_switched, map_is_switch_protected, cs_violation; | |
2979 | kern_return_t kr; | |
2980 | /* Validate code signature if necessary. */ | |
2981 | map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) && | |
2982 | (pmap == vm_map_pmap(current_thread()->map))); | |
2983 | map_is_switch_protected = current_thread()->map->switch_protect; | |
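| /* | |
|  * "Switched" means the faulting pmap belongs to the current thread's map | |
|  * but not to the current task's map, i.e. the thread is temporarily running | |
|  * in another address space; if that map is also switch-protected, write | |
|  * faults on immutable/executable pages get extra scrutiny (see | |
|  * vm_fault_cs_check_violation() above). | |
|  */ | |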
2984 | kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap, | |
2985 | prot, caller_prot, fault_page_size, fault_phys_offset, fault_info, | |
2986 | map_is_switched, map_is_switch_protected, &cs_violation); | |
2987 | if (kr != KERN_SUCCESS) { | |
2988 | return kr; | |
2989 | } | |
2990 | if (cs_violation) { | |
2991 | kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr, | |
2992 | fault_page_size, fault_phys_offset, | |
2993 | map_is_switched, map_is_switch_protected, must_disconnect); | |
2994 | } | |
2995 | return kr; | |
2996 | } | |
2997 | ||
2998 | /* | |
2999 | * Enqueue the page on the appropriate paging queue. | |
3000 | */ | |
3001 | static void | |
3002 | vm_fault_enqueue_page( | |
3003 | vm_object_t object, | |
3004 | vm_page_t m, | |
3005 | bool wired, | |
3006 | bool change_wiring, | |
3007 | vm_tag_t wire_tag, | |
3008 | bool no_cache, | |
3009 | int *type_of_fault, | |
3010 | kern_return_t kr) | |
3011 | { | |
3012 | assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object); | |
3013 | boolean_t page_queues_locked = FALSE; | |
3014 | boolean_t previously_pmapped = m->vmp_pmapped; | |
3015 | #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \ | |
3016 | MACRO_BEGIN \ | |
3017 | if (! page_queues_locked) { \ | |
3018 | page_queues_locked = TRUE; \ | |
3019 | vm_page_lockspin_queues(); \ | |
3020 | } \ | |
3021 | MACRO_END | |
3022 | #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \ | |
3023 | MACRO_BEGIN \ | |
3024 | if (page_queues_locked) { \ | |
3025 | page_queues_locked = FALSE; \ | |
3026 | vm_page_unlock_queues(); \ | |
3027 | } \ | |
3028 | MACRO_END | |
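| /* | |
|  * These two macros take and drop the vm_page queues lock lazily, so the | |
|  * spinlock is only acquired on paths that actually touch a paging queue | |
|  * and can be held across consecutive queue operations. | |
|  */ | |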
3029 | ||
3030 | #if CONFIG_BACKGROUND_QUEUE | |
3031 | vm_page_update_background_state(m); | |
3032 | #endif | |
3033 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { | |
3034 | /* | |
3035 | * Compressor pages are neither wired | |
3036 | * nor pageable and should never change. | |
3037 | */ | |
3038 | assert(object == compressor_object); | |
3039 | } else if (change_wiring) { | |
3040 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); | |
3041 | ||
3042 | if (wired) { | |
3043 | if (kr == KERN_SUCCESS) { | |
3044 | vm_page_wire(m, wire_tag, TRUE); | |
3045 | } | |
3046 | } else { | |
3047 | vm_page_unwire(m, TRUE); | |
3048 | } | |
3049 | /* we keep the page queues lock, if we need it later */ | |
3050 | } else { | |
3051 | if (object->internal == TRUE) { | |
3052 | /* | |
3053 | * don't allow anonymous pages on | |
3054 | * the speculative queues | |
3055 | */ | |
3056 | no_cache = FALSE; | |
3057 | } | |
3058 | if (kr != KERN_SUCCESS) { | |
3059 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); | |
3060 | vm_page_deactivate(m); | |
3061 | /* we keep the page queues lock, if we need it later */ | |
3062 | } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || | |
3063 | (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) || | |
3064 | (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || | |
3065 | ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) && | |
3066 | !VM_PAGE_WIRED(m)) { | |
3067 | if (vm_page_local_q && | |
3068 | (*type_of_fault == DBG_COW_FAULT || | |
3069 | *type_of_fault == DBG_ZERO_FILL_FAULT)) { | |
3070 | struct vpl *lq; | |
3071 | uint32_t lid; | |
3072 | ||
3073 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); | |
3074 | ||
3075 | __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); | |
3076 | vm_object_lock_assert_exclusive(object); | |
3077 | ||
3078 | /* | |
3079 | * we got a local queue to stuff this | |
3080 | * new page on... | |
3081 | * it's safe to manipulate local and | |
3082 | * local_id at this point since we're | |
3083 | * behind an exclusive object lock and | |
3084 | * the page is not on any global queue. | |
3085 | * | |
3086 | * we'll use the current cpu number to | |
3087 | * select the queue... note that we don't | |
3088 | * need to disable preemption... we're | |
3089 | * going to be behind the local queue's | |
3090 | * lock to do the real work | |
3091 | */ | |
3092 | lid = cpu_number(); | |
3093 | ||
3094 | lq = zpercpu_get_cpu(vm_page_local_q, lid); | |
3095 | ||
3096 | VPL_LOCK(&lq->vpl_lock); | |
3097 | ||
3098 | vm_page_check_pageable_safe(m); | |
3099 | vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq); | |
3100 | m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; | |
3101 | m->vmp_local_id = lid; | |
3102 | lq->vpl_count++; | |
3103 | ||
3104 | if (object->internal) { | |
3105 | lq->vpl_internal_count++; | |
3106 | } else { | |
3107 | lq->vpl_external_count++; | |
3108 | } | |
3109 | ||
3110 | VPL_UNLOCK(&lq->vpl_lock); | |
3111 | ||
3112 | if (lq->vpl_count > vm_page_local_q_soft_limit) { | |
3113 | /* | |
3114 | * we're beyond the soft limit | |
3115 | * for the local queue | |
3116 | * vm_page_reactivate_local will | |
3117 | * 'try' to take the global page | |
3118 | * queue lock... if it can't | |
3119 | * that's ok... we'll let the | |
3120 | * queue continue to grow up | |
3121 | * to the hard limit... at that | |
3122 | * point we'll wait for the | |
3123 | * lock... once we've got the | |
3124 | * lock, we'll transfer all of | |
3125 | * the pages from the local | |
3126 | * queue to the global active | |
3127 | * queue | |
3128 | */ | |
3129 | vm_page_reactivate_local(lid, FALSE, FALSE); | |
3130 | } | |
3131 | } else { | |
3132 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); | |
3133 | ||
3134 | /* | |
3135 | * test again now that we hold the | |
3136 | * page queue lock | |
3137 | */ | |
3138 | if (!VM_PAGE_WIRED(m)) { | |
3139 | if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { | |
3140 | vm_page_queues_remove(m, FALSE); | |
3141 | ||
3142 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); | |
3143 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1); | |
3144 | } | |
3145 | ||
3146 | if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) || | |
3147 | no_cache) { | |
3148 | /* | |
3149 | * If this is a no_cache mapping | |
3150 | * and the page has never been | |
3151 | * mapped before or was | |
3152 | * previously a no_cache page, | |
3153 | * then we want to leave pages | |
3154 | * in the speculative state so | |
3155 | * that they can be readily | |
3156 | * recycled if free memory runs | |
3157 | * low. Otherwise the page is | |
3158 | * activated as normal. | |
3159 | */ | |
3160 | ||
3161 | if (no_cache && | |
3162 | (!previously_pmapped || | |
3163 | m->vmp_no_cache)) { | |
3164 | m->vmp_no_cache = TRUE; | |
3165 | ||
3166 | if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) { | |
3167 | vm_page_speculate(m, FALSE); | |
3168 | } | |
3169 | } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) { | |
3170 | vm_page_activate(m); | |
3171 | } | |
3172 | } | |
3173 | } | |
3174 | /* we keep the page queues lock, if we need it later */ | |
3175 | } | |
3176 | } | |
3177 | } | |
3178 | /* we're done with the page queues lock, if we ever took it */ | |
3179 | __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); | |
3180 | } | |
3181 | ||
3182 | /* | |
3183 | * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting. | |
3184 | * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys() | |
3185 | * before being inserted into the pmap. | |
3186 | */ | |
3187 | static bool | |
3188 | vm_fault_enter_set_mapped( | |
3189 | vm_object_t object, | |
3190 | vm_page_t m, | |
3191 | vm_prot_t prot, | |
3192 | vm_prot_t fault_type) | |
3193 | { | |
3194 | bool page_needs_sync = false; | |
3195 | /* | |
3196 | * NOTE: we may only hold the vm_object lock SHARED | |
3197 | * at this point, so we need the phys_page lock to | |
3198 | * properly serialize updating the pmapped and | |
3199 | * xpmapped bits | |
3200 | */ | |
3201 | if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) { | |
3202 | ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); | |
3203 | ||
3204 | pmap_lock_phys_page(phys_page); | |
3205 | m->vmp_pmapped = TRUE; | |
3206 | ||
3207 | if (!m->vmp_xpmapped) { | |
3208 | m->vmp_xpmapped = TRUE; | |
3209 | ||
3210 | pmap_unlock_phys_page(phys_page); | |
3211 | ||
3212 | if (!object->internal) { | |
3213 | OSAddAtomic(1, &vm_page_xpmapped_external_count); | |
3214 | } | |
3215 | ||
3216 | #if defined(__arm__) || defined(__arm64__) | |
3217 | page_needs_sync = true; | |
3218 | #else | |
3219 | if (object->internal && | |
3220 | object->pager != NULL) { | |
3221 | /* | |
3222 | * This page could have been | |
3223 | * uncompressed by the | |
3224 | * compressor pager and its | |
3225 | * contents might be only in | |
3226 | * the data cache. | |
3227 | * Since it's being mapped for | |
3228 | * "execute" for the fist time, | |
3229 | * make sure the icache is in | |
3230 | * sync. | |
3231 | */ | |
3232 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); | |
3233 | page_needs_sync = true; | |
3234 | } | |
3235 | #endif | |
3236 | } else { | |
3237 | pmap_unlock_phys_page(phys_page); | |
3238 | } | |
3239 | } else { | |
3240 | if (m->vmp_pmapped == FALSE) { | |
3241 | ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); | |
3242 | ||
3243 | pmap_lock_phys_page(phys_page); | |
3244 | m->vmp_pmapped = TRUE; | |
3245 | pmap_unlock_phys_page(phys_page); | |
3246 | } | |
3247 | } | |
3248 | ||
3249 | if (fault_type & VM_PROT_WRITE) { | |
3250 | if (m->vmp_wpmapped == FALSE) { | |
3251 | vm_object_lock_assert_exclusive(object); | |
3252 | if (!object->internal && object->pager) { | |
3253 | task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager)); | |
3254 | } | |
3255 | m->vmp_wpmapped = TRUE; | |
3256 | } | |
3257 | } | |
3258 | return page_needs_sync; | |
3259 | } | |
3260 | ||
3261 | /* | |
3262 | * Try to enter the given page into the pmap. | |
3263 | * Will retry without execute permission iff PMAP_CS is enabled and we encounter | |
3264 | * a codesigning failure on a non-execute fault. | |
3265 | */ | |
3266 | static kern_return_t | |
3267 | vm_fault_attempt_pmap_enter( | |
3268 | pmap_t pmap, | |
3269 | vm_map_offset_t vaddr, | |
3270 | vm_map_size_t fault_page_size, | |
3271 | vm_map_offset_t fault_phys_offset, | |
3272 | vm_page_t m, | |
3273 | vm_prot_t *prot, | |
3274 | vm_prot_t caller_prot, | |
3275 | vm_prot_t fault_type, | |
3276 | bool wired, | |
3277 | int pmap_options) | |
3278 | { | |
3279 | #if !PMAP_CS | |
3280 | #pragma unused(caller_prot) | |
3281 | #endif /* !PMAP_CS */ | |
3282 | kern_return_t kr; | |
3283 | if (fault_page_size != PAGE_SIZE) { | |
3284 | DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type); | |
3285 | assertf((!(fault_phys_offset & FOURK_PAGE_MASK) && | |
3286 | fault_phys_offset < PAGE_SIZE), | |
3287 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
3288 | } else { | |
3289 | assertf(fault_phys_offset == 0, | |
3290 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
3291 | } | |
3292 | ||
3293 | PMAP_ENTER_OPTIONS(pmap, vaddr, | |
3294 | fault_phys_offset, | |
3295 | m, *prot, fault_type, 0, | |
3296 | wired, | |
3297 | pmap_options, | |
3298 | kr); | |
3299 | return kr; | |
3300 | } | |
3301 | ||
3302 | /* | |
3303 | * Enter the given page into the pmap. | |
3304 | * The map must be locked shared. | |
3305 | * The vm object must NOT be locked. | |
3306 | * | |
3307 | * @param need_retry if not null, avoid making a (potentially) blocking call into | |
3308 | * the pmap layer. When such a call would be necessary, return true in this boolean instead. | |
3309 | */ | |
3310 | static kern_return_t | |
3311 | vm_fault_pmap_enter( | |
3312 | pmap_t pmap, | |
3313 | vm_map_offset_t vaddr, | |
3314 | vm_map_size_t fault_page_size, | |
3315 | vm_map_offset_t fault_phys_offset, | |
3316 | vm_page_t m, | |
3317 | vm_prot_t *prot, | |
3318 | vm_prot_t caller_prot, | |
3319 | vm_prot_t fault_type, | |
3320 | bool wired, | |
3321 | int pmap_options, | |
3322 | boolean_t *need_retry) | |
3323 | { | |
3324 | kern_return_t kr; | |
3325 | if (need_retry != NULL) { | |
3326 | /* | |
3327 | * Although we don't hold a lock on this object, we hold a lock | |
3328 | * on the top object in the chain. To prevent a deadlock, we | |
3329 | * can't allow the pmap layer to block. | |
3330 | */ | |
3331 | pmap_options |= PMAP_OPTIONS_NOWAIT; | |
3332 | } | |
3333 | kr = vm_fault_attempt_pmap_enter(pmap, vaddr, | |
3334 | fault_page_size, fault_phys_offset, | |
3335 | m, prot, caller_prot, fault_type, wired, pmap_options); | |
3336 | if (kr == KERN_RESOURCE_SHORTAGE) { | |
3337 | if (need_retry) { | |
3338 | /* | |
3339 | * There's nothing we can do here since we hold the | |
3340 | * lock on the top object in the chain. The caller | |
3341 | * will need to deal with this by dropping that lock and retrying. | |
3342 | */ | |
3343 | *need_retry = TRUE; | |
3344 | vm_pmap_enter_retried++; | |
3345 | } | |
3346 | } | |
3347 | return kr; | |
3348 | } | |
3349 | ||
3350 | /* | |
3351 | * Enter the given page into the pmap. | |
3352 | * The vm map must be locked shared. | |
3353 | * The vm object must be locked exclusive, unless this is a soft fault. | |
3354 | * For a soft fault, the object must be locked shared or exclusive. | |
3355 | * | |
3356 | * @param need_retry if not null, avoid making a (potentially) blocking call into | |
3357 | * the pmap layer. When such a call would be necessary, return true in this boolean instead. | |
3358 | */ | |
3359 | static kern_return_t | |
3360 | vm_fault_pmap_enter_with_object_lock( | |
3361 | vm_object_t object, | |
3362 | pmap_t pmap, | |
3363 | vm_map_offset_t vaddr, | |
3364 | vm_map_size_t fault_page_size, | |
3365 | vm_map_offset_t fault_phys_offset, | |
3366 | vm_page_t m, | |
3367 | vm_prot_t *prot, | |
3368 | vm_prot_t caller_prot, | |
3369 | vm_prot_t fault_type, | |
3370 | bool wired, | |
3371 | int pmap_options, | |
3372 | boolean_t *need_retry) | |
3373 | { | |
3374 | kern_return_t kr; | |
3375 | /* | |
3376 | * Prevent a deadlock by not | |
3377 | * holding the object lock if we need to wait for a page in | |
3378 | * pmap_enter() - <rdar://problem/7138958> | |
3379 | */ | |
3380 | kr = vm_fault_attempt_pmap_enter(pmap, vaddr, | |
3381 | fault_page_size, fault_phys_offset, | |
3382 | m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT); | |
3383 | #if __x86_64__ | |
3384 | if (kr == KERN_INVALID_ARGUMENT && | |
3385 | pmap == PMAP_NULL && | |
3386 | wired) { | |
3387 | /* | |
3388 | * Wiring a page in a pmap-less VM map: | |
3389 | * VMware's "vmmon" kernel extension does this | |
3390 | * to grab pages. | |
3391 | * Let it proceed even though the PMAP_ENTER() failed. | |
3392 | */ | |
3393 | kr = KERN_SUCCESS; | |
3394 | } | |
3395 | #endif /* __x86_64__ */ | |
3396 | ||
3397 | if (kr == KERN_RESOURCE_SHORTAGE) { | |
3398 | if (need_retry) { | |
3399 | /* | |
3400 | * this will be non-null in the case where we hold the lock | |
3401 | * on the top-object in this chain... we can't just drop | |
3402 | * the lock on the object we're inserting the page into | |
3403 | * and recall the PMAP_ENTER since we can still cause | |
3404 | * a deadlock if one of the critical paths tries to | |
3405 | * acquire the lock on the top-object and we're blocked | |
3406 | * in PMAP_ENTER waiting for memory... our only recourse | |
3407 | * is to deal with it at a higher level where we can | |
3408 | * drop both locks. | |
3409 | */ | |
3410 | *need_retry = TRUE; | |
3411 | vm_pmap_enter_retried++; | |
3412 | goto done; | |
3413 | } | |
3414 | /* | |
3415 | * The nonblocking version of pmap_enter did not succeed, | |
3416 | * and we don't need to drop other locks and retry | |
3417 | * at the level above us, so | |
3418 | * use the blocking version instead. Requires marking | |
3419 | * the page busy and unlocking the object | |
3420 | */ | |
3421 | boolean_t was_busy = m->vmp_busy; | |
3422 | ||
3423 | vm_object_lock_assert_exclusive(object); | |
3424 | ||
3425 | m->vmp_busy = TRUE; | |
3426 | vm_object_unlock(object); | |
3427 | ||
3428 | PMAP_ENTER_OPTIONS(pmap, vaddr, | |
3429 | fault_phys_offset, | |
3430 | m, *prot, fault_type, | |
3431 | 0, wired, | |
3432 | pmap_options, kr); | |
3433 | ||
3434 | assert(VM_PAGE_OBJECT(m) == object); | |
3435 | ||
3436 | /* Take the object lock again. */ | |
3437 | vm_object_lock(object); | |
3438 | ||
3439 | /* If the page was busy, someone else will wake it up. | |
3440 | * Otherwise, we have to do it now. */ | |
3441 | assert(m->vmp_busy); | |
3442 | if (!was_busy) { | |
3443 | PAGE_WAKEUP_DONE(m); | |
3444 | } | |
3445 | vm_pmap_enter_blocked++; | |
3446 | } | |
3447 | ||
3448 | done: | |
3449 | return kr; | |
3450 | } | |
3451 | ||
3452 | /* | |
3453 | * Prepare to enter a page into the pmap by checking CS, protection bits, | |
3454 | * and setting mapped bits on the page_t. | |
3455 | * Does not modify the page's paging queue. | |
3456 | * | |
3457 | * page queue lock must NOT be held | |
3458 | * m->vmp_object must be locked | |
3459 | * | |
3460 | * NOTE: m->vmp_object could be locked "shared" only if we are called | |
3461 | * from vm_fault() as part of a soft fault. | |
3462 | */ | |
3463 | static kern_return_t | |
3464 | vm_fault_enter_prepare( | |
3465 | vm_page_t m, | |
3466 | pmap_t pmap, | |
3467 | vm_map_offset_t vaddr, | |
3468 | vm_prot_t *prot, | |
3469 | vm_prot_t caller_prot, | |
3470 | vm_map_size_t fault_page_size, | |
3471 | vm_map_offset_t fault_phys_offset, | |
3472 | boolean_t change_wiring, | |
3473 | vm_prot_t fault_type, | |
3474 | vm_object_fault_info_t fault_info, | |
3475 | int *type_of_fault, | |
3476 | bool *page_needs_data_sync) | |
3477 | { | |
3478 | kern_return_t kr; | |
3479 | bool is_tainted = false; | |
3480 | vm_object_t object; | |
3481 | boolean_t cs_bypass = fault_info->cs_bypass; | |
3482 | ||
3483 | object = VM_PAGE_OBJECT(m); | |
3484 | ||
3485 | vm_object_lock_assert_held(object); | |
3486 | ||
3487 | #if KASAN | |
3488 | if (pmap == kernel_pmap) { | |
3489 | kasan_notify_address(vaddr, PAGE_SIZE); | |
3490 | } | |
3491 | #endif | |
3492 | ||
3493 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); | |
3494 | ||
3495 | if (*type_of_fault == DBG_ZERO_FILL_FAULT) { | |
3496 | vm_object_lock_assert_exclusive(object); | |
3497 | } else if ((fault_type & VM_PROT_WRITE) == 0 && | |
3498 | !change_wiring && | |
3499 | (!m->vmp_wpmapped | |
3500 | #if VM_OBJECT_ACCESS_TRACKING | |
3501 | || object->access_tracking | |
3502 | #endif /* VM_OBJECT_ACCESS_TRACKING */ | |
3503 | )) { | |
3504 | /* | |
3505 | * This is not a "write" fault, so we | |
3506 | * might not have taken the object lock | |
3507 | * exclusively and we might not be able | |
3508 | * to update the "wpmapped" bit in | |
3509 | * vm_fault_enter(). | |
3510 | * Let's just grant read access to | |
3511 | * the page for now and we'll | |
3512 | * soft-fault again if we need write | |
3513 | * access later... | |
3514 | */ | |
3515 | ||
3516 | /* This had better not be a JIT page. */ | |
3517 | if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) { | |
3518 | *prot &= ~VM_PROT_WRITE; | |
3519 | } else { | |
3520 | assert(cs_bypass); | |
3521 | } | |
3522 | } | |
3523 | if (m->vmp_pmapped == FALSE) { | |
3524 | if (m->vmp_clustered) { | |
3525 | if (*type_of_fault == DBG_CACHE_HIT_FAULT) { | |
3526 | /* | |
3527 | * found it in the cache, but this | |
3528 | * is the first fault-in of the page (m->vmp_pmapped == FALSE) | |
3529 | * so it must have come in as part of | |
3530 | * a cluster... account 1 pagein against it | |
3531 | */ | |
3532 | if (object->internal) { | |
3533 | *type_of_fault = DBG_PAGEIND_FAULT; | |
3534 | } else { | |
3535 | *type_of_fault = DBG_PAGEINV_FAULT; | |
3536 | } | |
3537 | ||
3538 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
3539 | } | |
3540 | VM_PAGE_CONSUME_CLUSTERED(m); | |
3541 | } | |
3542 | } | |
3543 | ||
3544 | if (*type_of_fault != DBG_COW_FAULT) { | |
3545 | DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL); | |
3546 | ||
3547 | if (pmap == kernel_pmap) { | |
3548 | DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL); | |
3549 | } | |
3550 | } | |
3551 | ||
3552 | kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr, | |
3553 | *prot, caller_prot, fault_page_size, fault_phys_offset, | |
3554 | fault_info, &is_tainted); | |
3555 | if (kr == KERN_SUCCESS) { | |
3556 | /* | |
3557 | * We either have a good page, or a tainted page that has been accepted by the process. | |
3558 | * In both cases the page will be entered into the pmap. | |
3559 | */ | |
3560 | *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type); | |
3561 | if ((fault_type & VM_PROT_WRITE) && is_tainted) { | |
3562 | /* | |
3563 | * This page is tainted but we're inserting it anyways. | |
3564 | * Since it's writeable, we need to disconnect it from other pmaps | |
3565 | * now so those processes can take note. | |
3566 | */ | |
3567 | ||
3568 | /* | |
3569 | * We can only get here | |
3570 | * because of the CSE logic | |
3571 | */ | |
3572 | assert(pmap_get_vm_map_cs_enforced(pmap)); | |
3573 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); | |
3574 | /* | |
3575 | * If we are faulting for a write, we can clear | |
3576 | * the execute bit - that will ensure the page is | |
3577 | * checked again before being executable, which | |
3578 | * protects against a map switch. | |
3579 | * This only happens the first time the page | |
3580 | * gets tainted, so we won't get stuck here | |
3581 | * to make an already writeable page executable. | |
3582 | */ | |
3583 | if (!cs_bypass) { | |
3584 | assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)); | |
3585 | *prot &= ~VM_PROT_EXECUTE; | |
3586 | } | |
3587 | } | |
3588 | assert(VM_PAGE_OBJECT(m) == object); | |
3589 | ||
3590 | #if VM_OBJECT_ACCESS_TRACKING | |
3591 | if (object->access_tracking) { | |
3592 | DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type); | |
3593 | if (fault_type & VM_PROT_WRITE) { | |
3594 | object->access_tracking_writes++; | |
3595 | vm_object_access_tracking_writes++; | |
3596 | } else { | |
3597 | object->access_tracking_reads++; | |
3598 | vm_object_access_tracking_reads++; | |
3599 | } | |
3600 | } | |
3601 | #endif /* VM_OBJECT_ACCESS_TRACKING */ | |
3602 | } | |
3603 | ||
3604 | return kr; | |
3605 | } | |
3606 | ||
3607 | /* | |
3608 | * page queue lock must NOT be held | |
3609 | * m->vmp_object must be locked | |
3610 | * | |
3611 | * NOTE: m->vmp_object could be locked "shared" only if we are called | |
3612 | * from vm_fault() as part of a soft fault. If so, we must be | |
3613 | * careful not to modify the VM object in any way that is not | |
3614 | * legal under a shared lock... | |
3615 | */ | |
3616 | kern_return_t | |
3617 | vm_fault_enter( | |
3618 | vm_page_t m, | |
3619 | pmap_t pmap, | |
3620 | vm_map_offset_t vaddr, | |
3621 | vm_map_size_t fault_page_size, | |
3622 | vm_map_offset_t fault_phys_offset, | |
3623 | vm_prot_t prot, | |
3624 | vm_prot_t caller_prot, | |
3625 | boolean_t wired, | |
3626 | boolean_t change_wiring, | |
3627 | vm_tag_t wire_tag, | |
3628 | vm_object_fault_info_t fault_info, | |
3629 | boolean_t *need_retry, | |
3630 | int *type_of_fault) | |
3631 | { | |
3632 | kern_return_t kr; | |
3633 | vm_object_t object; | |
3634 | bool page_needs_data_sync; | |
3635 | vm_prot_t fault_type; | |
3636 | int pmap_options = fault_info->pmap_options; | |
3637 | ||
3638 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
3639 | assert(m->vmp_fictitious); | |
3640 | return KERN_SUCCESS; | |
3641 | } | |
3642 | ||
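| /* | |
|  * A wiring change is a pseudo-fault: no particular access is being | |
|  * attempted, so the fault type is VM_PROT_NONE. | |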
3643 | fault_type = change_wiring ? VM_PROT_NONE : caller_prot; | |
3644 | ||
3645 | kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot, | |
3646 | fault_page_size, fault_phys_offset, change_wiring, fault_type, | |
3647 | fault_info, type_of_fault, &page_needs_data_sync); | |
3648 | object = VM_PAGE_OBJECT(m); | |
3649 | ||
3650 | vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr); | |
3651 | ||
3652 | if (kr == KERN_SUCCESS) { | |
3653 | if (page_needs_data_sync) { | |
3654 | pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
3655 | } | |
3656 | ||
3657 | kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr, | |
3658 | fault_page_size, fault_phys_offset, m, | |
3659 | &prot, caller_prot, fault_type, wired, pmap_options, need_retry); | |
3660 | } | |
3661 | ||
3662 | return kr; | |
3663 | } | |
3664 | ||
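| /* | |
|  * Pre-warm the mapping for "vaddr" in the current map: if no physical page | |
|  * is resident in the pmap yet, take the fault now (uninterruptible, no | |
|  * wiring) so that a later access does not have to. | |
|  * Illustrative (hypothetical) use by a caller about to touch a user buffer: | |
|  *	vm_pre_fault(user_buf_addr, VM_PROT_READ | VM_PROT_WRITE); | |
|  */ | |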
3665 | void | |
3666 | vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot) | |
3667 | { | |
3668 | if (pmap_find_phys(current_map()->pmap, vaddr) == 0) { | |
3669 | vm_fault(current_map(), /* map */ | |
3670 | vaddr, /* vaddr */ | |
3671 | prot, /* fault_type */ | |
3672 | FALSE, /* change_wiring */ | |
3673 | VM_KERN_MEMORY_NONE, /* tag - not wiring */ | |
3674 | THREAD_UNINT, /* interruptible */ | |
3675 | NULL, /* caller_pmap */ | |
3676 | 0 /* caller_pmap_addr */); | |
3677 | } | |
3678 | } | |
3679 | ||
3680 | ||
3681 | /* | |
3682 | * Routine: vm_fault | |
3683 | * Purpose: | |
3684 | * Handle page faults, including pseudo-faults | |
3685 | * used to change the wiring status of pages. | |
3686 | * Returns: | |
3687 | * Explicit continuations have been removed. | |
3688 | * Implementation: | |
3689 | * vm_fault and vm_fault_page save mucho state | |
3690 | * in the moral equivalent of a closure. The state | |
3691 | * structure is allocated when first entering vm_fault | |
3692 | * and deallocated when leaving vm_fault. | |
3693 | */ | |
3694 | ||
3695 | extern uint64_t get_current_unique_pid(void); | |
3696 | ||
3697 | unsigned long vm_fault_collapse_total = 0; | |
3698 | unsigned long vm_fault_collapse_skipped = 0; | |
3699 | ||
3700 | ||
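| /* | |
|  * Exported wrapper around vm_fault_internal(); when the fault is a wiring | |
|  * request, the wire tag is derived from the caller's backtrace (vm_tag_bt()). | |
|  */ | |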
3701 | kern_return_t | |
3702 | vm_fault_external( | |
3703 | vm_map_t map, | |
3704 | vm_map_offset_t vaddr, | |
3705 | vm_prot_t fault_type, | |
3706 | boolean_t change_wiring, | |
3707 | int interruptible, | |
3708 | pmap_t caller_pmap, | |
3709 | vm_map_offset_t caller_pmap_addr) | |
3710 | { | |
3711 | return vm_fault_internal(map, vaddr, fault_type, change_wiring, | |
3712 | change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE, | |
3713 | interruptible, caller_pmap, caller_pmap_addr, | |
3714 | NULL); | |
3715 | } | |
3716 | ||
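| /* | |
|  * In-kernel entry point: same as vm_fault_external() except that the caller | |
|  * supplies the wire tag explicitly when wiring. | |
|  */ | |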
3717 | kern_return_t | |
3718 | vm_fault( | |
3719 | vm_map_t map, | |
3720 | vm_map_offset_t vaddr, | |
3721 | vm_prot_t fault_type, | |
3722 | boolean_t change_wiring, | |
3723 | vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ | |
3724 | int interruptible, | |
3725 | pmap_t caller_pmap, | |
3726 | vm_map_offset_t caller_pmap_addr) | |
3727 | { | |
3728 | return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag, | |
3729 | interruptible, caller_pmap, caller_pmap_addr, | |
3730 | NULL); | |
3731 | } | |
3732 | ||
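| /* "Privileged" here means the current process is a platform binary. */ | |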
3733 | static boolean_t | |
3734 | current_proc_is_privileged(void) | |
3735 | { | |
3736 | return csproc_get_platform_binary(current_proc()); | |
3737 | } | |
3738 | ||
3739 | uint64_t vm_copied_on_read = 0; | |
3740 | ||
3741 | /* | |
3742 | * Cleanup after a vm_fault_enter. | |
3743 | * At this point, the fault should either have failed (kr != KERN_SUCCESS) | |
3744 | * or the page should be in the pmap and on the correct paging queue. | |
3745 | * | |
3746 | * Precondition: | |
3747 | * map must be locked shared. | |
3748 | * m_object must be locked. | |
3749 | * If top_object != VM_OBJECT_NULL, it must be locked. | |
3750 | * real_map must be locked. | |
3751 | * | |
3752 | * Postcondition: | |
3753 | * map will be unlocked | |
3754 | * m_object will be unlocked | |
3755 | * top_object will be unlocked | |
3756 | * If real_map != map, it will be unlocked | |
3757 | */ | |
3758 | static void | |
3759 | vm_fault_complete( | |
3760 | vm_map_t map, | |
3761 | vm_map_t real_map, | |
3762 | vm_object_t object, | |
3763 | vm_object_t m_object, | |
3764 | vm_page_t m, | |
3765 | vm_map_offset_t offset, | |
3766 | vm_map_offset_t trace_real_vaddr, | |
3767 | vm_object_fault_info_t fault_info, | |
3768 | vm_prot_t caller_prot, | |
3769 | #if CONFIG_DTRACE | |
3770 | vm_map_offset_t real_vaddr, | |
3771 | #else | |
3772 | __unused vm_map_offset_t real_vaddr, | |
3773 | #endif /* CONFIG_DTRACE */ | |
3774 | int type_of_fault, | |
3775 | boolean_t need_retry, | |
3776 | kern_return_t kr, | |
3777 | ppnum_t *physpage_p, | |
3778 | vm_prot_t prot, | |
3779 | vm_object_t top_object, | |
3780 | boolean_t need_collapse, | |
3781 | vm_map_offset_t cur_offset, | |
3782 | vm_prot_t fault_type, | |
3783 | vm_object_t *written_on_object, | |
3784 | memory_object_t *written_on_pager, | |
3785 | vm_object_offset_t *written_on_offset) | |
3786 | { | |
3787 | int event_code = 0; | |
3788 | vm_map_lock_assert_shared(map); | |
3789 | vm_object_lock_assert_held(m_object); | |
3790 | if (top_object != VM_OBJECT_NULL) { | |
3791 | vm_object_lock_assert_held(top_object); | |
3792 | } | |
3793 | vm_map_lock_assert_held(real_map); | |
3794 | ||
3795 | if (m_object->internal) { | |
3796 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); | |
3797 | } else if (m_object->object_is_shared_cache) { | |
3798 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); | |
3799 | } else { | |
3800 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); | |
3801 | } | |
3802 | ||
3803 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0); | |
3804 | if (need_retry == FALSE) { | |
3805 | KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0); | |
3806 | } | |
3807 | DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag); | |
3808 | if (kr == KERN_SUCCESS && | |
3809 | physpage_p != NULL) { | |
3810 | /* for vm_map_wire_and_extract() */ | |
3811 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); | |
3812 | if (prot & VM_PROT_WRITE) { | |
3813 | vm_object_lock_assert_exclusive(m_object); | |
3814 | m->vmp_dirty = TRUE; | |
3815 | } | |
3816 | } | |
3817 | ||
3818 | if (top_object != VM_OBJECT_NULL) { | |
3819 | /* | |
3820 | * It's safe to drop the top object | |
3821 | * now that we've done our | |
3822 | * vm_fault_enter(). Any other fault | |
3823 | * in progress for that virtual | |
3824 | * address will either find our page | |
3825 | * and translation or put in a new page | |
3826 | * and translation. | |
3827 | */ | |
3828 | vm_object_unlock(top_object); | |
3829 | top_object = VM_OBJECT_NULL; | |
3830 | } | |
3831 | ||
3832 | if (need_collapse == TRUE) { | |
3833 | vm_object_collapse(object, vm_object_trunc_page(offset), TRUE); | |
3834 | } | |
3835 | ||
3836 | if (need_retry == FALSE && | |
3837 | (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) { | |
3838 | /* | |
3839 | * evaluate access pattern and update state | |
3840 | * vm_fault_deactivate_behind depends on the | |
3841 | * state being up to date | |
3842 | */ | |
3843 | vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior); | |
3844 | ||
3845 | vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior); | |
3846 | } | |
3847 | /* | |
3848 | * That's it, clean up and return. | |
3849 | */ | |
3850 | if (m->vmp_busy) { | |
3851 | vm_object_lock_assert_exclusive(m_object); | |
3852 | PAGE_WAKEUP_DONE(m); | |
3853 | } | |
3854 | ||
3855 | if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) { | |
3856 | vm_object_paging_begin(m_object); | |
3857 | ||
3858 | assert(*written_on_object == VM_OBJECT_NULL); | |
3859 | *written_on_object = m_object; | |
3860 | *written_on_pager = m_object->pager; | |
3861 | *written_on_offset = m_object->paging_offset + m->vmp_offset; | |
3862 | } | |
3863 | vm_object_unlock(object); | |
3864 | ||
3865 | vm_map_unlock_read(map); | |
3866 | if (real_map != map) { | |
3867 | vm_map_unlock(real_map); | |
3868 | } | |
3869 | } | |
3870 | ||
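| /* | |
|  * For tracing only: a copy-on-write fault that was actually taken as a | |
|  * copy-on-read is reported as DBG_COR_FAULT. | |
|  */ | |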
3871 | static inline int | |
3872 | vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault) | |
3873 | { | |
3874 | if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) { | |
3875 | return DBG_COR_FAULT; | |
3876 | } | |
3877 | return type_of_fault; | |
3878 | } | |
3879 | ||
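| /* | |
|  * Common implementation behind vm_fault() and vm_fault_external(). | |
|  * It first attempts the "fast fault" path below, which resolves the fault | |
|  * while holding the map and object locks; anything it cannot handle falls | |
|  * back to the slower vm_fault_page() machinery. | |
|  */ | |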
3880 | kern_return_t | |
3881 | vm_fault_internal( | |
3882 | vm_map_t map, | |
3883 | vm_map_offset_t vaddr, | |
3884 | vm_prot_t caller_prot, | |
3885 | boolean_t change_wiring, | |
3886 | vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ | |
3887 | int interruptible, | |
3888 | pmap_t caller_pmap, | |
3889 | vm_map_offset_t caller_pmap_addr, | |
3890 | ppnum_t *physpage_p) | |
3891 | { | |
3892 | vm_map_version_t version; /* Map version for verification */ | |
3893 | boolean_t wired; /* Should mapping be wired down? */ | |
3894 | vm_object_t object; /* Top-level object */ | |
3895 | vm_object_offset_t offset; /* Top-level offset */ | |
3896 | vm_prot_t prot; /* Protection for mapping */ | |
3897 | vm_object_t old_copy_object; /* Saved copy object */ | |
3898 | vm_page_t result_page; /* Result of vm_fault_page */ | |
3899 | vm_page_t top_page; /* Placeholder page */ | |
3900 | kern_return_t kr; | |
3901 | ||
3902 | vm_page_t m; /* Fast access to result_page */ | |
3903 | kern_return_t error_code; | |
3904 | vm_object_t cur_object; | |
3905 | vm_object_t m_object = NULL; | |
3906 | vm_object_offset_t cur_offset; | |
3907 | vm_page_t cur_m; | |
3908 | vm_object_t new_object; | |
3909 | int type_of_fault; | |
3910 | pmap_t pmap; | |
3911 | wait_interrupt_t interruptible_state; | |
3912 | vm_map_t real_map = map; | |
3913 | vm_map_t original_map = map; | |
3914 | bool object_locks_dropped = FALSE; | |
3915 | vm_prot_t fault_type; | |
3916 | vm_prot_t original_fault_type; | |
3917 | struct vm_object_fault_info fault_info = {}; | |
3918 | bool need_collapse = FALSE; | |
3919 | boolean_t need_retry = FALSE; | |
3920 | boolean_t *need_retry_ptr = NULL; | |
3921 | uint8_t object_lock_type = 0; | |
3922 | uint8_t cur_object_lock_type; | |
3923 | vm_object_t top_object = VM_OBJECT_NULL; | |
3924 | vm_object_t written_on_object = VM_OBJECT_NULL; | |
3925 | memory_object_t written_on_pager = NULL; | |
3926 | vm_object_offset_t written_on_offset = 0; | |
3927 | int throttle_delay; | |
3928 | int compressed_count_delta; | |
3929 | uint8_t grab_options; | |
3930 | bool need_copy; | |
3931 | bool need_copy_on_read; | |
3932 | vm_map_offset_t trace_vaddr; | |
3933 | vm_map_offset_t trace_real_vaddr; | |
3934 | vm_map_size_t fault_page_size; | |
3935 | vm_map_size_t fault_page_mask; | |
3936 | vm_map_offset_t fault_phys_offset; | |
3937 | vm_map_offset_t real_vaddr; | |
3938 | bool resilient_media_retry = FALSE; | |
3939 | vm_object_t resilient_media_object = VM_OBJECT_NULL; | |
3940 | vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1; | |
3941 | bool page_needs_data_sync = false; | |
3942 | /* | |
3943 | * Was the VM object contended when vm_map_lookup_locked locked it? | |
3944 | * If so, the zero fill path will drop the lock. | |
3945 | * NB: Ideally we would always drop the lock rather than rely on | |
3946 | * this heuristic, but vm_object_unlock currently takes > 30 cycles. | |
3947 | */ | |
3948 | bool object_is_contended = false; | |
3949 | ||
3950 | real_vaddr = vaddr; | |
3951 | trace_real_vaddr = vaddr; | |
3952 | ||
3953 | if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) { | |
3954 | fault_phys_offset = (vm_map_offset_t)-1; | |
3955 | fault_page_size = VM_MAP_PAGE_SIZE(original_map); | |
3956 | fault_page_mask = VM_MAP_PAGE_MASK(original_map); | |
3957 | if (fault_page_size < PAGE_SIZE) { | |
3958 | DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot); | |
3959 | vaddr = vm_map_trunc_page(vaddr, fault_page_mask); | |
3960 | } | |
3961 | } else { | |
3962 | fault_phys_offset = 0; | |
3963 | fault_page_size = PAGE_SIZE; | |
3964 | fault_page_mask = PAGE_MASK; | |
3965 | vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); | |
3966 | } | |
3967 | ||
3968 | if (map == kernel_map) { | |
3969 | trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr); | |
3970 | trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr); | |
3971 | } else { | |
3972 | trace_vaddr = vaddr; | |
3973 | } | |
3974 | ||
3975 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, | |
3976 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, | |
3977 | ((uint64_t)trace_vaddr >> 32), | |
3978 | trace_vaddr, | |
3979 | (map == kernel_map), | |
3980 | 0, | |
3981 | 0); | |
3982 | ||
3983 | if (get_preemption_level() != 0) { | |
3984 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, | |
3985 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, | |
3986 | ((uint64_t)trace_vaddr >> 32), | |
3987 | trace_vaddr, | |
3988 | KERN_FAILURE, | |
3989 | 0, | |
3990 | 0); | |
3991 | ||
3992 | return KERN_FAILURE; | |
3993 | } | |
3994 | ||
3995 | thread_t cthread = current_thread(); | |
3996 | bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME); | |
3997 | uint64_t fstart = 0; | |
3998 | ||
3999 | if (rtfault) { | |
4000 | fstart = mach_continuous_time(); | |
4001 | } | |
4002 | ||
4003 | interruptible_state = thread_interrupt_level(interruptible); | |
4004 | ||
4005 | fault_type = (change_wiring ? VM_PROT_NONE : caller_prot); | |
4006 | ||
4007 | VM_STAT_INCR(faults); | |
4008 | current_task()->faults++; | |
4009 | original_fault_type = fault_type; | |
4010 | ||
4011 | need_copy = FALSE; | |
4012 | if (fault_type & VM_PROT_WRITE) { | |
4013 | need_copy = TRUE; | |
4014 | } | |
4015 | ||
4016 | if (need_copy || change_wiring) { | |
4017 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4018 | } else { | |
4019 | object_lock_type = OBJECT_LOCK_SHARED; | |
4020 | } | |
4021 | ||
4022 | cur_object_lock_type = OBJECT_LOCK_SHARED; | |
4023 | ||
4024 | if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) { | |
4025 | if (compressor_map) { | |
4026 | if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) { | |
4027 | panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map)); | |
4028 | } | |
4029 | } | |
4030 | } | |
4031 | RetryFault: | |
4032 | assert(written_on_object == VM_OBJECT_NULL); | |
4033 | ||
4034 | /* | |
4035 | * assume we will hit a page in the cache | |
4036 | * otherwise, explicitly override with | |
4037 | * the real fault type once we determine it | |
4038 | */ | |
4039 | type_of_fault = DBG_CACHE_HIT_FAULT; | |
4040 | ||
4041 | /* | |
4042 | * Find the backing store object and offset into | |
4043 | * it to begin the search. | |
4044 | */ | |
4045 | fault_type = original_fault_type; | |
4046 | map = original_map; | |
4047 | vm_map_lock_read(map); | |
4048 | ||
4049 | if (resilient_media_retry) { | |
4050 | /* | |
4051 | * If we have to insert a fake zero-filled page to hide | |
4052 | * a media failure to provide the real page, we need to | |
4053 | * resolve any pending copy-on-write on this mapping. | |
4054 | * VM_PROT_COPY tells vm_map_lookup_locked() to deal | |
4055 | * with that even if this is not a "write" fault. | |
4056 | */ | |
4057 | need_copy = TRUE; | |
4058 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4059 | } | |
4060 | ||
4061 | kr = vm_map_lookup_locked(&map, vaddr, | |
4062 | (fault_type | (need_copy ? VM_PROT_COPY : 0)), | |
4063 | object_lock_type, &version, | |
4064 | &object, &offset, &prot, &wired, | |
4065 | &fault_info, | |
4066 | &real_map, | |
4067 | &object_is_contended); | |
4068 | ||
4069 | if (kr != KERN_SUCCESS) { | |
4070 | vm_map_unlock_read(map); | |
4071 | goto done; | |
4072 | } | |
4073 | ||
4074 | ||
4075 | pmap = real_map->pmap; | |
4076 | fault_info.interruptible = interruptible; | |
4077 | fault_info.stealth = FALSE; | |
4078 | fault_info.io_sync = FALSE; | |
4079 | fault_info.mark_zf_absent = FALSE; | |
4080 | fault_info.batch_pmap_op = FALSE; | |
4081 | ||
4082 | if (resilient_media_retry) { | |
4083 | /* | |
4084 | * We're retrying this fault after having detected a media | |
4085 | * failure from a "resilient_media" mapping. | |
4086 | * Check that the mapping is still pointing at the object | |
4087 | * that just failed to provide a page. | |
4088 | */ | |
4089 | assert(resilient_media_object != VM_OBJECT_NULL); | |
4090 | assert(resilient_media_offset != (vm_object_offset_t)-1); | |
4091 | if (object != VM_OBJECT_NULL && | |
4092 | object == resilient_media_object && | |
4093 | offset == resilient_media_offset && | |
4094 | fault_info.resilient_media) { | |
4095 | /* | |
4096 | * This mapping still points at the same object | |
4097 | * and is still "resilient_media": proceed in | |
4098 | * "recovery-from-media-failure" mode, where we'll | |
4099 | * insert a zero-filled page in the top object. | |
4100 | */ | |
4101 | // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset); | |
4102 | } else { | |
4103 | /* not recovering: reset state */ | |
4104 | // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset); | |
4105 | resilient_media_retry = FALSE; | |
4106 | /* release our extra reference on failed object */ | |
4107 | // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); | |
4108 | vm_object_deallocate(resilient_media_object); | |
4109 | resilient_media_object = VM_OBJECT_NULL; | |
4110 | resilient_media_offset = (vm_object_offset_t)-1; | |
4111 | } | |
4112 | } else { | |
4113 | assert(resilient_media_object == VM_OBJECT_NULL); | |
4114 | resilient_media_offset = (vm_object_offset_t)-1; | |
4115 | } | |
4116 | ||
4117 | /* | |
4118 | * If the page is wired, we must fault for the current protection | |
4119 | * value, to avoid further faults. | |
4120 | */ | |
4121 | if (wired) { | |
4122 | fault_type = prot | VM_PROT_WRITE; | |
4123 | } | |
4124 | if (wired || need_copy) { | |
4125 | /* | |
4126 | * since we're treating this fault as a 'write' | |
4127 | * we must hold the top object lock exclusively | |
4128 | */ | |
4129 | if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4130 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4131 | ||
4132 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4133 | /* | |
4134 | * couldn't upgrade, so explicitly | |
4135 | * take the lock exclusively | |
4136 | */ | |
4137 | vm_object_lock(object); | |
4138 | } | |
4139 | } | |
4140 | } | |
4141 | ||
4142 | #if VM_FAULT_CLASSIFY | |
4143 | /* | |
4144 | * Temporary data gathering code | |
4145 | */ | |
4146 | vm_fault_classify(object, offset, fault_type); | |
4147 | #endif | |
4148 | /* | |
4149 | * Fast fault code. The basic idea is to do as much as | |
4150 | * possible while holding the map lock and object locks. | |
4151 | * Busy pages are not used until the object lock has to | |
4152 | * be dropped to do something (copy, zero fill, pmap enter). | |
4153 | * Similarly, paging references aren't acquired until that | |
4154 | * point, and object references aren't used. | |
4155 | * | |
4156 | * If we can figure out what to do | |
4157 | * (zero fill, copy on write, pmap enter) while holding | |
4158 | * the locks, then it gets done. Otherwise, we give up, | |
4159 | * and use the original fault path (which doesn't hold | |
4160 | * the map lock, and relies on busy pages). | |
4161 | * The give up cases include: | |
4162 | * - Have to talk to pager. | |
4163 | * - Page is busy, absent or in error. | |
4164 | * - Pager has locked out desired access. | |
4165 | * - Fault needs to be restarted. | |
4166 | * - Have to push page into copy object. | |
4167 | * | |
4168 | * The code is an infinite loop that moves one level down | |
4169 | * the shadow chain each time. cur_object and cur_offset | |
4170 | * refer to the current object being examined. object and offset | |
4171 | * are the original object from the map. The loop is at the | |
4172 | * top level if and only if object and cur_object are the same. | |
4173 | * | |
4174 | * Invariants: Map lock is held throughout. Lock is held on | |
4175 | * original object and cur_object (if different) when | |
4176 | * continuing or exiting loop. | |
4177 | * | |
4178 | */ | |
4179 | ||
4180 | #if defined(__arm64__) | |
4181 | /* | |
4182 | * Fail if reading an execute-only page in a | |
4183 | * pmap that enforces execute-only protection. | |
4184 | */ | |
4185 | if (fault_type == VM_PROT_READ && | |
4186 | (prot & VM_PROT_EXECUTE) && | |
4187 | !(prot & VM_PROT_READ) && | |
4188 | pmap_enforces_execute_only(pmap)) { | |
4189 | vm_object_unlock(object); | |
4190 | vm_map_unlock_read(map); | |
4191 | if (real_map != map) { | |
4192 | vm_map_unlock(real_map); | |
4193 | } | |
4194 | kr = KERN_PROTECTION_FAILURE; | |
4195 | goto done; | |
4196 | } | |
4197 | #endif | |
4198 | ||
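| /* | |
|  * Offset of the faulting sub-page within the native page; this is only | |
|  * non-zero for maps whose page size is smaller than PAGE_SIZE. | |
|  */ | |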
4199 | fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK); | |
4200 | ||
4201 | /* | |
4202 | * If this page is to be inserted in a copy delay object | |
4203 | * for writing, and if the object has a copy, then the | |
4204 | * copy delay strategy is implemented in the slow fault path (vm_fault_page()). | |
4205 | */ | |
4206 | if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && | |
4207 | object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) { | |
4208 | goto handle_copy_delay; | |
4209 | } | |
4210 | ||
4211 | cur_object = object; | |
4212 | cur_offset = offset; | |
4213 | ||
4214 | grab_options = 0; | |
4215 | #if CONFIG_SECLUDED_MEMORY | |
4216 | if (object->can_grab_secluded) { | |
4217 | grab_options |= VM_PAGE_GRAB_SECLUDED; | |
4218 | } | |
4219 | #endif /* CONFIG_SECLUDED_MEMORY */ | |
4220 | ||
4221 | while (TRUE) { | |
4222 | if (!cur_object->pager_created && | |
4223 | cur_object->phys_contiguous) { /* superpage */ | |
4224 | break; | |
4225 | } | |
4226 | ||
4227 | if (cur_object->blocked_access) { | |
4228 | /* | |
4229 | * Access to this VM object has been blocked. | |
4230 | * Let the slow path handle it. | |
4231 | */ | |
4232 | break; | |
4233 | } | |
4234 | ||
4235 | m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset)); | |
4236 | m_object = NULL; | |
4237 | ||
4238 | if (m != VM_PAGE_NULL) { | |
4239 | m_object = cur_object; | |
4240 | ||
4241 | if (m->vmp_busy) { | |
4242 | wait_result_t result; | |
4243 | ||
4244 | /* | |
4245 | * in order to do the PAGE_ASSERT_WAIT, we must | |
4246 | * have the object that 'm' belongs to locked exclusively | |
4247 | */ | |
4248 | if (object != cur_object) { | |
4249 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4250 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4251 | ||
4252 | if (vm_object_lock_upgrade(cur_object) == FALSE) { | |
4253 | /* | |
4254 | * couldn't upgrade so go do a full retry | |
4255 | * immediately since we can no longer be | |
4256 | * certain about cur_object (since we | |
4257 | * don't hold a reference on it)... | |
4258 | * first drop the top object lock | |
4259 | */ | |
4260 | vm_object_unlock(object); | |
4261 | ||
4262 | vm_map_unlock_read(map); | |
4263 | if (real_map != map) { | |
4264 | vm_map_unlock(real_map); | |
4265 | } | |
4266 | ||
4267 | goto RetryFault; | |
4268 | } | |
4269 | } | |
4270 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4271 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4272 | ||
4273 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4274 | /* | |
4275 | * couldn't upgrade, so explicitly take the lock | |
4276 | * exclusively and go relookup the page since we | |
4277 | * will have dropped the object lock and | |
4278 | * a different thread could have inserted | |
4279 | * a page at this offset | |
4280 | * no need for a full retry since we're | |
4281 | * at the top level of the object chain | |
4282 | */ | |
4283 | vm_object_lock(object); | |
4284 | ||
4285 | continue; | |
4286 | } | |
4287 | } | |
4288 | if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) { | |
4289 | /* | |
4290 | * m->vmp_busy == TRUE and the object is locked exclusively. | |
4291 | * If m->pageout_queue == TRUE after we acquire the | |
4292 | * queues lock, we are guaranteed that it is stable on | |
4293 | * the pageout queue and therefore reclaimable | |
4294 | * | |
4295 | * NOTE: this is only true for the internal pageout queue | |
4296 | * in the compressor world | |
4297 | */ | |
4298 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); | |
4299 | ||
4300 | vm_page_lock_queues(); | |
4301 | ||
4302 | if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { | |
4303 | vm_pageout_throttle_up(m); | |
4304 | vm_page_unlock_queues(); | |
4305 | ||
4306 | PAGE_WAKEUP_DONE(m); | |
4307 | goto reclaimed_from_pageout; | |
4308 | } | |
4309 | vm_page_unlock_queues(); | |
4310 | } | |
4311 | if (object != cur_object) { | |
4312 | vm_object_unlock(object); | |
4313 | } | |
4314 | ||
4315 | vm_map_unlock_read(map); | |
4316 | if (real_map != map) { | |
4317 | vm_map_unlock(real_map); | |
4318 | } | |
4319 | ||
4320 | result = PAGE_ASSERT_WAIT(m, interruptible); | |
4321 | ||
4322 | vm_object_unlock(cur_object); | |
4323 | ||
4324 | if (result == THREAD_WAITING) { | |
4325 | result = thread_block(THREAD_CONTINUE_NULL); | |
4326 | ||
4327 | counter(c_vm_fault_page_block_busy_kernel++); | |
4328 | } | |
4329 | if (result == THREAD_AWAKENED || result == THREAD_RESTART) { | |
4330 | goto RetryFault; | |
4331 | } | |
4332 | ||
4333 | kr = KERN_ABORTED; | |
4334 | goto done; | |
4335 | } | |
4336 | reclaimed_from_pageout: | |
4337 | if (m->vmp_laundry) { | |
4338 | if (object != cur_object) { | |
4339 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4340 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4341 | ||
4342 | vm_object_unlock(object); | |
4343 | vm_object_unlock(cur_object); | |
4344 | ||
4345 | vm_map_unlock_read(map); | |
4346 | if (real_map != map) { | |
4347 | vm_map_unlock(real_map); | |
4348 | } | |
4349 | ||
4350 | goto RetryFault; | |
4351 | } | |
4352 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4353 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4354 | ||
4355 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4356 | /* | |
4357 | * couldn't upgrade, so explicitly take the lock | |
4358 | * exclusively and go relookup the page since we | |
4359 | * will have dropped the object lock and | |
4360 | * a different thread could have inserted | |
4361 | * a page at this offset | |
4362 | * no need for a full retry since we're | |
4363 | * at the top level of the object chain | |
4364 | */ | |
4365 | vm_object_lock(object); | |
4366 | ||
4367 | continue; | |
4368 | } | |
4369 | } | |
4370 | vm_pageout_steal_laundry(m, FALSE); | |
4371 | } | |
4372 | ||
4373 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
4374 | /* | |
4375 | * Guard page: let the slow path deal with it | |
4376 | */ | |
4377 | break; | |
4378 | } | |
4379 | if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) { | |
4380 | /* | |
4381 | * Unusual case... let the slow path deal with it | |
4382 | */ | |
4383 | break; | |
4384 | } | |
4385 | if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) { | |
4386 | if (object != cur_object) { | |
4387 | vm_object_unlock(object); | |
4388 | } | |
4389 | vm_map_unlock_read(map); | |
4390 | if (real_map != map) { | |
4391 | vm_map_unlock(real_map); | |
4392 | } | |
4393 | vm_object_unlock(cur_object); | |
4394 | kr = KERN_MEMORY_ERROR; | |
4395 | goto done; | |
4396 | } | |
4397 | assert(m_object == VM_PAGE_OBJECT(m)); | |
4398 | ||
4399 | if (vm_fault_cs_need_validation(map->pmap, m, m_object, | |
4400 | PAGE_SIZE, 0) || | |
4401 | (physpage_p != NULL && (prot & VM_PROT_WRITE))) { | |
4402 | upgrade_lock_and_retry: | |
4403 | /* | |
4404 | * We might need to validate this page | |
4405 | * against its code signature, so we | |
4406 | * want to hold the VM object exclusively. | |
4407 | */ | |
4408 | if (object != cur_object) { | |
4409 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4410 | vm_object_unlock(object); | |
4411 | vm_object_unlock(cur_object); | |
4412 | ||
4413 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4414 | ||
4415 | vm_map_unlock_read(map); | |
4416 | if (real_map != map) { | |
4417 | vm_map_unlock(real_map); | |
4418 | } | |
4419 | ||
4420 | goto RetryFault; | |
4421 | } | |
4422 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4423 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4424 | ||
4425 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4426 | /* | |
4427 | * couldn't upgrade, so explicitly take the lock | |
4428 | * exclusively and go relookup the page since we | |
4429 | * will have dropped the object lock and | |
4430 | * a different thread could have inserted | |
4431 | * a page at this offset | |
4432 | * no need for a full retry since we're | |
4433 | * at the top level of the object chain | |
4434 | */ | |
4435 | vm_object_lock(object); | |
4436 | ||
4437 | continue; | |
4438 | } | |
4439 | } | |
4440 | } | |
4441 | /* | |
4442 | * Two cases of map in faults: | |
4443 | * - At top level w/o copy object. | |
4444 | * - Read fault anywhere. | |
4445 | * --> must disallow write. | |
4446 | */ | |
4447 | ||
4448 | if (object == cur_object && object->copy == VM_OBJECT_NULL) { | |
4449 | goto FastPmapEnter; | |
4450 | } | |
4451 | ||
4452 | if (!need_copy && | |
4453 | !fault_info.no_copy_on_read && | |
4454 | cur_object != object && | |
4455 | !cur_object->internal && | |
4456 | !cur_object->pager_trusted && | |
4457 | vm_protect_privileged_from_untrusted && | |
4458 | !((prot & VM_PROT_EXECUTE) && | |
4459 | cur_object->code_signed && | |
4460 | pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) && | |
4461 | current_proc_is_privileged()) { | |
4462 | /* | |
4463 | * We're faulting on a page in "object" and | |
4464 | * went down the shadow chain to "cur_object" | |
4465 | * to find out that "cur_object"'s pager | |
4466 | * is not "trusted", i.e. we can not trust it | |
4467 | * to always return the same contents. | |
4468 | * Since the target is a "privileged" process, | |
4469 | * let's treat this as a copy-on-read fault, as | |
4470 | * if it was a copy-on-write fault. | |
4471 | * Once "object" gets a copy of this page, it | |
4472 | * won't have to rely on "cur_object" to | |
4473 | * provide the contents again. | |
4474 | * | |
4475 | * This is done by setting "need_copy" and | |
4476 | * retrying the fault from the top with the | |
4477 | * appropriate locking. | |
4478 | * | |
4479 | * Special case: if the mapping is executable | |
4480 | * and the untrusted object is code-signed and | |
4481 | * the process is "cs_enforced", we do not | |
4482 | * copy-on-read because that would break | |
4483 | * code-signing enforcement expectations (an | |
4484 | * executable page must belong to a code-signed | |
4485 | * object) and we can rely on code-signing | |
4486 | * to re-validate the page if it gets evicted | |
4487 | * and paged back in. | |
4488 | */ | |
4489 | // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset); | |
4490 | vm_copied_on_read++; | |
4491 | need_copy = TRUE; | |
4492 | ||
4493 | vm_object_unlock(object); | |
4494 | vm_object_unlock(cur_object); | |
4495 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4496 | vm_map_unlock_read(map); | |
4497 | if (real_map != map) { | |
4498 | vm_map_unlock(real_map); | |
4499 | } | |
4500 | goto RetryFault; | |
4501 | } | |
4502 | ||
4503 | if (!(fault_type & VM_PROT_WRITE) && !need_copy) { | |
4504 | if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) { | |
4505 | prot &= ~VM_PROT_WRITE; | |
4506 | } else { | |
4507 | /* | |
4508 | * For a protection that the pmap cares | |
4509 | * about, we must hand over the full | |
4510 | * set of protections (so that the pmap | |
4511 | * layer can apply any desired policy). | |
4512 | * This means that cs_bypass must be | |
4513 | * set, as this can force us to pass | |
4514 | * RWX. | |
4515 | */ | |
4516 | assert(fault_info.cs_bypass); | |
4517 | } | |
4518 | ||
4519 | if (object != cur_object) { | |
4520 | /* | |
4521 | * We still need to hold the top object | |
4522 | * lock here to prevent a race between | |
4523 | * a read fault (taking only "shared" | |
4524 | * locks) and a write fault (taking | |
4525 | * an "exclusive" lock on the top | |
4526 | * object). | |
4527 | * Otherwise, as soon as we release the | |
4528 | * top lock, the write fault could | |
4529 | * proceed and actually complete before | |
4530 | * the read fault, and the copied page's | |
4531 | * translation could then be overwritten | |
4532 | * by the read fault's translation for | |
4533 | * the original page. | |
4534 | * | |
4535 | * Let's just record what the top object | |
4536 | * is and we'll release it later. | |
4537 | */ | |
4538 | top_object = object; | |
4539 | ||
4540 | /* | |
4541 | * switch to the object that has the new page | |
4542 | */ | |
4543 | object = cur_object; | |
4544 | object_lock_type = cur_object_lock_type; | |
4545 | } | |
4546 | FastPmapEnter: | |
4547 | assert(m_object == VM_PAGE_OBJECT(m)); | |
4548 | ||
4549 | /* | |
4550 | * prepare for the pmap_enter... | |
4551 | * object and map are both locked | |
4552 | * m contains valid data | |
4553 | * object == m->vmp_object | |
4554 | * cur_object == NULL or it's been unlocked | |
4555 | * no paging references on either object or cur_object | |
4556 | */ | |
4557 | if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) { | |
4558 | need_retry_ptr = &need_retry; | |
4559 | } else { | |
4560 | need_retry_ptr = NULL; | |
4561 | } | |
4562 | ||
4563 | if (fault_page_size < PAGE_SIZE) { | |
4564 | DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot); | |
4565 | assertf((!(fault_phys_offset & FOURK_PAGE_MASK) && | |
4566 | fault_phys_offset < PAGE_SIZE), | |
4567 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
4568 | } else { | |
4569 | assertf(fault_phys_offset == 0, | |
4570 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
4571 | } | |
4572 | ||
4573 | if (caller_pmap) { | |
4574 | kr = vm_fault_enter(m, | |
4575 | caller_pmap, | |
4576 | caller_pmap_addr, | |
4577 | fault_page_size, | |
4578 | fault_phys_offset, | |
4579 | prot, | |
4580 | caller_prot, | |
4581 | wired, | |
4582 | change_wiring, | |
4583 | wire_tag, | |
4584 | &fault_info, | |
4585 | need_retry_ptr, | |
4586 | &type_of_fault); | |
4587 | } else { | |
4588 | kr = vm_fault_enter(m, | |
4589 | pmap, | |
4590 | vaddr, | |
4591 | fault_page_size, | |
4592 | fault_phys_offset, | |
4593 | prot, | |
4594 | caller_prot, | |
4595 | wired, | |
4596 | change_wiring, | |
4597 | wire_tag, | |
4598 | &fault_info, | |
4599 | need_retry_ptr, | |
4600 | &type_of_fault); | |
4601 | } | |
4602 | ||
4603 | vm_fault_complete( | |
4604 | map, | |
4605 | real_map, | |
4606 | object, | |
4607 | m_object, | |
4608 | m, | |
4609 | offset, | |
4610 | trace_real_vaddr, | |
4611 | &fault_info, | |
4612 | caller_prot, | |
4613 | real_vaddr, | |
4614 | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), | |
4615 | need_retry, | |
4616 | kr, | |
4617 | physpage_p, | |
4618 | prot, | |
4619 | top_object, | |
4620 | need_collapse, | |
4621 | cur_offset, | |
4622 | fault_type, | |
4623 | &written_on_object, | |
4624 | &written_on_pager, | |
4625 | &written_on_offset); | |
4626 | top_object = VM_OBJECT_NULL; | |
4627 | if (need_retry == TRUE) { | |
4628 | /* | |
4629 | * vm_fault_enter couldn't complete the PMAP_ENTER... | |
4630 | * at this point we don't hold any locks so it's safe | |
4631 | * to ask the pmap layer to expand the page table to | |
4632 | * accommodate this mapping... once expanded, we'll | |
4633 | * re-drive the fault which should result in vm_fault_enter | |
4634 | * being able to successfully enter the mapping this time around | |
4635 | */ | |
4636 | (void)pmap_enter_options( | |
4637 | pmap, vaddr, 0, 0, 0, 0, 0, | |
4638 | PMAP_OPTIONS_NOENTER, NULL); | |
4639 | ||
4640 | need_retry = FALSE; | |
4641 | goto RetryFault; | |
4642 | } | |
4643 | goto done; | |
4644 | } | |
4645 | /* | |
4646 | * COPY ON WRITE FAULT | |
4647 | */ | |
4648 | assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); | |
4649 | ||
4650 | /* | |
4651 | * If objects match, then | |
4652 | * object->copy must not be NULL (else control | |
4653 | * would be in previous code block), and we | |
4654 | * have a potential push into the copy object | |
4655 | * which we can't cope with here. | |
4656 | */ | |
4657 | if (cur_object == object) { | |
4658 | /* | |
4659 | * must take the slow path to | |
4660 | * deal with the copy push | |
4661 | */ | |
4662 | break; | |
4663 | } | |
4664 | ||
4665 | /* | |
4666 | * This is now a shadow based copy on write | |
4667 | * fault -- it requires a copy up the shadow | |
4668 | * chain. | |
4669 | */ | |
4670 | assert(m_object == VM_PAGE_OBJECT(m)); | |
4671 | ||
4672 | if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && | |
4673 | vm_fault_cs_need_validation(NULL, m, m_object, | |
4674 | PAGE_SIZE, 0)) { | |
4675 | goto upgrade_lock_and_retry; | |
4676 | } | |
4677 | ||
4678 | /* | |
4679 | * Allocate a page in the original top level | |
4680 | * object. Give up if allocate fails. Also | |
4681 | * need to remember current page, as it's the | |
4682 | * source of the copy. | |
4683 | * | |
4684 | * at this point we hold locks on both | |
4685 | * object and cur_object... no need to take | |
4686 | * paging refs or mark pages BUSY since | |
4687 | * we don't drop either object lock until | |
4688 | * the page has been copied and inserted | |
4689 | */ | |
4690 | cur_m = m; | |
4691 | m = vm_page_grab_options(grab_options); | |
4692 | m_object = NULL; | |
4693 | ||
4694 | if (m == VM_PAGE_NULL) { | |
4695 | /* | |
4696 | * no free page currently available... | |
4697 | * must take the slow path | |
4698 | */ | |
4699 | break; | |
4700 | } | |
4701 | /* | |
4702 | * Now do the copy, with both object locks still held. | |
4703 | * | |
4704 | * NOTE: This code holds the map lock across | |
4705 | * the page copy. | |
4706 | */ | |
4707 | vm_page_copy(cur_m, m); | |
4708 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
4709 | if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) { | |
4710 | DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset); | |
4711 | } | |
4712 | m_object = object; | |
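| /* | |
|  * The freshly copied page exists only in memory (no backing copy | |
|  * yet), so mark it dirty to make sure it gets pushed to the pager | |
|  * rather than silently reclaimed. | |
|  */ | |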
4713 | SET_PAGE_DIRTY(m, FALSE); | |
4714 | ||
4715 | /* | |
4716 | * Now cope with the source page and object | |
4717 | */ | |
4718 | if (object->ref_count > 1 && cur_m->vmp_pmapped) { | |
4719 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); | |
4720 | } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) { | |
4721 | /* | |
4722 | * We've copied the full 16K page but we're | |
4723 | * about to call vm_fault_enter() only for | |
4724 | * the 4K chunk we're faulting on. The other | |
4725 | * three 4K chunks in that page could still | |
4726 | * be pmapped in this pmap. | |
4727 | * Since the VM object layer thinks that the | |
4728 | * entire page has been dealt with and the | |
4729 | * original page might no longer be needed, | |
4730 | * it might collapse/bypass the original VM | |
4731 | * object and free its pages, which would be | |
4732 | * bad (and would trigger pmap_verify_free() | |
4733 | * assertions) if the other 4K chunks are still | |
4734 | * pmapped. | |
4735 | */ | |
4736 | /* | |
4737 | * XXX FBDP TODO4K: to be revisited | |
4738 | * Technically, we need to pmap_disconnect() | |
4739 | * only the target pmap's mappings for the 4K | |
4740 | * chunks of this 16K VM page. If other pmaps | |
4741 | * have PTEs on these chunks, that means that | |
4742 | * the associated VM map must have a reference | |
4743 | * on the VM object, so no need to worry about | |
4744 | * those. | |
4745 | * pmap_protect() for each 4K chunk would be | |
4746 | * better but we'd have to check which chunks | |
4747 | * are actually mapped before and after this | |
4748 | * one. | |
4749 | * A full-blown pmap_disconnect() is easier | |
4750 | * for now but not efficient. | |
4751 | */ | |
4752 | DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m)); | |
4753 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); | |
4754 | } | |
4755 | ||
4756 | if (cur_m->vmp_clustered) { | |
4757 | VM_PAGE_COUNT_AS_PAGEIN(cur_m); | |
4758 | VM_PAGE_CONSUME_CLUSTERED(cur_m); | |
4759 | vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior); | |
4760 | } | |
4761 | need_collapse = TRUE; | |
4762 | ||
4763 | if (!cur_object->internal && | |
4764 | cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) { | |
4765 | /* | |
4766 | * The object from which we've just | |
4767 | * copied a page is most probably backed | |
4768 | * by a vnode. We don't want to waste too | |
4769 | * much time trying to collapse the VM objects | |
4770 | * and create a bottleneck when several tasks | |
4771 | * map the same file. | |
4772 | */ | |
4773 | if (cur_object->copy == object) { | |
4774 | /* | |
4775 | * Shared mapping or no COW yet. | |
4776 | * We can never collapse a copy | |
4777 | * object into its backing object. | |
4778 | */ | |
4779 | need_collapse = FALSE; | |
4780 | } else if (cur_object->copy == object->shadow && | |
4781 | object->shadow->resident_page_count == 0) { | |
4782 | /* | |
4783 | * Shared mapping after a COW occurred. | |
4784 | */ | |
4785 | need_collapse = FALSE; | |
4786 | } | |
4787 | } | |
4788 | vm_object_unlock(cur_object); | |
4789 | ||
4790 | if (need_collapse == FALSE) { | |
4791 | vm_fault_collapse_skipped++; | |
4792 | } | |
4793 | vm_fault_collapse_total++; | |
4794 | ||
4795 | type_of_fault = DBG_COW_FAULT; | |
4796 | VM_STAT_INCR(cow_faults); | |
4797 | DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); | |
4798 | current_task()->cow_faults++; | |
4799 | ||
4800 | goto FastPmapEnter; | |
4801 | } else { | |
4802 | /* | |
4803 | * No page at cur_object, cur_offset... m == NULL | |
4804 | */ | |
4805 | if (cur_object->pager_created) { | |
4806 | vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; | |
4807 | ||
4808 | if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) { | |
4809 | int my_fault_type; | |
4810 | uint8_t c_flags = C_DONT_BLOCK; | |
4811 | bool insert_cur_object = FALSE; | |
4812 | ||
4813 | /* | |
4814 | * May have to talk to a pager... | |
4815 | * if so, take the slow path by | |
4816 | * doing a 'break' from the while (TRUE) loop | |
4817 | * | |
4818 | * external_state will only be set to VM_EXTERNAL_STATE_EXISTS | |
4819 | * if the compressor is active and the page exists there | |
4820 | */ | |
4821 | if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) { | |
4822 | break; | |
4823 | } | |
4824 | ||
4825 | if (map == kernel_map || real_map == kernel_map) { | |
4826 | /* | |
4827 | * can't call into the compressor with the kernel_map | |
4828 | * lock held, since the compressor may try to operate | |
4829 | * on the kernel map in order to return an empty c_segment | |
4830 | */ | |
4831 | break; | |
4832 | } | |
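| /* | |
|  * Decide how to pull the page out of the compressor: | |
|  * C_DONT_BLOCK asks the compressor not to block (if it would | |
|  * have to, the call fails and we take the slow path below), and | |
|  * C_KEEP preserves the compressed copy when we decompress | |
|  * straight into the top object for a write fault, since | |
|  * "cur_object" still logically owns the original data. | |
|  */ | |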
4833 | if (object != cur_object) { | |
4834 | if (fault_type & VM_PROT_WRITE) { | |
4835 | c_flags |= C_KEEP; | |
4836 | } else { | |
4837 | insert_cur_object = TRUE; | |
4838 | } | |
4839 | } | |
4840 | if (insert_cur_object == TRUE) { | |
4841 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4842 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4843 | ||
4844 | if (vm_object_lock_upgrade(cur_object) == FALSE) { | |
4845 | /* | |
4846 | * couldn't upgrade so go do a full retry | |
4847 | * immediately since we can no longer be | |
4848 | * certain about cur_object (since we | |
4849 | * don't hold a reference on it)... | |
4850 | * first drop the top object lock | |
4851 | */ | |
4852 | vm_object_unlock(object); | |
4853 | ||
4854 | vm_map_unlock_read(map); | |
4855 | if (real_map != map) { | |
4856 | vm_map_unlock(real_map); | |
4857 | } | |
4858 | ||
4859 | goto RetryFault; | |
4860 | } | |
4861 | } | |
4862 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4863 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4864 | ||
4865 | if (object != cur_object) { | |
4866 | /* | |
4867 | * we can't go for the upgrade on the top | |
4868 | * lock since the upgrade may block waiting | |
4869 | * for readers to drain... since we hold | |
4870 | * cur_object locked at this point, waiting | |
4871 | * for the readers to drain would represent | |
4872 | * a lock order inversion since the lock order | |
4873 | * for objects is the reference order in the | |
4874 | * shadow chain | |
4875 | */ | |
4876 | vm_object_unlock(object); | |
4877 | vm_object_unlock(cur_object); | |
4878 | ||
4879 | vm_map_unlock_read(map); | |
4880 | if (real_map != map) { | |
4881 | vm_map_unlock(real_map); | |
4882 | } | |
4883 | ||
4884 | goto RetryFault; | |
4885 | } | |
4886 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4887 | /* | |
4888 | * couldn't upgrade, so explicitly take the lock | |
4889 | * exclusively and go relookup the page since we | |
4890 | * will have dropped the object lock and | |
4891 | * a different thread could have inserted | |
4892 | * a page at this offset | |
4893 | * no need for a full retry since we're | |
4894 | * at the top level of the object chain | |
4895 | */ | |
4896 | vm_object_lock(object); | |
4897 | ||
4898 | continue; | |
4899 | } | |
4900 | } | |
4901 | m = vm_page_grab_options(grab_options); | |
4902 | m_object = NULL; | |
4903 | ||
4904 | if (m == VM_PAGE_NULL) { | |
4905 | /* | |
4906 | * no free page currently available... | |
4907 | * must take the slow path | |
4908 | */ | |
4909 | break; | |
4910 | } | |
4911 | ||
4912 | /* | |
4913 | * The object is and remains locked | |
4914 | * so no need to take a | |
4915 | * "paging_in_progress" reference. | |
4916 | */ | |
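| /* | |
|  * Work out whether "cur_object", whose pager's compressed-page | |
|  * count is about to change, is locked exclusively or only shared; | |
|  * vm_compressor_pager_count() needs to know in order to update | |
|  * its counts safely. | |
|  */ | |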
4917 | bool shared_lock; | |
4918 | if ((object == cur_object && | |
4919 | object_lock_type == OBJECT_LOCK_EXCLUSIVE) || | |
4920 | (object != cur_object && | |
4921 | cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) { | |
4922 | shared_lock = FALSE; | |
4923 | } else { | |
4924 | shared_lock = TRUE; | |
4925 | } | |
4926 | ||
4927 | kr = vm_compressor_pager_get( | |
4928 | cur_object->pager, | |
4929 | (vm_object_trunc_page(cur_offset) | |
4930 | + cur_object->paging_offset), | |
4931 | VM_PAGE_GET_PHYS_PAGE(m), | |
4932 | &my_fault_type, | |
4933 | c_flags, | |
4934 | &compressed_count_delta); | |
4935 | ||
4936 | vm_compressor_pager_count( | |
4937 | cur_object->pager, | |
4938 | compressed_count_delta, | |
4939 | shared_lock, | |
4940 | cur_object); | |
4941 | ||
4942 | if (kr != KERN_SUCCESS) { | |
4943 | vm_page_release(m, FALSE); | |
4944 | m = VM_PAGE_NULL; | |
4945 | } | |
4946 | /* | |
4947 | * If vm_compressor_pager_get() returns | |
4948 | * KERN_MEMORY_FAILURE, then the | |
4949 | * compressed data is permanently lost, | |
4950 | * so return this error immediately. | |
4951 | */ | |
4952 | if (kr == KERN_MEMORY_FAILURE) { | |
4953 | if (object != cur_object) { | |
4954 | vm_object_unlock(cur_object); | |
4955 | } | |
4956 | vm_object_unlock(object); | |
4957 | vm_map_unlock_read(map); | |
4958 | if (real_map != map) { | |
4959 | vm_map_unlock(real_map); | |
4960 | } | |
4961 | goto done; | |
4962 | } else if (kr != KERN_SUCCESS) { | |
4963 | break; | |
4964 | } | |
4965 | m->vmp_dirty = TRUE; | |
4966 | ||
4967 | /* | |
4968 | * If the object is purgeable, its | |
4969 | * owner's purgeable ledgers will be | |
4970 | * updated in vm_page_insert() but the | |
4971 | * page was also accounted for in a | |
4972 | * "compressed purgeable" ledger, so | |
4973 | * update that now. | |
4974 | */ | |
4975 | if (object != cur_object && | |
4976 | !insert_cur_object) { | |
4977 | /* | |
4978 | * We're not going to insert | |
4979 | * the decompressed page into | |
4980 | * the object it came from. | |
4981 | * | |
4982 | * We're dealing with a | |
4983 | * copy-on-write fault on | |
4984 | * "object". | |
4985 | * We're going to decompress | |
4986 | * the page directly into the | |
4987 | * target "object" while | |
4988 | * keeping the compressed | |
4989 | * page for "cur_object", so | |
4990 | * no ledger update in that | |
4991 | * case. | |
4992 | */ | |
4993 | } else if (((cur_object->purgable == | |
4994 | VM_PURGABLE_DENY) && | |
4995 | (!cur_object->vo_ledger_tag)) || | |
4996 | (cur_object->vo_owner == | |
4997 | NULL)) { | |
4998 | /* | |
4999 | * "cur_object" is not purgeable | |
5000 | * and is not ledger-tagged, or | |
5001 | * there's no owner for it, | |
5002 | * so no owner's ledgers to | |
5003 | * update. | |
5004 | */ | |
5005 | } else { | |
5006 | /* | |
5007 | * One less compressed | |
5008 | * purgeable/tagged page for | |
5009 | * cur_object's owner. | |
5010 | */ | |
5011 | vm_object_owner_compressed_update( | |
5012 | cur_object, | |
5013 | -1); | |
5014 | } | |
5015 | ||
5016 | if (insert_cur_object) { | |
5017 | vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset)); | |
5018 | m_object = cur_object; | |
5019 | } else { | |
5020 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
5021 | m_object = object; | |
5022 | } | |
5023 | ||
5024 | if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { | |
5025 | /* | |
5026 | * If the page is not cacheable, | |
5027 | * we can't let its contents | |
5028 | * linger in the data cache | |
5029 | * after the decompression. | |
5030 | */ | |
5031 | pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
5032 | } | |
5033 | ||
5034 | type_of_fault = my_fault_type; | |
5035 | ||
5036 | VM_STAT_DECOMPRESSIONS(); | |
5037 | ||
5038 | if (cur_object != object) { | |
5039 | if (insert_cur_object) { | |
5040 | top_object = object; | |
5041 | /* | |
5042 | * switch to the object that has the new page | |
5043 | */ | |
5044 | object = cur_object; | |
5045 | object_lock_type = cur_object_lock_type; | |
5046 | } else { | |
5047 | vm_object_unlock(cur_object); | |
5048 | cur_object = object; | |
5049 | } | |
5050 | } | |
5051 | goto FastPmapEnter; | |
5052 | } | |
5053 | /* | |
5054 | * existence map present and indicates | |
5055 | * that the pager doesn't have this page | |
5056 | */ | |
5057 | } | |
5058 | if (cur_object->shadow == VM_OBJECT_NULL || | |
5059 | resilient_media_retry) { | |
5060 | /* | |
5061 | * Zero fill fault. Page gets | |
5062 | * inserted into the original object. | |
5063 | */ | |
5064 | if (cur_object->shadow_severed || | |
5065 | VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) || | |
5066 | cur_object == compressor_object || | |
5067 | cur_object == kernel_object || | |
5068 | cur_object == vm_submap_object) { | |
5069 | if (object != cur_object) { | |
5070 | vm_object_unlock(cur_object); | |
5071 | } | |
5072 | vm_object_unlock(object); | |
5073 | ||
5074 | vm_map_unlock_read(map); | |
5075 | if (real_map != map) { | |
5076 | vm_map_unlock(real_map); | |
5077 | } | |
5078 | ||
5079 | kr = KERN_MEMORY_ERROR; | |
5080 | goto done; | |
5081 | } | |
5082 | if (cur_object != object) { | |
5083 | vm_object_unlock(cur_object); | |
5084 | ||
5085 | cur_object = object; | |
5086 | } | |
5087 | if (object_lock_type == OBJECT_LOCK_SHARED) { | |
5088 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
5089 | ||
5090 | if (vm_object_lock_upgrade(object) == FALSE) { | |
5091 | /* | |
5092 | * couldn't upgrade so do a full retry on the fault | |
5093 | * since we dropped the object lock which | |
5094 | * could allow another thread to insert | |
5095 | * a page at this offset | |
5096 | */ | |
5097 | vm_map_unlock_read(map); | |
5098 | if (real_map != map) { | |
5099 | vm_map_unlock(real_map); | |
5100 | } | |
5101 | ||
5102 | goto RetryFault; | |
5103 | } | |
5104 | } | |
5105 | if (!object->internal) { | |
5106 | panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object); | |
5107 | } | |
5108 | m = vm_page_alloc(object, vm_object_trunc_page(offset)); | |
5109 | m_object = NULL; | |
5110 | ||
5111 | if (m == VM_PAGE_NULL) { | |
5112 | /* | |
5113 | * no free page currently available... | |
5114 | * must take the slow path | |
5115 | */ | |
5116 | break; | |
5117 | } | |
5118 | m_object = object; | |
5119 | ||
5120 | /* | |
5121 | * Zeroing the page and entering it into the pmap | |
5122 | * represents a significant amount of the zero fill fault handler's work. | |
5123 | * | |
5124 | * To improve fault scalability, we'll drop the object lock, if it appears contended, | |
5125 | * now that we've inserted the page into the vm object. | |
5126 | * Before dropping the lock, we need to check protection bits and set the | |
5127 | * mapped bits on the page. Then we can mark the page busy, drop the lock, | |
5128 | * zero it, and do the pmap enter. We'll need to reacquire the lock | |
5129 | * to clear the busy bit and wake up any waiters. | |
5130 | */ | |
5131 | vm_fault_cs_clear(m); | |
5132 | m->vmp_pmapped = TRUE; | |
5133 | if (map->no_zero_fill) { | |
5134 | type_of_fault = DBG_NZF_PAGE_FAULT; | |
5135 | } else { | |
5136 | type_of_fault = DBG_ZERO_FILL_FAULT; | |
5137 | } | |
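| /* | |
|  * Pick the pmap the page will be entered into (the caller-provided | |
|  * pmap, if any, otherwise this map's pmap) and do the part of the | |
|  * enter that needs the object lock.  The zero fill and the actual | |
|  * pmap enter can then run without the object lock if the lock is | |
|  * contended. | |
|  */ | |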
5138 | { | |
5139 | pmap_t destination_pmap; | |
5140 | vm_map_offset_t destination_pmap_vaddr; | |
5141 | vm_prot_t enter_fault_type; | |
5142 | if (caller_pmap) { | |
5143 | destination_pmap = caller_pmap; | |
5144 | destination_pmap_vaddr = caller_pmap_addr; | |
5145 | } else { | |
5146 | destination_pmap = pmap; | |
5147 | destination_pmap_vaddr = vaddr; | |
5148 | } | |
5149 | if (change_wiring) { | |
5150 | enter_fault_type = VM_PROT_NONE; | |
5151 | } else { | |
5152 | enter_fault_type = caller_prot; | |
5153 | } | |
5154 | kr = vm_fault_enter_prepare(m, | |
5155 | destination_pmap, | |
5156 | destination_pmap_vaddr, | |
5157 | &prot, | |
5158 | caller_prot, | |
5159 | fault_page_size, | |
5160 | fault_phys_offset, | |
5161 | change_wiring, | |
5162 | enter_fault_type, | |
5163 | &fault_info, | |
5164 | &type_of_fault, | |
5165 | &page_needs_data_sync); | |
5166 | if (kr != KERN_SUCCESS) { | |
5167 | goto zero_fill_cleanup; | |
5168 | } | |
5169 | ||
5170 | if (object_is_contended) { | |
5171 | /* | |
5172 | * At this point the page is in the vm object, but not on a paging queue. | |
5173 | * Since it's accessible to another thread but its contents are invalid | |
5174 | * (it hasn't been zeroed) mark it busy before dropping the object lock. | |
5175 | */ | |
5176 | m->vmp_busy = TRUE; | |
5177 | vm_object_unlock(object); | |
5178 | } | |
5179 | if (type_of_fault == DBG_ZERO_FILL_FAULT) { | |
5180 | /* | |
5181 | * Now zero fill page... | |
5182 | * the page is probably going to | |
5183 | * be written soon, so don't bother | |
5184 | * to clear the modified bit | |
5185 | * | |
5186 | * NOTE: This code holds the map | |
5187 | * lock across the zero fill. | |
5188 | */ | |
5189 | vm_page_zero_fill(m); | |
5190 | VM_STAT_INCR(zero_fill_count); | |
5191 | DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); | |
5192 | } | |
5193 | if (page_needs_data_sync) { | |
5194 | pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
5195 | } | |
5196 | ||
5197 | if (top_object != VM_OBJECT_NULL) { | |
5198 | need_retry_ptr = &need_retry; | |
5199 | } else { | |
5200 | need_retry_ptr = NULL; | |
5201 | } | |
5202 | if (object_is_contended) { | |
5203 | kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr, | |
5204 | fault_page_size, fault_phys_offset, | |
5205 | m, &prot, caller_prot, enter_fault_type, wired, | |
5206 | fault_info.pmap_options, need_retry_ptr); | |
5207 | vm_object_lock(object); | |
5208 | } else { | |
5209 | kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr, | |
5210 | fault_page_size, fault_phys_offset, | |
5211 | m, &prot, caller_prot, enter_fault_type, wired, | |
5212 | fault_info.pmap_options, need_retry_ptr); | |
5213 | } | |
5214 | } | |
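| /* | |
|  * Common tail of the zero-fill fast path: throttle the page if | |
|  * dynamic paging is disabled, put it on the appropriate paging | |
|  * queue and wrap up the fault.  Also reached directly when | |
|  * vm_fault_enter_prepare() fails above. | |
|  */ | |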
5215 | zero_fill_cleanup: | |
5216 | if (!VM_DYNAMIC_PAGING_ENABLED() && | |
5217 | (object->purgable == VM_PURGABLE_DENY || | |
5218 | object->purgable == VM_PURGABLE_NONVOLATILE || | |
5219 | object->purgable == VM_PURGABLE_VOLATILE)) { | |
5220 | vm_page_lockspin_queues(); | |
5221 | if (!VM_DYNAMIC_PAGING_ENABLED()) { | |
5222 | vm_fault_enqueue_throttled_locked(m); | |
5223 | } | |
5224 | vm_page_unlock_queues(); | |
5225 | } | |
5226 | vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr); | |
5227 | ||
5228 | vm_fault_complete( | |
5229 | map, | |
5230 | real_map, | |
5231 | object, | |
5232 | m_object, | |
5233 | m, | |
5234 | offset, | |
5235 | trace_real_vaddr, | |
5236 | &fault_info, | |
5237 | caller_prot, | |
5238 | real_vaddr, | |
5239 | type_of_fault, | |
5240 | need_retry, | |
5241 | kr, | |
5242 | physpage_p, | |
5243 | prot, | |
5244 | top_object, | |
5245 | need_collapse, | |
5246 | cur_offset, | |
5247 | fault_type, | |
5248 | &written_on_object, | |
5249 | &written_on_pager, | |
5250 | &written_on_offset); | |
5251 | top_object = VM_OBJECT_NULL; | |
5252 | if (need_retry == TRUE) { | |
5253 | /* | |
5254 | * vm_fault_enter couldn't complete the PMAP_ENTER... | |
5255 | * at this point we don't hold any locks so it's safe | |
5256 | * to ask the pmap layer to expand the page table to | |
5257 | * accommodate this mapping... once expanded, we'll | |
5258 | * re-drive the fault which should result in vm_fault_enter | |
5259 | * being able to successfully enter the mapping this time around | |
5260 | */ | |
5261 | (void)pmap_enter_options( | |
5262 | pmap, vaddr, 0, 0, 0, 0, 0, | |
5263 | PMAP_OPTIONS_NOENTER, NULL); | |
5264 | ||
5265 | need_retry = FALSE; | |
5266 | goto RetryFault; | |
5267 | } | |
5268 | goto done; | |
5269 | } | |
5270 | /* | |
5271 | * On to the next level in the shadow chain | |
5272 | */ | |
5273 | cur_offset += cur_object->vo_shadow_offset; | |
5274 | new_object = cur_object->shadow; | |
5275 | fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset); | |
5276 | ||
5277 | /* | |
5278 | * take the new_object's lock with the indicated state | |
5279 | */ | |
5280 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
5281 | vm_object_lock_shared(new_object); | |
5282 | } else { | |
5283 | vm_object_lock(new_object); | |
5284 | } | |
5285 | ||
5286 | if (cur_object != object) { | |
5287 | vm_object_unlock(cur_object); | |
5288 | } | |
5289 | ||
5290 | cur_object = new_object; | |
5291 | ||
5292 | continue; | |
5293 | } | |
5294 | } | |
5295 | /* | |
5296 | * Cleanup from fast fault failure. Drop any object | |
5297 | * lock other than original and drop map lock. | |
5298 | */ | |
5299 | if (object != cur_object) { | |
5300 | vm_object_unlock(cur_object); | |
5301 | } | |
5302 | ||
5303 | /* | |
5304 | * must own the object lock exclusively at this point | |
5305 | */ | |
5306 | if (object_lock_type == OBJECT_LOCK_SHARED) { | |
5307 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
5308 | ||
5309 | if (vm_object_lock_upgrade(object) == FALSE) { | |
5310 | /* | |
5311 | * couldn't upgrade, so explicitly | |
5312 | * take the lock exclusively | |
5313 | * no need to retry the fault at this | |
5314 | * point since "vm_fault_page" will | |
5315 | * completely re-evaluate the state | |
5316 | */ | |
5317 | vm_object_lock(object); | |
5318 | } | |
5319 | } | |
5320 | ||
5321 | handle_copy_delay: | |
5322 | vm_map_unlock_read(map); | |
5323 | if (real_map != map) { | |
5324 | vm_map_unlock(real_map); | |
5325 | } | |
5326 | ||
5327 | if (__improbable(object == compressor_object || | |
5328 | object == kernel_object || | |
5329 | object == vm_submap_object)) { | |
5330 | /* | |
5331 | * These objects are explicitly managed and populated by the | |
5332 | * kernel. The virtual ranges backed by these objects should | |
5333 | * either have wired pages or "holes" that are not supposed to | |
5334 | * be accessed at all until they get explicitly populated. | |
5335 | * We should never have to resolve a fault on a mapping backed | |
5336 | * by one of these VM objects and providing a zero-filled page | |
5337 | * would be wrong here, so let's fail the fault and let the | |
5338 | * caller crash or recover. | |
5339 | */ | |
5340 | vm_object_unlock(object); | |
5341 | kr = KERN_MEMORY_ERROR; | |
5342 | goto done; | |
5343 | } | |
5344 | ||
5345 | assert(object != compressor_object); | |
5346 | assert(object != kernel_object); | |
5347 | assert(object != vm_submap_object); | |
5348 | ||
5349 | if (resilient_media_retry) { | |
5350 | /* | |
5351 | * We could get here if we failed to get a free page | |
5352 | * to zero-fill and had to take the slow path again. | |
5353 | * Reset our "recovery-from-failed-media" state. | |
5354 | */ | |
5355 | assert(resilient_media_object != VM_OBJECT_NULL); | |
5356 | assert(resilient_media_offset != (vm_object_offset_t)-1); | |
5357 | /* release our extra reference on failed object */ | |
5358 | // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); | |
5359 | vm_object_deallocate(resilient_media_object); | |
5360 | resilient_media_object = VM_OBJECT_NULL; | |
5361 | resilient_media_offset = (vm_object_offset_t)-1; | |
5362 | resilient_media_retry = FALSE; | |
5363 | } | |
5364 | ||
5365 | /* | |
5366 | * Make a reference to this object to | |
5367 | * prevent its disposal while we are messing with | |
5368 | * it. Once we have the reference, the map is free | |
5369 | * to be diddled. Since objects reference their | |
5370 | * shadows (and copies), they will stay around as well. | |
5371 | */ | |
5372 | vm_object_reference_locked(object); | |
5373 | vm_object_paging_begin(object); | |
5374 | ||
5375 | set_thread_pagein_error(cthread, 0); | |
5376 | error_code = 0; | |
5377 | ||
5378 | result_page = VM_PAGE_NULL; | |
5379 | kr = vm_fault_page(object, offset, fault_type, | |
5380 | (change_wiring && !wired), | |
5381 | FALSE, /* page not looked up */ | |
5382 | &prot, &result_page, &top_page, | |
5383 | &type_of_fault, | |
5384 | &error_code, map->no_zero_fill, | |
5385 | FALSE, &fault_info); | |
5386 | ||
5387 | /* | |
5388 | * if kr != VM_FAULT_SUCCESS, then the paging reference | |
5389 | * has been dropped and the object unlocked... the ref_count | |
5390 | * is still held | |
5391 | * | |
5392 | * if kr == VM_FAULT_SUCCESS, then the paging reference | |
5393 | * is still held along with the ref_count on the original object | |
5394 | * | |
5395 | * the object is returned locked with a paging reference | |
5396 | * | |
5397 | * if top_page != NULL, then it's BUSY and the | |
5398 | * object it belongs to has a paging reference | |
5399 | * but is returned unlocked | |
5400 | */ | |
5401 | if (kr != VM_FAULT_SUCCESS && | |
5402 | kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { | |
5403 | if (kr == VM_FAULT_MEMORY_ERROR && | |
5404 | fault_info.resilient_media) { | |
5405 | assertf(object->internal, "object %p", object); | |
5406 | /* | |
5407 | * This fault failed but the mapping was | |
5408 | * "media resilient", so we'll retry the fault in | |
5409 | * recovery mode to get a zero-filled page in the | |
5410 | * top object. | |
5411 | * Keep the reference on the failing object so | |
5412 | * that we can check that the mapping is still | |
5413 | * pointing to it when we retry the fault. | |
5414 | */ | |
5415 | // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page); | |
5416 | assert(!resilient_media_retry); /* no double retry */ | |
5417 | assert(resilient_media_object == VM_OBJECT_NULL); | |
5418 | assert(resilient_media_offset == (vm_object_offset_t)-1); | |
5419 | resilient_media_retry = TRUE; | |
5420 | resilient_media_object = object; | |
5421 | resilient_media_offset = offset; | |
5422 | // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset); | |
5423 | goto RetryFault; | |
5424 | } else { | |
5425 | /* | |
5426 | * we didn't succeed, lose the object reference | |
5427 | * immediately. | |
5428 | */ | |
5429 | vm_object_deallocate(object); | |
5430 | object = VM_OBJECT_NULL; /* no longer valid */ | |
5431 | } | |
5432 | ||
5433 | /* | |
5434 | * See why we failed, and take corrective action. | |
5435 | */ | |
5436 | switch (kr) { | |
5437 | case VM_FAULT_MEMORY_SHORTAGE: | |
5438 | if (vm_page_wait((change_wiring) ? | |
5439 | THREAD_UNINT : | |
5440 | THREAD_ABORTSAFE)) { | |
5441 | goto RetryFault; | |
5442 | } | |
5443 | OS_FALLTHROUGH; | |
5444 | case VM_FAULT_INTERRUPTED: | |
5445 | kr = KERN_ABORTED; | |
5446 | goto done; | |
5447 | case VM_FAULT_RETRY: | |
5448 | goto RetryFault; | |
5449 | case VM_FAULT_MEMORY_ERROR: | |
5450 | if (error_code) { | |
5451 | kr = error_code; | |
5452 | } else { | |
5453 | kr = KERN_MEMORY_ERROR; | |
5454 | } | |
5455 | goto done; | |
5456 | default: | |
5457 | panic("vm_fault: unexpected error 0x%x from " | |
5458 | "vm_fault_page()\n", kr); | |
5459 | } | |
5460 | } | |
5461 | m = result_page; | |
5462 | m_object = NULL; | |
5463 | ||
5464 | if (m != VM_PAGE_NULL) { | |
5465 | m_object = VM_PAGE_OBJECT(m); | |
5466 | assert((change_wiring && !wired) ? | |
5467 | (top_page == VM_PAGE_NULL) : | |
5468 | ((top_page == VM_PAGE_NULL) == (m_object == object))); | |
5469 | } | |
5470 | ||
5471 | /* | |
5472 | * What to do with the resulting page from vm_fault_page | |
5473 | * if it doesn't get entered into the physical map: | |
5474 | */ | |
5475 | #define RELEASE_PAGE(m) \ | |
5476 | MACRO_BEGIN \ | |
5477 | PAGE_WAKEUP_DONE(m); \ | |
5478 | if ( !VM_PAGE_PAGEABLE(m)) { \ | |
5479 | vm_page_lockspin_queues(); \ | |
5480 | if ( !VM_PAGE_PAGEABLE(m)) \ | |
5481 | vm_page_activate(m); \ | |
5482 | vm_page_unlock_queues(); \ | |
5483 | } \ | |
5484 | MACRO_END | |
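| /* | |
|  * In the error and retry paths below, this is used roughly as: | |
|  * | |
|  *	vm_object_lock(m_object); | |
|  *	RELEASE_PAGE(m); | |
|  *	vm_fault_cleanup(m_object, top_page); | |
|  * | |
|  * i.e. wake up anyone waiting on the busy page and make sure it | |
|  * ends up on a pageable queue before we bail out or retry. | |
|  */ | |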
5485 | ||
5486 | ||
5487 | object_locks_dropped = FALSE; | |
5488 | /* | |
5489 | * We must verify that the maps have not changed | |
5490 | * since our last lookup. vm_map_verify() needs the | |
5491 | * map lock (shared) but we are holding object locks. | |
5492 | * So we do a try_lock() first and, if that fails, we | |
5493 | * drop the object locks and go in for the map lock again. | |
5494 | */ | |
5495 | if (!vm_map_try_lock_read(original_map)) { | |
5496 | if (m != VM_PAGE_NULL) { | |
5497 | old_copy_object = m_object->copy; | |
5498 | vm_object_unlock(m_object); | |
5499 | } else { | |
5500 | old_copy_object = VM_OBJECT_NULL; | |
5501 | vm_object_unlock(object); | |
5502 | } | |
5503 | ||
5504 | object_locks_dropped = TRUE; | |
5505 | ||
5506 | vm_map_lock_read(original_map); | |
5507 | } | |
5508 | ||
5509 | if ((map != original_map) || !vm_map_verify(map, &version)) { | |
5510 | if (object_locks_dropped == FALSE) { | |
5511 | if (m != VM_PAGE_NULL) { | |
5512 | old_copy_object = m_object->copy; | |
5513 | vm_object_unlock(m_object); | |
5514 | } else { | |
5515 | old_copy_object = VM_OBJECT_NULL; | |
5516 | vm_object_unlock(object); | |
5517 | } | |
5518 | ||
5519 | object_locks_dropped = TRUE; | |
5520 | } | |
5521 | ||
5522 | /* | |
5523 | * no object locks are held at this point | |
5524 | */ | |
5525 | vm_object_t retry_object; | |
5526 | vm_object_offset_t retry_offset; | |
5527 | vm_prot_t retry_prot; | |
5528 | ||
5529 | /* | |
5530 | * To avoid trying to write_lock the map while another | |
5531 | * thread has it read_locked (in vm_map_pageable), we | |
5532 | * do not try for write permission. If the page is | |
5533 | * still writable, we will get write permission. If it | |
5534 | * is not, or has been marked needs_copy, we enter the | |
5535 | * mapping without write permission, and will merely | |
5536 | * take another fault. | |
5537 | */ | |
5538 | map = original_map; | |
5539 | ||
5540 | kr = vm_map_lookup_locked(&map, vaddr, | |
5541 | fault_type & ~VM_PROT_WRITE, | |
5542 | OBJECT_LOCK_EXCLUSIVE, &version, | |
5543 | &retry_object, &retry_offset, &retry_prot, | |
5544 | &wired, | |
5545 | &fault_info, | |
5546 | &real_map, | |
5547 | NULL); | |
5548 | pmap = real_map->pmap; | |
5549 | ||
5550 | if (kr != KERN_SUCCESS) { | |
5551 | vm_map_unlock_read(map); | |
5552 | ||
5553 | if (m != VM_PAGE_NULL) { | |
5554 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5555 | ||
5556 | /* | |
5557 | * retake the lock so that | |
5558 | * we can drop the paging reference | |
5559 | * in vm_fault_cleanup and do the | |
5560 | * PAGE_WAKEUP_DONE in RELEASE_PAGE | |
5561 | */ | |
5562 | vm_object_lock(m_object); | |
5563 | ||
5564 | RELEASE_PAGE(m); | |
5565 | ||
5566 | vm_fault_cleanup(m_object, top_page); | |
5567 | } else { | |
5568 | /* | |
5569 | * retake the lock so that | |
5570 | * we can drop the paging reference | |
5571 | * in vm_fault_cleanup | |
5572 | */ | |
5573 | vm_object_lock(object); | |
5574 | ||
5575 | vm_fault_cleanup(object, top_page); | |
5576 | } | |
5577 | vm_object_deallocate(object); | |
5578 | ||
5579 | goto done; | |
5580 | } | |
5581 | vm_object_unlock(retry_object); | |
5582 | ||
5583 | if ((retry_object != object) || (retry_offset != offset)) { | |
5584 | vm_map_unlock_read(map); | |
5585 | if (real_map != map) { | |
5586 | vm_map_unlock(real_map); | |
5587 | } | |
5588 | ||
5589 | if (m != VM_PAGE_NULL) { | |
5590 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5591 | ||
5592 | /* | |
5593 | * retake the lock so that | |
5594 | * we can drop the paging reference | |
5595 | * in vm_fault_cleanup and do the | |
5596 | * PAGE_WAKEUP_DONE in RELEASE_PAGE | |
5597 | */ | |
5598 | vm_object_lock(m_object); | |
5599 | ||
5600 | RELEASE_PAGE(m); | |
5601 | ||
5602 | vm_fault_cleanup(m_object, top_page); | |
5603 | } else { | |
5604 | /* | |
5605 | * retake the lock so that | |
5606 | * we can drop the paging reference | |
5607 | * in vm_fault_cleanup | |
5608 | */ | |
5609 | vm_object_lock(object); | |
5610 | ||
5611 | vm_fault_cleanup(object, top_page); | |
5612 | } | |
5613 | vm_object_deallocate(object); | |
5614 | ||
5615 | goto RetryFault; | |
5616 | } | |
5617 | /* | |
5618 | * Check whether the protection has changed or the object | |
5619 | * has been copied while we left the map unlocked. | |
5620 | */ | |
5621 | if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) { | |
5622 | /* If the pmap layer cares, pass the full set. */ | |
5623 | prot = retry_prot; | |
5624 | } else { | |
5625 | prot &= retry_prot; | |
5626 | } | |
5627 | } | |
5628 | ||
5629 | if (object_locks_dropped == TRUE) { | |
5630 | if (m != VM_PAGE_NULL) { | |
5631 | vm_object_lock(m_object); | |
5632 | ||
5633 | if (m_object->copy != old_copy_object) { | |
5634 | /* | |
5635 | * The copy object changed while the top-level object | |
5636 | * was unlocked, so take away write permission. | |
5637 | */ | |
5638 | assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)); | |
5639 | prot &= ~VM_PROT_WRITE; | |
5640 | } | |
5641 | } else { | |
5642 | vm_object_lock(object); | |
5643 | } | |
5644 | ||
5645 | object_locks_dropped = FALSE; | |
5646 | } | |
5647 | ||
5648 | if (!need_copy && | |
5649 | !fault_info.no_copy_on_read && | |
5650 | m != VM_PAGE_NULL && | |
5651 | VM_PAGE_OBJECT(m) != object && | |
5652 | !VM_PAGE_OBJECT(m)->pager_trusted && | |
5653 | vm_protect_privileged_from_untrusted && | |
5654 | !((prot & VM_PROT_EXECUTE) && | |
5655 | VM_PAGE_OBJECT(m)->code_signed && | |
5656 | pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) && | |
5657 | current_proc_is_privileged()) { | |
5658 | /* | |
5659 | * We found the page we want in an "untrusted" VM object | |
5660 | * down the shadow chain. Since the target is "privileged" | |
5661 | * we want to perform a copy-on-read of that page, so that the | |
5662 | * mapped object gets a stable copy and does not have to | |
5663 | * rely on the "untrusted" object to provide the same | |
5664 | * contents if the page gets reclaimed and has to be paged | |
5665 | * in again later on. | |
5666 | * | |
5667 | * Special case: if the mapping is executable and the untrusted | |
5668 | * object is code-signed and the process is "cs_enforced", we | |
5669 | * do not copy-on-read because that would break code-signing | |
5670 | * enforcement expectations (an executable page must belong | |
5671 | * to a code-signed object) and we can rely on code-signing | |
5672 | * to re-validate the page if it gets evicted and paged back in. | |
5673 | */ | |
5674 | // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset); | |
5675 | vm_copied_on_read++; | |
5676 | need_copy_on_read = TRUE; | |
5677 | need_copy = TRUE; | |
5678 | } else { | |
5679 | need_copy_on_read = FALSE; | |
5680 | } | |
5681 | ||
5682 | /* | |
5683 | * If we want to wire down this page, but no longer have | |
5684 | * adequate permissions, we must start all over. | |
5685 | * If we decided to copy-on-read, we must also start all over. | |
5686 | */ | |
5687 | if ((wired && (fault_type != (prot | VM_PROT_WRITE))) || | |
5688 | need_copy_on_read) { | |
5689 | vm_map_unlock_read(map); | |
5690 | if (real_map != map) { | |
5691 | vm_map_unlock(real_map); | |
5692 | } | |
5693 | ||
5694 | if (m != VM_PAGE_NULL) { | |
5695 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5696 | ||
5697 | RELEASE_PAGE(m); | |
5698 | ||
5699 | vm_fault_cleanup(m_object, top_page); | |
5700 | } else { | |
5701 | vm_fault_cleanup(object, top_page); | |
5702 | } | |
5703 | ||
5704 | vm_object_deallocate(object); | |
5705 | ||
5706 | goto RetryFault; | |
5707 | } | |
5708 | if (m != VM_PAGE_NULL) { | |
5709 | /* | |
5710 | * Put this page into the physical map. | |
5711 | * We had to do the unlock above because pmap_enter | |
5712 | * may cause other faults. The page may be on | |
5713 | * the pageout queues. If the pageout daemon comes | |
5714 | * across the page, it will remove it from the queues. | |
5715 | */ | |
5716 | if (fault_page_size < PAGE_SIZE) { | |
5717 | DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot); | |
5718 | assertf((!(fault_phys_offset & FOURK_PAGE_MASK) && | |
5719 | fault_phys_offset < PAGE_SIZE), | |
5720 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
5721 | } else { | |
5722 | assertf(fault_phys_offset == 0, | |
5723 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
5724 | } | |
5725 | if (caller_pmap) { | |
5726 | kr = vm_fault_enter(m, | |
5727 | caller_pmap, | |
5728 | caller_pmap_addr, | |
5729 | fault_page_size, | |
5730 | fault_phys_offset, | |
5731 | prot, | |
5732 | caller_prot, | |
5733 | wired, | |
5734 | change_wiring, | |
5735 | wire_tag, | |
5736 | &fault_info, | |
5737 | NULL, | |
5738 | &type_of_fault); | |
5739 | } else { | |
5740 | kr = vm_fault_enter(m, | |
5741 | pmap, | |
5742 | vaddr, | |
5743 | fault_page_size, | |
5744 | fault_phys_offset, | |
5745 | prot, | |
5746 | caller_prot, | |
5747 | wired, | |
5748 | change_wiring, | |
5749 | wire_tag, | |
5750 | &fault_info, | |
5751 | NULL, | |
5752 | &type_of_fault); | |
5753 | } | |
5754 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5755 | ||
5756 | { | |
5757 | int event_code = 0; | |
5758 | ||
5759 | if (m_object->internal) { | |
5760 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); | |
5761 | } else if (m_object->object_is_shared_cache) { | |
5762 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); | |
5763 | } else { | |
5764 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); | |
5765 | } | |
5766 | ||
5767 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0); | |
5768 | KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0); | |
5769 | ||
5770 | DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); | |
5771 | } | |
5772 | if (kr != KERN_SUCCESS) { | |
5773 | /* abort this page fault */ | |
5774 | vm_map_unlock_read(map); | |
5775 | if (real_map != map) { | |
5776 | vm_map_unlock(real_map); | |
5777 | } | |
5778 | PAGE_WAKEUP_DONE(m); | |
5779 | vm_fault_cleanup(m_object, top_page); | |
5780 | vm_object_deallocate(object); | |
5781 | goto done; | |
5782 | } | |
5783 | if (physpage_p != NULL) { | |
5784 | /* for vm_map_wire_and_extract() */ | |
5785 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); | |
5786 | if (prot & VM_PROT_WRITE) { | |
5787 | vm_object_lock_assert_exclusive(m_object); | |
5788 | m->vmp_dirty = TRUE; | |
5789 | } | |
5790 | } | |
5791 | } else { | |
5792 | vm_map_entry_t entry; | |
5793 | vm_map_offset_t laddr; | |
5794 | vm_map_offset_t ldelta, hdelta; | |
5795 | ||
5796 | /* | |
5797 | * do a pmap block mapping from the physical address | |
5798 | * in the object | |
5799 | */ | |
5800 | ||
5801 | if (real_map != map) { | |
5802 | vm_map_unlock(real_map); | |
5803 | } | |
5804 | ||
5805 | if (original_map != map) { | |
5806 | vm_map_unlock_read(map); | |
5807 | vm_map_lock_read(original_map); | |
5808 | map = original_map; | |
5809 | } | |
5810 | real_map = map; | |
5811 | ||
5812 | laddr = vaddr; | |
5813 | hdelta = 0xFFFFF000; | |
5814 | ldelta = 0xFFFFF000; | |
5815 | ||
5816 | while (vm_map_lookup_entry(map, laddr, &entry)) { | |
5817 | if (ldelta > (laddr - entry->vme_start)) { | |
5818 | ldelta = laddr - entry->vme_start; | |
5819 | } | |
5820 | if (hdelta > (entry->vme_end - laddr)) { | |
5821 | hdelta = entry->vme_end - laddr; | |
5822 | } | |
5823 | if (entry->is_sub_map) { | |
5824 | laddr = ((laddr - entry->vme_start) | |
5825 | + VME_OFFSET(entry)); | |
5826 | vm_map_lock_read(VME_SUBMAP(entry)); | |
5827 | ||
5828 | if (map != real_map) { | |
5829 | vm_map_unlock_read(map); | |
5830 | } | |
5831 | if (entry->use_pmap) { | |
5832 | vm_map_unlock_read(real_map); | |
5833 | real_map = VME_SUBMAP(entry); | |
5834 | } | |
5835 | map = VME_SUBMAP(entry); | |
5836 | } else { | |
5837 | break; | |
5838 | } | |
5839 | } | |
5840 | ||
5841 | if (vm_map_lookup_entry(map, laddr, &entry) && | |
5842 | (VME_OBJECT(entry) != NULL) && | |
5843 | (VME_OBJECT(entry) == object)) { | |
5844 | uint16_t superpage; | |
5845 | ||
5846 | if (!object->pager_created && | |
5847 | object->phys_contiguous && | |
5848 | VME_OFFSET(entry) == 0 && | |
5849 | (entry->vme_end - entry->vme_start == object->vo_size) && | |
5850 | VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) { | |
5851 | superpage = VM_MEM_SUPERPAGE; | |
5852 | } else { | |
5853 | superpage = 0; | |
5854 | } | |
5855 | ||
5856 | if (superpage && physpage_p) { | |
5857 | /* for vm_map_wire_and_extract() */ | |
5858 | *physpage_p = (ppnum_t) | |
5859 | ((((vm_map_offset_t) | |
5860 | object->vo_shadow_offset) | |
5861 | + VME_OFFSET(entry) | |
5862 | + (laddr - entry->vme_start)) | |
5863 | >> PAGE_SHIFT); | |
5864 | } | |
5865 | ||
5866 | if (caller_pmap) { | |
5867 | /* | |
5868 | * Set up a block mapped area | |
5869 | */ | |
5870 | assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT)); | |
5871 | kr = pmap_map_block(caller_pmap, | |
5872 | (addr64_t)(caller_pmap_addr - ldelta), | |
5873 | (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) + | |
5874 | VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT), | |
5875 | (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot, | |
5876 | (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); | |
5877 | ||
5878 | if (kr != KERN_SUCCESS) { | |
5879 | goto cleanup; | |
5880 | } | |
5881 | } else { | |
5882 | /* | |
5883 | * Set up a block mapped area | |
5884 | */ | |
5885 | assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT)); | |
5886 | kr = pmap_map_block(real_map->pmap, | |
5887 | (addr64_t)(vaddr - ldelta), | |
5888 | (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) + | |
5889 | VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT), | |
5890 | (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot, | |
5891 | (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); | |
5892 | ||
5893 | if (kr != KERN_SUCCESS) { | |
5894 | goto cleanup; | |
5895 | } | |
5896 | } | |
5897 | } | |
5898 | } | |
5899 | ||
5900 | /* | |
5901 | * Success | |
5902 | */ | |
5903 | kr = KERN_SUCCESS; | |
5904 | ||
5905 | /* | |
5906 | * TODO: could most of the done cases just use cleanup? | |
5907 | */ | |
5908 | cleanup: | |
5909 | /* | |
5910 | * Unlock everything, and return | |
5911 | */ | |
5912 | vm_map_unlock_read(map); | |
5913 | if (real_map != map) { | |
5914 | vm_map_unlock(real_map); | |
5915 | } | |
5916 | ||
5917 | if (m != VM_PAGE_NULL) { | |
5918 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5919 | ||
5920 | if (!m_object->internal && (fault_type & VM_PROT_WRITE)) { | |
5921 | vm_object_paging_begin(m_object); | |
5922 | ||
5923 | assert(written_on_object == VM_OBJECT_NULL); | |
5924 | written_on_object = m_object; | |
5925 | written_on_pager = m_object->pager; | |
5926 | written_on_offset = m_object->paging_offset + m->vmp_offset; | |
5927 | } | |
5928 | PAGE_WAKEUP_DONE(m); | |
5929 | ||
5930 | vm_fault_cleanup(m_object, top_page); | |
5931 | } else { | |
5932 | vm_fault_cleanup(object, top_page); | |
5933 | } | |
5934 | ||
5935 | vm_object_deallocate(object); | |
5936 | ||
5937 | #undef RELEASE_PAGE | |
5938 | ||
5939 | done: | |
5940 | thread_interrupt_level(interruptible_state); | |
5941 | ||
5942 | if (resilient_media_object != VM_OBJECT_NULL) { | |
5943 | assert(resilient_media_retry); | |
5944 | assert(resilient_media_offset != (vm_object_offset_t)-1); | |
5945 | /* release extra reference on failed object */ | |
5946 | // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); | |
5947 | vm_object_deallocate(resilient_media_object); | |
5948 | resilient_media_object = VM_OBJECT_NULL; | |
5949 | resilient_media_offset = (vm_object_offset_t)-1; | |
5950 | resilient_media_retry = FALSE; | |
5951 | } | |
5952 | assert(!resilient_media_retry); | |
5953 | ||
5954 | /* | |
5955 | * Only I/O throttle on faults which cause a pagein/swapin. | |
5956 | */ | |
5957 | if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) { | |
5958 | throttle_lowpri_io(1); | |
5959 | } else { | |
5960 | if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) { | |
5961 | if ((throttle_delay = vm_page_throttled(TRUE))) { | |
5962 | if (vm_debug_events) { | |
5963 | if (type_of_fault == DBG_COMPRESSOR_FAULT) { | |
5964 | VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
5965 | } else if (type_of_fault == DBG_COW_FAULT) { | |
5966 | VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
5967 | } else { | |
5968 | VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
5969 | } | |
5970 | } | |
5971 | delay(throttle_delay); | |
5972 | } | |
5973 | } | |
5974 | } | |
5975 | ||
5976 | if (written_on_object) { | |
5977 | vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64); | |
5978 | ||
5979 | vm_object_lock(written_on_object); | |
5980 | vm_object_paging_end(written_on_object); | |
5981 | vm_object_unlock(written_on_object); | |
5982 | ||
5983 | written_on_object = VM_OBJECT_NULL; | |
5984 | } | |
5985 | ||
5986 | if (rtfault) { | |
5987 | vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault); | |
5988 | } | |
5989 | ||
5990 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, | |
5991 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, | |
5992 | ((uint64_t)trace_vaddr >> 32), | |
5993 | trace_vaddr, | |
5994 | kr, | |
5995 | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), | |
5996 | 0); | |
5997 | ||
5998 | if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) { | |
5999 | DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr); | |
6000 | } | |
6001 | ||
6002 | return kr; | |
6003 | } | |
6004 | ||
6005 | /* | |
6006 | * vm_fault_wire: | |
6007 | * | |
6008 | * Wire down a range of virtual addresses in a map. | |
6009 | */ | |
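| /* | |
|  * The wiring code in vm_map is expected to have marked the entry | |
|  * "in_transition" and to call us without holding the map lock for | |
|  * writing, since wiring each page may itself take map and object | |
|  * locks.  A rough sketch of a call (details vary by caller): | |
|  * | |
|  *	entry->in_transition = TRUE; | |
|  *	vm_map_unlock(map); | |
|  *	rc = vm_fault_wire(map, entry, prot, tag, | |
|  *	    map->pmap, entry->vme_start, NULL); | |
|  *	vm_map_lock(map); | |
|  */ | |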
6010 | kern_return_t | |
6011 | vm_fault_wire( | |
6012 | vm_map_t map, | |
6013 | vm_map_entry_t entry, | |
6014 | vm_prot_t prot, | |
6015 | vm_tag_t wire_tag, | |
6016 | pmap_t pmap, | |
6017 | vm_map_offset_t pmap_addr, | |
6018 | ppnum_t *physpage_p) | |
6019 | { | |
6020 | vm_map_offset_t va; | |
6021 | vm_map_offset_t end_addr = entry->vme_end; | |
6022 | kern_return_t rc; | |
6023 | vm_map_size_t effective_page_size; | |
6024 | ||
6025 | assert(entry->in_transition); | |
6026 | ||
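| /* | |
|  * Physically contiguous objects are wired by their very nature, | |
|  * so there is nothing to do for them here (and nothing for | |
|  * vm_fault_unwire() to undo later). | |
|  */ | |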
6027 | if ((VME_OBJECT(entry) != NULL) && | |
6028 | !entry->is_sub_map && | |
6029 | VME_OBJECT(entry)->phys_contiguous) { | |
6030 | return KERN_SUCCESS; | |
6031 | } | |
6032 | ||
6033 | /* | |
6034 | * Inform the physical mapping system that the | |
6035 | * range of addresses may not fault, so that | |
6036 | * page tables and such can be locked down as well. | |
6037 | */ | |
6038 | ||
6039 | pmap_pageable(pmap, pmap_addr, | |
6040 | pmap_addr + (end_addr - entry->vme_start), FALSE); | |
6041 | ||
6042 | /* | |
6043 | * We simulate a fault to get the page and enter it | |
6044 | * in the physical map. | |
6045 | */ | |
6046 | ||
6047 | effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE); | |
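| /* | |
|  * Wire one map page at a time; the map's page size may be smaller | |
|  * than the kernel's (e.g. 4K mappings on a 16K-page kernel), in | |
|  * which case each smaller chunk is faulted on separately.  Try the | |
|  * fast path first and fall back to a full vm_fault_internal() if | |
|  * it can't handle the page. | |
|  */ | |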
6048 | for (va = entry->vme_start; | |
6049 | va < end_addr; | |
6050 | va += effective_page_size) { | |
6051 | rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap, | |
6052 | pmap_addr + (va - entry->vme_start), | |
6053 | physpage_p); | |
6054 | if (rc != KERN_SUCCESS) { | |
6055 | rc = vm_fault_internal(map, va, prot, TRUE, wire_tag, | |
6056 | ((pmap == kernel_pmap) | |
6057 | ? THREAD_UNINT | |
6058 | : THREAD_ABORTSAFE), | |
6059 | pmap, | |
6060 | (pmap_addr + | |
6061 | (va - entry->vme_start)), | |
6062 | physpage_p); | |
6063 | DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL); | |
6064 | } | |
6065 | ||
6066 | if (rc != KERN_SUCCESS) { | |
6067 | struct vm_map_entry tmp_entry = *entry; | |
6068 | ||
6069 | /* unwire wired pages */ | |
6070 | tmp_entry.vme_end = va; | |
6071 | vm_fault_unwire(map, | |
6072 | &tmp_entry, FALSE, pmap, pmap_addr); | |
6073 | ||
6074 | return rc; | |
6075 | } | |
6076 | } | |
6077 | return KERN_SUCCESS; | |
6078 | } | |
6079 | ||
6080 | /* | |
6081 | * vm_fault_unwire: | |
6082 | * | |
6083 | * Unwire a range of virtual addresses in a map. | |
6084 | */ | |
6085 | void | |
6086 | vm_fault_unwire( | |
6087 | vm_map_t map, | |
6088 | vm_map_entry_t entry, | |
6089 | boolean_t deallocate, | |
6090 | pmap_t pmap, | |
6091 | vm_map_offset_t pmap_addr) | |
6092 | { | |
6093 | vm_map_offset_t va; | |
6094 | vm_map_offset_t end_addr = entry->vme_end; | |
6095 | vm_object_t object; | |
6096 | struct vm_object_fault_info fault_info = {}; | |
6097 | unsigned int unwired_pages; | |
6098 | vm_map_size_t effective_page_size; | |
6099 | ||
6100 | object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry); | |
6101 | ||
6102 | /* | |
6103 | * If it's marked phys_contiguous, then vm_fault_wire() didn't actually | |
6104 | * do anything since such memory is wired by default. So we don't have | |
6105 | * anything to undo here. | |
6106 | */ | |
6107 | ||
6108 | if (object != VM_OBJECT_NULL && object->phys_contiguous) { | |
6109 | return; | |
6110 | } | |
6111 | ||
6112 | fault_info.interruptible = THREAD_UNINT; | |
6113 | fault_info.behavior = entry->behavior; | |
6114 | fault_info.user_tag = VME_ALIAS(entry); | |
6115 | if (entry->iokit_acct || | |
6116 | (!entry->is_sub_map && !entry->use_pmap)) { | |
6117 | fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; | |
6118 | } | |
6119 | fault_info.lo_offset = VME_OFFSET(entry); | |
6120 | fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry); | |
6121 | fault_info.no_cache = entry->no_cache; | |
6122 | fault_info.stealth = TRUE; | |
6123 | ||
6124 | unwired_pages = 0; | |
6125 | ||
6126 | /* | |
6127 | * Since the pages are wired down, we must be able to | |
6128 | * get their mappings from the physical map system. | |
6129 | */ | |
6130 | ||
6131 | effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE); | |
6132 | for (va = entry->vme_start; | |
6133 | va < end_addr; | |
6134 | va += effective_page_size) { | |
6135 | if (object == VM_OBJECT_NULL) { | |
6136 | if (pmap) { | |
6137 | pmap_change_wiring(pmap, | |
6138 | pmap_addr + (va - entry->vme_start), FALSE); | |
6139 | } | |
6140 | (void) vm_fault(map, va, VM_PROT_NONE, | |
6141 | TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr); | |
6142 | } else { | |
6143 | vm_prot_t prot; | |
6144 | vm_page_t result_page; | |
6145 | vm_page_t top_page; | |
6146 | vm_object_t result_object; | |
6147 | vm_fault_return_t result; | |
6148 | ||
6149 | /* cap cluster size at maximum UPL size */ | |
6150 | upl_size_t cluster_size; | |
6151 | if (os_sub_overflow(end_addr, va, &cluster_size)) { | |
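| /* | |
|  * The remaining range doesn't fit in a upl_size_t: clamp to the | |
|  * largest page-aligned value (0 - PAGE_SIZE wraps around, since | |
|  * upl_size_t is unsigned). | |
|  */ | |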
6152 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; | |
6153 | } | |
6154 | fault_info.cluster_size = cluster_size; | |
6155 | ||
6156 | do { | |
6157 | prot = VM_PROT_NONE; | |
6158 | ||
6159 | vm_object_lock(object); | |
6160 | vm_object_paging_begin(object); | |
6161 | result_page = VM_PAGE_NULL; | |
6162 | result = vm_fault_page( | |
6163 | object, | |
6164 | (VME_OFFSET(entry) + | |
6165 | (va - entry->vme_start)), | |
6166 | VM_PROT_NONE, TRUE, | |
6167 | FALSE, /* page not looked up */ | |
6168 | &prot, &result_page, &top_page, | |
6169 | (int *)0, | |
6170 | NULL, map->no_zero_fill, | |
6171 | FALSE, &fault_info); | |
6172 | } while (result == VM_FAULT_RETRY); | |
6173 | ||
6174 | /* | |
6175 | * If this was a mapping to a file on a device that has been forcibly | |
6176 | * unmounted, then we won't get a page back from vm_fault_page(). Just | |
6177 | * move on to the next one in case the remaining pages are mapped from | |
6178 | * different objects. During a forced unmount, the object is terminated | |
6179 | * so the alive flag will be false if this happens. A forced unmount | |
6180 | * will occur when an external disk is unplugged before the user does an | |
6181 | * eject, so we don't want to panic in that situation. | |
6182 | */ | |
6183 | ||
6184 | if (result == VM_FAULT_MEMORY_ERROR && !object->alive) { | |
6185 | continue; | |
6186 | } | |
6187 | ||
6188 | if (result == VM_FAULT_MEMORY_ERROR && | |
6189 | object == kernel_object) { | |
6190 | /* | |
6191 | * This must have been allocated with | |
6192 | * KMA_KOBJECT and KMA_VAONLY and there's | |
6193 | * no physical page at this offset. | |
6194 | * We're done (no page to free). | |
6195 | */ | |
6196 | assert(deallocate); | |
6197 | continue; | |
6198 | } | |
6199 | ||
6200 | if (result != VM_FAULT_SUCCESS) { | |
6201 | panic("vm_fault_unwire: failure"); | |
6202 | } | |
6203 | ||
6204 | result_object = VM_PAGE_OBJECT(result_page); | |
6205 | ||
6206 | if (deallocate) { | |
6207 | assert(VM_PAGE_GET_PHYS_PAGE(result_page) != | |
6208 | vm_page_fictitious_addr); | |
6209 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page)); | |
6210 | if (VM_PAGE_WIRED(result_page)) { | |
6211 | unwired_pages++; | |
6212 | } | |
6213 | VM_PAGE_FREE(result_page); | |
6214 | } else { | |
6215 | if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) { | |
6216 | pmap_change_wiring(pmap, | |
6217 | pmap_addr + (va - entry->vme_start), FALSE); | |
6218 | } | |
6219 | ||
6220 | ||
6221 | if (VM_PAGE_WIRED(result_page)) { | |
6222 | vm_page_lockspin_queues(); | |
6223 | vm_page_unwire(result_page, TRUE); | |
6224 | vm_page_unlock_queues(); | |
6225 | unwired_pages++; | |
6226 | } | |
6227 | if (entry->zero_wired_pages) { | |
6228 | pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page)); | |
6229 | entry->zero_wired_pages = FALSE; | |
6230 | } | |
6231 | ||
6232 | PAGE_WAKEUP_DONE(result_page); | |
6233 | } | |
6234 | vm_fault_cleanup(result_object, top_page); | |
6235 | } | |
6236 | } | |
6237 | ||
6238 | /* | |
6239 | * Inform the physical mapping system that the range | |
6240 | * of addresses may fault, so that page tables and | |
6241 | * such may be unwired themselves. | |
6242 | */ | |
6243 | ||
6244 | pmap_pageable(pmap, pmap_addr, | |
6245 | pmap_addr + (end_addr - entry->vme_start), TRUE); | |
6246 | ||
6247 | if (kernel_object == object) { | |
6248 | /* | |
6249 | * We would like to make user_tag in vm_object_fault_info a | |
6250 | * vm_tag_t (unsigned short), but user_tag derives its value from | |
6251 | * VME_ALIAS(entry) in a few places, and VME_ALIAS, in turn, casts | |
6252 | * to an _unsigned int_, which non-fault_info paths rely on in | |
6253 | * many places throughout the code. | |
6254 | * | |
6255 | * So, for now, truncate explicitly to unsigned short (vm_tag_t). | |
6256 | */ | |
6257 | assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag, | |
6258 | "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK)); | |
6259 | vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages)); | |
6260 | } | |
6261 | } | |
6262 | ||
6263 | /* | |
6264 | * vm_fault_wire_fast: | |
6265 | * | |
6266 | * Handle common case of a wire down page fault at the given address. | |
6267 | * If successful, the page is inserted into the associated physical map. | |
6268 | * The map entry is passed in to avoid the overhead of a map lookup. | |
6269 | * | |
6270 | * NOTE: the given address should be truncated to the | |
6271 | * proper page address. | |
6272 | * | |
6273 | * KERN_SUCCESS is returned if the page fault is handled; otherwise, | |
6274 | * a standard error specifying why the fault is fatal is returned. | |
6275 | * | |
6276 | * The map in question must be referenced, and remains so. | |
6277 | * Caller has a read lock on the map. | |
6278 | * | |
6279 | * This is a stripped version of vm_fault() for wiring pages. Anything | |
6280 | * other than the common case will return KERN_FAILURE, and the caller | |
6281 | * is expected to call vm_fault(). | |
6282 | */ | |
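/*
 * Illustrative caller pattern (a hedged sketch only, modeled on the
 * vm_fault() call visible in vm_fault_unwire() above; the real wiring
 * loop in vm_fault_wire() may differ): try the fast path first and fall
 * back to the general fault path on failure, roughly:
 *
 *	kr = vm_fault_wire_fast(map, va, prot, wire_tag, entry,
 *	                        pmap, pmap_addr, physpage_p);
 *	if (kr != KERN_SUCCESS) {
 *		kr = vm_fault(map, va, prot, TRUE, wire_tag,
 *		              THREAD_UNINT, pmap, pmap_addr);
 *	}
 */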
6283 | static kern_return_t | |
6284 | vm_fault_wire_fast( | |
6285 | __unused vm_map_t map, | |
6286 | vm_map_offset_t va, | |
6287 | __unused vm_prot_t caller_prot, | |
6288 | vm_tag_t wire_tag, | |
6289 | vm_map_entry_t entry, | |
6290 | pmap_t pmap, | |
6291 | vm_map_offset_t pmap_addr, | |
6292 | ppnum_t *physpage_p) | |
6293 | { | |
6294 | vm_object_t object; | |
6295 | vm_object_offset_t offset; | |
6296 | vm_page_t m; | |
6297 | vm_prot_t prot; | |
6298 | thread_t thread = current_thread(); | |
6299 | int type_of_fault; | |
6300 | kern_return_t kr; | |
6301 | vm_map_size_t fault_page_size; | |
6302 | vm_map_offset_t fault_phys_offset; | |
6303 | struct vm_object_fault_info fault_info = {}; | |
6304 | ||
6305 | VM_STAT_INCR(faults); | |
6306 | ||
6307 | if (thread != THREAD_NULL && thread->task != TASK_NULL) { | |
6308 | thread->task->faults++; | |
6309 | } | |
6310 | ||
6311 | /* | |
6312 | * Recovery actions | |
6313 | */ | |
6314 | ||
6315 | #undef RELEASE_PAGE | |
6316 | #define RELEASE_PAGE(m) { \ | |
6317 | PAGE_WAKEUP_DONE(m); \ | |
6318 | vm_page_lockspin_queues(); \ | |
6319 | vm_page_unwire(m, TRUE); \ | |
6320 | vm_page_unlock_queues(); \ | |
6321 | } | |
6322 | ||
6323 | ||
6324 | #undef UNLOCK_THINGS | |
6325 | #define UNLOCK_THINGS { \ | |
6326 | vm_object_paging_end(object); \ | |
6327 | vm_object_unlock(object); \ | |
6328 | } | |
6329 | ||
6330 | #undef UNLOCK_AND_DEALLOCATE | |
6331 | #define UNLOCK_AND_DEALLOCATE { \ | |
6332 | UNLOCK_THINGS; \ | |
6333 | vm_object_deallocate(object); \ | |
6334 | } | |
6335 | /* | |
6336 | * Give up and have caller do things the hard way. | |
6337 | */ | |
6338 | ||
6339 | #define GIVE_UP { \ | |
6340 | UNLOCK_AND_DEALLOCATE; \ | |
6341 | return(KERN_FAILURE); \ | |
6342 | } | |
6343 | ||
6344 | ||
6345 | /* | |
6346 | * If this entry is not directly to a vm_object, bail out. | |
6347 | */ | |
6348 | if (entry->is_sub_map) { | |
6349 | assert(physpage_p == NULL); | |
6350 | return KERN_FAILURE; | |
6351 | } | |
6352 | ||
6353 | /* | |
6354 | * Find the backing store object and offset into it. | |
6355 | */ | |
6356 | ||
6357 | object = VME_OBJECT(entry); | |
6358 | offset = (va - entry->vme_start) + VME_OFFSET(entry); | |
6359 | prot = entry->protection; | |
6360 | ||
6361 | /* | |
6362 | * Make a reference to this object to prevent its | |
6363 | * disposal while we are messing with it. | |
6364 | */ | |
6365 | ||
6366 | vm_object_lock(object); | |
6367 | vm_object_reference_locked(object); | |
6368 | vm_object_paging_begin(object); | |
6369 | ||
6370 | /* | |
6371 | * INVARIANTS (through entire routine): | |
6372 | * | |
6373 | * 1) At all times, we must either have the object | |
6374 | * lock or a busy page in some object to prevent | |
6375 | * some other thread from trying to bring in | |
6376 | * the same page. | |
6377 | * | |
6378 | * 2) Once we have a busy page, we must remove it from | |
6379 | * the pageout queues, so that the pageout daemon | |
6380 | * will not grab it away. | |
6381 | * | |
6382 | */ | |
6383 | ||
6384 | /* | |
6385 | * Look for page in top-level object. If it's not there or | |
6386 | * there's something going on, give up. | |
6387 | */ | |
6388 | m = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
6389 | if ((m == VM_PAGE_NULL) || (m->vmp_busy) || | |
6390 | (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) { | |
6391 | GIVE_UP; | |
6392 | } | |
6393 | if (m->vmp_fictitious && | |
6394 | VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
6395 | /* | |
6396 | * Guard pages are fictitious pages and are never | |
6397 | * entered into a pmap, so let's say this one has been wired... | |
6398 | */ | |
6399 | kr = KERN_SUCCESS; | |
6400 | goto done; | |
6401 | } | |
6402 | ||
6403 | /* | |
6404 | * Wire the page down now. All bail outs beyond this | |
6405 | * point must unwire the page. | |
6406 | */ | |
6407 | ||
6408 | vm_page_lockspin_queues(); | |
6409 | vm_page_wire(m, wire_tag, TRUE); | |
6410 | vm_page_unlock_queues(); | |
6411 | ||
6412 | /* | |
6413 | * Mark page busy for other threads. | |
6414 | */ | |
6415 | assert(!m->vmp_busy); | |
6416 | m->vmp_busy = TRUE; | |
6417 | assert(!m->vmp_absent); | |
6418 | ||
6419 | /* | |
6420 | * Give up if the page is being written and there's a copy object | |
6421 | */ | |
6422 | if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { | |
6423 | RELEASE_PAGE(m); | |
6424 | GIVE_UP; | |
6425 | } | |
6426 | ||
6427 | fault_info.user_tag = VME_ALIAS(entry); | |
6428 | fault_info.pmap_options = 0; | |
6429 | if (entry->iokit_acct || | |
6430 | (!entry->is_sub_map && !entry->use_pmap)) { | |
6431 | fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; | |
6432 | } | |
6433 | ||
6434 | fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE); | |
6435 | fault_phys_offset = offset - vm_object_trunc_page(offset); | |
6436 | ||
6437 | /* | |
6438 | * Put this page into the physical map. | |
6439 | */ | |
6440 | type_of_fault = DBG_CACHE_HIT_FAULT; | |
6441 | kr = vm_fault_enter(m, | |
6442 | pmap, | |
6443 | pmap_addr, | |
6444 | fault_page_size, | |
6445 | fault_phys_offset, | |
6446 | prot, | |
6447 | prot, | |
6448 | TRUE, /* wired */ | |
6449 | FALSE, /* change_wiring */ | |
6450 | wire_tag, | |
6451 | &fault_info, | |
6452 | NULL, | |
6453 | &type_of_fault); | |
6454 | if (kr != KERN_SUCCESS) { | |
6455 | RELEASE_PAGE(m); | |
6456 | GIVE_UP; | |
6457 | } | |
6458 | ||
6459 | done: | |
6460 | /* | |
6461 | * Unlock everything, and return | |
6462 | */ | |
6463 | ||
6464 | if (physpage_p) { | |
6465 | /* for vm_map_wire_and_extract() */ | |
6466 | if (kr == KERN_SUCCESS) { | |
6467 | assert(object == VM_PAGE_OBJECT(m)); | |
6468 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); | |
6469 | if (prot & VM_PROT_WRITE) { | |
6470 | vm_object_lock_assert_exclusive(object); | |
6471 | m->vmp_dirty = TRUE; | |
6472 | } | |
6473 | } else { | |
6474 | *physpage_p = 0; | |
6475 | } | |
6476 | } | |
6477 | ||
6478 | PAGE_WAKEUP_DONE(m); | |
6479 | UNLOCK_AND_DEALLOCATE; | |
6480 | ||
6481 | return kr; | |
6482 | } | |
6483 | ||
6484 | /* | |
6485 | * Routine: vm_fault_copy_cleanup | |
6486 | * Purpose: | |
6487 | * Release a page used by vm_fault_copy. | |
6488 | */ | |
6489 | ||
6490 | static void | |
6491 | vm_fault_copy_cleanup( | |
6492 | vm_page_t page, | |
6493 | vm_page_t top_page) | |
6494 | { | |
6495 | vm_object_t object = VM_PAGE_OBJECT(page); | |
6496 | ||
6497 | vm_object_lock(object); | |
6498 | PAGE_WAKEUP_DONE(page); | |
6499 | if (!VM_PAGE_PAGEABLE(page)) { | |
6500 | vm_page_lockspin_queues(); | |
6501 | if (!VM_PAGE_PAGEABLE(page)) { | |
6502 | vm_page_activate(page); | |
6503 | } | |
6504 | vm_page_unlock_queues(); | |
6505 | } | |
6506 | vm_fault_cleanup(object, top_page); | |
6507 | } | |
6508 | ||
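/*
 *	Routine:	vm_fault_copy_dst_cleanup
 *	Purpose:
 *		Release a destination page that vm_fault_copy wired down,
 *		and drop the paging reference held on its object.
 */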
6509 | static void | |
6510 | vm_fault_copy_dst_cleanup( | |
6511 | vm_page_t page) | |
6512 | { | |
6513 | vm_object_t object; | |
6514 | ||
6515 | if (page != VM_PAGE_NULL) { | |
6516 | object = VM_PAGE_OBJECT(page); | |
6517 | vm_object_lock(object); | |
6518 | vm_page_lockspin_queues(); | |
6519 | vm_page_unwire(page, TRUE); | |
6520 | vm_page_unlock_queues(); | |
6521 | vm_object_paging_end(object); | |
6522 | vm_object_unlock(object); | |
6523 | } | |
6524 | } | |
6525 | ||
6526 | /* | |
6527 | * Routine: vm_fault_copy | |
6528 | * | |
6529 | * Purpose: | |
6530 | * Copy pages from one virtual memory object to another -- | |
6531 | * neither the source nor destination pages need be resident. | |
6532 | * | |
6533 | * Before actually copying a page, the version associated with | |
6534 | * the destination address map will be verified. | |
6535 | * | |
6536 | * In/out conditions: | |
6537 | * The caller must hold a reference, but not a lock, to | |
6538 | * each of the source and destination objects and to the | |
6539 | * destination map. | |
6540 | * | |
6541 | * Results: | |
6542 | * Returns KERN_SUCCESS if no errors were encountered in | |
6543 | * reading or writing the data. Returns KERN_INTERRUPTED if | |
6544 | * the operation was interrupted (only possible if the | |
6545 | * "interruptible" argument is asserted). Other return values | |
6546 | * indicate a permanent error in copying the data. | |
6547 | * | |
6548 | * The actual amount of data copied will be returned in the | |
6549 | * "copy_size" argument. In the event that the destination map | |
6550 | * verification failed, this amount may be less than the amount | |
6551 | * requested. | |
6552 | */ | |
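/*
 * Hedged usage sketch (hypothetical caller, not taken from this file):
 * because "copy_size" is in/out, a caller that needs the whole range
 * copied would re-verify the destination map and retry the shortfall,
 * roughly:
 *
 *	vm_map_size_t size = len;
 *	kr = vm_fault_copy(src_object, src_offset, &size,
 *	                   dst_object, dst_offset, dst_map,
 *	                   &version, THREAD_UNINT);
 *	if (kr == KERN_SUCCESS && size < len) {
 *		... re-lookup dst_map to refresh "version", then
 *		    retry the remaining (len - size) bytes ...
 *	}
 */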
6553 | kern_return_t | |
6554 | vm_fault_copy( | |
6555 | vm_object_t src_object, | |
6556 | vm_object_offset_t src_offset, | |
6557 | vm_map_size_t *copy_size, /* INOUT */ | |
6558 | vm_object_t dst_object, | |
6559 | vm_object_offset_t dst_offset, | |
6560 | vm_map_t dst_map, | |
6561 | vm_map_version_t *dst_version, | |
6562 | int interruptible) | |
6563 | { | |
6564 | vm_page_t result_page; | |
6565 | ||
6566 | vm_page_t src_page; | |
6567 | vm_page_t src_top_page; | |
6568 | vm_prot_t src_prot; | |
6569 | ||
6570 | vm_page_t dst_page; | |
6571 | vm_page_t dst_top_page; | |
6572 | vm_prot_t dst_prot; | |
6573 | ||
6574 | vm_map_size_t amount_left; | |
6575 | vm_object_t old_copy_object; | |
6576 | vm_object_t result_page_object = NULL; | |
6577 | kern_return_t error = 0; | |
6578 | vm_fault_return_t result; | |
6579 | ||
6580 | vm_map_size_t part_size; | |
6581 | struct vm_object_fault_info fault_info_src = {}; | |
6582 | struct vm_object_fault_info fault_info_dst = {}; | |
6583 | ||
6584 | /* | |
6585 | * In order not to confuse the clustered pageins, align | |
6586 | * the different offsets on a page boundary. | |
6587 | */ | |
6588 | ||
6589 | #define RETURN(x) \ | |
6590 | MACRO_BEGIN \ | |
6591 | *copy_size -= amount_left; \ | |
6592 | MACRO_RETURN(x); \ | |
6593 | MACRO_END | |
6594 | ||
6595 | amount_left = *copy_size; | |
6596 | ||
6597 | fault_info_src.interruptible = interruptible; | |
6598 | fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; | |
6599 | fault_info_src.lo_offset = vm_object_trunc_page(src_offset); | |
6600 | fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; | |
6601 | fault_info_src.stealth = TRUE; | |
6602 | ||
6603 | fault_info_dst.interruptible = interruptible; | |
6604 | fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; | |
6605 | fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); | |
6606 | fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; | |
6607 | fault_info_dst.stealth = TRUE; | |
6608 | ||
6609 | do { /* while (amount_left > 0) */ | |
6610 | /* | |
6611 | * There may be a deadlock if both source and destination | |
6612 | * pages are the same. To avoid this deadlock, the copy must | |
6613 | * start by getting the destination page in order to apply | |
6614 | * COW semantics if any. | |
6615 | */ | |
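/*
 * Concretely: the destination page is faulted in and wired first and
 * its "busy" bit dropped; only then is the source looked up. If the
 * source turns out to be the very same page, the lookup below simply
 * finds the already-resident dst_page (see the "src_page == dst_page"
 * check) and no second vm_fault_page() call is issued for it.
 */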
6616 | ||
6617 | RetryDestinationFault:; | |
6618 | ||
6619 | dst_prot = VM_PROT_WRITE | VM_PROT_READ; | |
6620 | ||
6621 | vm_object_lock(dst_object); | |
6622 | vm_object_paging_begin(dst_object); | |
6623 | ||
6624 | /* cap cluster size at maximum UPL size */ | |
6625 | upl_size_t cluster_size; | |
6626 | if (os_convert_overflow(amount_left, &cluster_size)) { | |
6627 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; | |
6628 | } | |
6629 | fault_info_dst.cluster_size = cluster_size; | |
6630 | ||
6631 | dst_page = VM_PAGE_NULL; | |
6632 | result = vm_fault_page(dst_object, | |
6633 | vm_object_trunc_page(dst_offset), | |
6634 | VM_PROT_WRITE | VM_PROT_READ, | |
6635 | FALSE, | |
6636 | FALSE, /* page not looked up */ | |
6637 | &dst_prot, &dst_page, &dst_top_page, | |
6638 | (int *)0, | |
6639 | &error, | |
6640 | dst_map->no_zero_fill, | |
6641 | FALSE, &fault_info_dst); | |
6642 | switch (result) { | |
6643 | case VM_FAULT_SUCCESS: | |
6644 | break; | |
6645 | case VM_FAULT_RETRY: | |
6646 | goto RetryDestinationFault; | |
6647 | case VM_FAULT_MEMORY_SHORTAGE: | |
6648 | if (vm_page_wait(interruptible)) { | |
6649 | goto RetryDestinationFault; | |
6650 | } | |
6651 | OS_FALLTHROUGH; | |
6652 | case VM_FAULT_INTERRUPTED: | |
6653 | RETURN(MACH_SEND_INTERRUPTED); | |
6654 | case VM_FAULT_SUCCESS_NO_VM_PAGE: | |
6655 | /* success but no VM page: fail the copy */ | |
6656 | vm_object_paging_end(dst_object); | |
6657 | vm_object_unlock(dst_object); | |
6658 | OS_FALLTHROUGH; | |
6659 | case VM_FAULT_MEMORY_ERROR: | |
6660 | if (error) { | |
6661 | return error; | |
6662 | } else { | |
6663 | return KERN_MEMORY_ERROR; | |
6664 | } | |
6665 | default: | |
6666 | panic("vm_fault_copy: unexpected error 0x%x from " | |
6667 | "vm_fault_page()\n", result); | |
6668 | } | |
6669 | assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); | |
6670 | ||
6671 | assert(dst_object == VM_PAGE_OBJECT(dst_page)); | |
6672 | old_copy_object = dst_object->copy; | |
6673 | ||
6674 | /* | |
6675 | * There exists the possibility that the source and | |
6676 | * destination page are the same. But we can't | |
6677 | * easily determine that now. If they are the | |
6678 | * same, the call to vm_fault_page() for the | |
6679 | * destination page will deadlock. To prevent this we | |
6680 | * wire the page so we can drop busy without having | |
6681 | * the page daemon steal the page. We clean up the | |
6682 | * top page but keep the paging reference on the object | |
6683 | * holding the dest page so it doesn't go away. | |
6684 | */ | |
6685 | ||
6686 | vm_page_lockspin_queues(); | |
6687 | vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE); | |
6688 | vm_page_unlock_queues(); | |
6689 | PAGE_WAKEUP_DONE(dst_page); | |
6690 | vm_object_unlock(dst_object); | |
6691 | ||
6692 | if (dst_top_page != VM_PAGE_NULL) { | |
6693 | vm_object_lock(dst_object); | |
6694 | VM_PAGE_FREE(dst_top_page); | |
6695 | vm_object_paging_end(dst_object); | |
6696 | vm_object_unlock(dst_object); | |
6697 | } | |
6698 | ||
6699 | RetrySourceFault:; | |
6700 | ||
6701 | if (src_object == VM_OBJECT_NULL) { | |
6702 | /* | |
6703 | * No source object. We will just | |
6704 | * zero-fill the page in dst_object. | |
6705 | */ | |
6706 | src_page = VM_PAGE_NULL; | |
6707 | result_page = VM_PAGE_NULL; | |
6708 | } else { | |
6709 | vm_object_lock(src_object); | |
6710 | src_page = vm_page_lookup(src_object, | |
6711 | vm_object_trunc_page(src_offset)); | |
6712 | if (src_page == dst_page) { | |
6713 | src_prot = dst_prot; | |
6714 | result_page = VM_PAGE_NULL; | |
6715 | } else { | |
6716 | src_prot = VM_PROT_READ; | |
6717 | vm_object_paging_begin(src_object); | |
6718 | ||
6719 | /* cap cluster size at maximum UPL size */ | |
6720 | if (os_convert_overflow(amount_left, &cluster_size)) { | |
6721 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; | |
6722 | } | |
6723 | fault_info_src.cluster_size = cluster_size; | |
6724 | ||
6725 | result_page = VM_PAGE_NULL; | |
6726 | result = vm_fault_page( | |
6727 | src_object, | |
6728 | vm_object_trunc_page(src_offset), | |
6729 | VM_PROT_READ, FALSE, | |
6730 | FALSE, /* page not looked up */ | |
6731 | &src_prot, | |
6732 | &result_page, &src_top_page, | |
6733 | (int *)0, &error, FALSE, | |
6734 | FALSE, &fault_info_src); | |
6735 | ||
6736 | switch (result) { | |
6737 | case VM_FAULT_SUCCESS: | |
6738 | break; | |
6739 | case VM_FAULT_RETRY: | |
6740 | goto RetrySourceFault; | |
6741 | case VM_FAULT_MEMORY_SHORTAGE: | |
6742 | if (vm_page_wait(interruptible)) { | |
6743 | goto RetrySourceFault; | |
6744 | } | |
6745 | OS_FALLTHROUGH; | |
6746 | case VM_FAULT_INTERRUPTED: | |
6747 | vm_fault_copy_dst_cleanup(dst_page); | |
6748 | RETURN(MACH_SEND_INTERRUPTED); | |
6749 | case VM_FAULT_SUCCESS_NO_VM_PAGE: | |
6750 | /* success but no VM page: fail */ | |
6751 | vm_object_paging_end(src_object); | |
6752 | vm_object_unlock(src_object); | |
6753 | OS_FALLTHROUGH; | |
6754 | case VM_FAULT_MEMORY_ERROR: | |
6755 | vm_fault_copy_dst_cleanup(dst_page); | |
6756 | if (error) { | |
6757 | return error; | |
6758 | } else { | |
6759 | return KERN_MEMORY_ERROR; | |
6760 | } | |
6761 | default: | |
6762 | panic("vm_fault_copy(2): unexpected " | |
6763 | "error 0x%x from " | |
6764 | "vm_fault_page()\n", result); | |
6765 | } | |
6766 | ||
6767 | result_page_object = VM_PAGE_OBJECT(result_page); | |
6768 | assert((src_top_page == VM_PAGE_NULL) == | |
6769 | (result_page_object == src_object)); | |
6770 | } | |
6771 | assert((src_prot & VM_PROT_READ) != VM_PROT_NONE); | |
6772 | vm_object_unlock(result_page_object); | |
6773 | } | |
6774 | ||
6775 | vm_map_lock_read(dst_map); | |
6776 | ||
6777 | if (!vm_map_verify(dst_map, dst_version)) { | |
6778 | vm_map_unlock_read(dst_map); | |
6779 | if (result_page != VM_PAGE_NULL && src_page != dst_page) { | |
6780 | vm_fault_copy_cleanup(result_page, src_top_page); | |
6781 | } | |
6782 | vm_fault_copy_dst_cleanup(dst_page); | |
6783 | break; | |
6784 | } | |
6785 | assert(dst_object == VM_PAGE_OBJECT(dst_page)); | |
6786 | ||
6787 | vm_object_lock(dst_object); | |
6788 | ||
6789 | if (dst_object->copy != old_copy_object) { | |
6790 | vm_object_unlock(dst_object); | |
6791 | vm_map_unlock_read(dst_map); | |
6792 | if (result_page != VM_PAGE_NULL && src_page != dst_page) { | |
6793 | vm_fault_copy_cleanup(result_page, src_top_page); | |
6794 | } | |
6795 | vm_fault_copy_dst_cleanup(dst_page); | |
6796 | break; | |
6797 | } | |
6798 | vm_object_unlock(dst_object); | |
6799 | ||
6800 | /* | |
6801 | * Copy the page, and note that it is dirty | |
6802 | * immediately. | |
6803 | */ | |
6804 | ||
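/*
 * Worked example (illustrative numbers only, assuming a 4 KiB page):
 * if src_offset ends in 0x200 and dst_offset ends in 0x600, then
 * src_po = 0x200, dst_po = 0x600, and part_size = 0x1000 - 0x600 =
 * 0xa00 bytes (clamped further by amount_left); src_offset and
 * dst_offset then advance by part_size and amount_left shrinks by the
 * same amount for the next iteration.
 */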
6805 | if (!page_aligned(src_offset) || | |
6806 | !page_aligned(dst_offset) || | |
6807 | !page_aligned(amount_left)) { | |
6808 | vm_object_offset_t src_po, | |
6809 | dst_po; | |
6810 | ||
6811 | src_po = src_offset - vm_object_trunc_page(src_offset); | |
6812 | dst_po = dst_offset - vm_object_trunc_page(dst_offset); | |
6813 | ||
6814 | if (dst_po > src_po) { | |
6815 | part_size = PAGE_SIZE - dst_po; | |
6816 | } else { | |
6817 | part_size = PAGE_SIZE - src_po; | |
6818 | } | |
6819 | if (part_size > (amount_left)) { | |
6820 | part_size = amount_left; | |
6821 | } | |
6822 | ||
6823 | if (result_page == VM_PAGE_NULL) { | |
6824 | assert((vm_offset_t) dst_po == dst_po); | |
6825 | assert((vm_size_t) part_size == part_size); | |
6826 | vm_page_part_zero_fill(dst_page, | |
6827 | (vm_offset_t) dst_po, | |
6828 | (vm_size_t) part_size); | |
6829 | } else { | |
6830 | assert((vm_offset_t) src_po == src_po); | |
6831 | assert((vm_offset_t) dst_po == dst_po); | |
6832 | assert((vm_size_t) part_size == part_size); | |
6833 | vm_page_part_copy(result_page, | |
6834 | (vm_offset_t) src_po, | |
6835 | dst_page, | |
6836 | (vm_offset_t) dst_po, | |
6837 | (vm_size_t)part_size); | |
6838 | if (!dst_page->vmp_dirty) { | |
6839 | vm_object_lock(dst_object); | |
6840 | SET_PAGE_DIRTY(dst_page, TRUE); | |
6841 | vm_object_unlock(dst_object); | |
6842 | } | |
6843 | } | |
6844 | } else { | |
6845 | part_size = PAGE_SIZE; | |
6846 | ||
6847 | if (result_page == VM_PAGE_NULL) { | |
6848 | vm_page_zero_fill(dst_page); | |
6849 | } else { | |
6850 | vm_object_lock(result_page_object); | |
6851 | vm_page_copy(result_page, dst_page); | |
6852 | vm_object_unlock(result_page_object); | |
6853 | ||
6854 | if (!dst_page->vmp_dirty) { | |
6855 | vm_object_lock(dst_object); | |
6856 | SET_PAGE_DIRTY(dst_page, TRUE); | |
6857 | vm_object_unlock(dst_object); | |
6858 | } | |
6859 | } | |
6860 | } | |
6861 | ||
6862 | /* | |
6863 | * Unlock everything, and return | |
6864 | */ | |
6865 | ||
6866 | vm_map_unlock_read(dst_map); | |
6867 | ||
6868 | if (result_page != VM_PAGE_NULL && src_page != dst_page) { | |
6869 | vm_fault_copy_cleanup(result_page, src_top_page); | |
6870 | } | |
6871 | vm_fault_copy_dst_cleanup(dst_page); | |
6872 | ||
6873 | amount_left -= part_size; | |
6874 | src_offset += part_size; | |
6875 | dst_offset += part_size; | |
6876 | } while (amount_left > 0); | |
6877 | ||
6878 | RETURN(KERN_SUCCESS); | |
6879 | #undef RETURN | |
6880 | ||
6881 | /*NOTREACHED*/ | |
6882 | } | |
6883 | ||
6884 | #if VM_FAULT_CLASSIFY | |
6885 | /* | |
6886 | * Temporary statistics gathering support. | |
6887 | */ | |
6888 | ||
6889 | /* | |
6890 | * Statistics arrays: | |
6891 | */ | |
6892 | #define VM_FAULT_TYPES_MAX 5 | |
6893 | #define VM_FAULT_LEVEL_MAX 8 | |
6894 | ||
6895 | int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX]; | |
6896 | ||
6897 | #define VM_FAULT_TYPE_ZERO_FILL 0 | |
6898 | #define VM_FAULT_TYPE_MAP_IN 1 | |
6899 | #define VM_FAULT_TYPE_PAGER 2 | |
6900 | #define VM_FAULT_TYPE_COPY 3 | |
6901 | #define VM_FAULT_TYPE_OTHER 4 | |
6902 | ||
6903 | ||
6904 | void | |
6905 | vm_fault_classify(vm_object_t object, | |
6906 | vm_object_offset_t offset, | |
6907 | vm_prot_t fault_type) | |
6908 | { | |
6909 | int type, level = 0; | |
6910 | vm_page_t m; | |
6911 | ||
6912 | while (TRUE) { | |
6913 | m = vm_page_lookup(object, offset); | |
6914 | if (m != VM_PAGE_NULL) { | |
6915 | if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) { | |
6916 | type = VM_FAULT_TYPE_OTHER; | |
6917 | break; | |
6918 | } | |
6919 | if (((fault_type & VM_PROT_WRITE) == 0) || | |
6920 | ((level == 0) && object->copy == VM_OBJECT_NULL)) { | |
6921 | type = VM_FAULT_TYPE_MAP_IN; | |
6922 | break; | |
6923 | } | |
6924 | type = VM_FAULT_TYPE_COPY; | |
6925 | break; | |
6926 | } else { | |
6927 | if (object->pager_created) { | |
6928 | type = VM_FAULT_TYPE_PAGER; | |
6929 | break; | |
6930 | } | |
6931 | if (object->shadow == VM_OBJECT_NULL) { | |
6932 | type = VM_FAULT_TYPE_ZERO_FILL; | |
6933 | break; | |
6934 | } | |
6935 | ||
6936 | offset += object->vo_shadow_offset; | |
6937 | object = object->shadow; | |
6938 | level++; | |
6939 | continue; | |
6940 | } | |
6941 | } | |
6942 | ||
6943 | if (level > VM_FAULT_LEVEL_MAX) { | |
6944 | level = VM_FAULT_LEVEL_MAX; | |
6945 | } | |
6946 | ||
6947 | vm_fault_stats[type][level] += 1; | |
6948 | ||
6949 | return; | |
6950 | } | |
6951 | ||
6952 | /* cleanup routine to call from debugger */ | |
6953 | ||
6954 | void | |
6955 | vm_fault_classify_init(void) | |
6956 | { | |
6957 | int type, level; | |
6958 | ||
6959 | for (type = 0; type < VM_FAULT_TYPES_MAX; type++) { | |
6960 | for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) { | |
6961 | vm_fault_stats[type][level] = 0; | |
6962 | } | |
6963 | } | |
6964 | ||
6965 | return; | |
6966 | } | |
6967 | #endif /* VM_FAULT_CLASSIFY */ | |
6968 | ||
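/*
 * kdp_lightweight_fault:
 *	Best-effort translation of "cur_target_addr" in "map" to a
 *	physical address, for use from debugger (KDP) context only.
 *	Takes no locks and never blocks: it bails out (returns 0) if a
 *	needed lock is held exclusively, if the page is busy, absent or
 *	otherwise in flux, or if the address is unaligned. Walks the
 *	shadow chain by hand and, when the data only exists in the
 *	compressor, decompresses it into a dedicated scratch page.
 */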
6969 | vm_offset_t | |
6970 | kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr) | |
6971 | { | |
6972 | vm_map_entry_t entry; | |
6973 | vm_object_t object; | |
6974 | vm_offset_t object_offset; | |
6975 | vm_page_t m; | |
6976 | int compressor_external_state, compressed_count_delta; | |
6977 | int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP); | |
6978 | int my_fault_type = VM_PROT_READ; | |
6979 | kern_return_t kr; | |
6980 | int effective_page_mask, effective_page_size; | |
6981 | ||
6982 | if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { | |
6983 | effective_page_mask = VM_MAP_PAGE_MASK(map); | |
6984 | effective_page_size = VM_MAP_PAGE_SIZE(map); | |
6985 | } else { | |
6986 | effective_page_mask = PAGE_MASK; | |
6987 | effective_page_size = PAGE_SIZE; | |
6988 | } | |
6989 | ||
6990 | if (not_in_kdp) { | |
6991 | panic("kdp_lightweight_fault called from outside of debugger context"); | |
6992 | } | |
6993 | ||
6994 | assert(map != VM_MAP_NULL); | |
6995 | ||
6996 | assert((cur_target_addr & effective_page_mask) == 0); | |
6997 | if ((cur_target_addr & effective_page_mask) != 0) { | |
6998 | return 0; | |
6999 | } | |
7000 | ||
7001 | if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) { | |
7002 | return 0; | |
7003 | } | |
7004 | ||
7005 | if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) { | |
7006 | return 0; | |
7007 | } | |
7008 | ||
7009 | if (entry->is_sub_map) { | |
7010 | return 0; | |
7011 | } | |
7012 | ||
7013 | object = VME_OBJECT(entry); | |
7014 | if (object == VM_OBJECT_NULL) { | |
7015 | return 0; | |
7016 | } | |
7017 | ||
7018 | object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry); | |
7019 | ||
7020 | while (TRUE) { | |
7021 | if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) { | |
7022 | return 0; | |
7023 | } | |
7024 | ||
7025 | if (object->pager_created && (object->paging_in_progress || | |
7026 | object->activity_in_progress)) { | |
7027 | return 0; | |
7028 | } | |
7029 | ||
7030 | m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset)); | |
7031 | ||
7032 | if (m != VM_PAGE_NULL) { | |
7033 | if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) { | |
7034 | return 0; | |
7035 | } | |
7036 | ||
7037 | if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning || | |
7038 | m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) { | |
7039 | return 0; | |
7040 | } | |
7041 | ||
7042 | assert(!m->vmp_private); | |
7043 | if (m->vmp_private) { | |
7044 | return 0; | |
7045 | } | |
7046 | ||
7047 | assert(!m->vmp_fictitious); | |
7048 | if (m->vmp_fictitious) { | |
7049 | return 0; | |
7050 | } | |
7051 | ||
7052 | assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); | |
7053 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { | |
7054 | return 0; | |
7055 | } | |
7056 | ||
7057 | return ptoa(VM_PAGE_GET_PHYS_PAGE(m)); | |
7058 | } | |
7059 | ||
7060 | compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; | |
7061 | ||
7062 | if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) { | |
7063 | if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) { | |
7064 | kr = vm_compressor_pager_get(object->pager, | |
7065 | vm_object_trunc_page(object_offset + object->paging_offset), | |
7066 | kdp_compressor_decompressed_page_ppnum, &my_fault_type, | |
7067 | compressor_flags, &compressed_count_delta); | |
7068 | if (kr == KERN_SUCCESS) { | |
7069 | return kdp_compressor_decompressed_page_paddr; | |
7070 | } else { | |
7071 | return 0; | |
7072 | } | |
7073 | } | |
7074 | } | |
7075 | ||
7076 | if (object->shadow == VM_OBJECT_NULL) { | |
7077 | return 0; | |
7078 | } | |
7079 | ||
7080 | object_offset += object->vo_shadow_offset; | |
7081 | object = object->shadow; | |
7082 | } | |
7083 | } | |
7084 | ||
7085 | /* | |
7086 | * vm_page_validate_cs_fast(): | |
7087 | * Performs a few quick checks to determine if the page's code signature | |
7088 | * really needs to be fully validated. It could: | |
7089 | * 1. have been modified (i.e. automatically tainted), | |
7090 | * 2. have already been validated, | |
7091 | * 3. have already been found to be tainted, | |
7092 | * 4. no longer have a backing store. | |
7093 | * Returns FALSE if the page needs to be fully validated. | |
7094 | */ | |
7095 | static boolean_t | |
7096 | vm_page_validate_cs_fast( | |
7097 | vm_page_t page, | |
7098 | vm_map_size_t fault_page_size, | |
7099 | vm_map_offset_t fault_phys_offset) | |
7100 | { | |
7101 | vm_object_t object; | |
7102 | ||
7103 | object = VM_PAGE_OBJECT(page); | |
7104 | vm_object_lock_assert_held(object); | |
7105 | ||
7106 | if (page->vmp_wpmapped && | |
7107 | !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) { | |
7108 | /* | |
7109 | * This page was mapped for "write" access sometime in the | |
7110 | * past and could still be modifiable in the future. | |
7111 | * Consider it tainted. | |
7112 | * [ If the page was already found to be "tainted", no | |
7113 | * need to re-validate. ] | |
7114 | */ | |
7115 | vm_object_lock_assert_exclusive(object); | |
7116 | VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE); | |
7117 | VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE); | |
7118 | if (cs_debug) { | |
7119 | printf("CODESIGNING: %s: " | |
7120 | "page %p obj %p off 0x%llx " | |
7121 | "was modified\n", | |
7122 | __FUNCTION__, | |
7123 | page, object, page->vmp_offset); | |
7124 | } | |
7125 | vm_cs_validated_dirtied++; | |
7126 | } | |
7127 | ||
7128 | if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) || | |
7129 | VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) { | |
7130 | return TRUE; | |
7131 | } | |
7132 | vm_object_lock_assert_exclusive(object); | |
7133 | ||
7134 | #if CHECK_CS_VALIDATION_BITMAP | |
7135 | kern_return_t kr; | |
7136 | ||
7137 | kr = vnode_pager_cs_check_validation_bitmap( | |
7138 | object->pager, | |
7139 | page->vmp_offset + object->paging_offset, | |
7140 | CS_BITMAP_CHECK); | |
7141 | if (kr == KERN_SUCCESS) { | |
7142 | page->vmp_cs_validated = VMP_CS_ALL_TRUE; | |
7143 | page->vmp_cs_tainted = VMP_CS_ALL_FALSE; | |
7144 | vm_cs_bitmap_validated++; | |
7145 | return TRUE; | |
7146 | } | |
7147 | #endif /* CHECK_CS_VALIDATION_BITMAP */ | |
7148 | ||
7149 | if (!object->alive || object->terminating || object->pager == NULL) { | |
7150 | /* | |
7151 | * The object is terminating and we don't have its pager | |
7152 | * so we can't validate the data... | |
7153 | */ | |
7154 | return TRUE; | |
7155 | } | |
7156 | ||
7157 | /* we need to really validate this page */ | |
7158 | vm_object_lock_assert_exclusive(object); | |
7159 | return FALSE; | |
7160 | } | |
7161 | ||
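/*
 * vm_page_validate_cs_mapped_slow():
 *	Fully validate the code signature of a page that is already
 *	mapped at "kaddr" in the kernel, recording the validated,
 *	tainted and nx results on the page.
 */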
7162 | void | |
7163 | vm_page_validate_cs_mapped_slow( | |
7164 | vm_page_t page, | |
7165 | const void *kaddr) | |
7166 | { | |
7167 | vm_object_t object; | |
7168 | memory_object_offset_t mo_offset; | |
7169 | memory_object_t pager; | |
7170 | struct vnode *vnode; | |
7171 | int validated, tainted, nx; | |
7172 | ||
7173 | assert(page->vmp_busy); | |
7174 | object = VM_PAGE_OBJECT(page); | |
7175 | vm_object_lock_assert_exclusive(object); | |
7176 | ||
7177 | vm_cs_validates++; | |
7178 | ||
7179 | /* | |
7180 | * Since we get here to validate a page that was brought in by | |
7181 | * the pager, we know that this pager is all set up and ready | |
7182 | * by now. | |
7183 | */ | |
7184 | assert(object->code_signed); | |
7185 | assert(!object->internal); | |
7186 | assert(object->pager != NULL); | |
7187 | assert(object->pager_ready); | |
7188 | ||
7189 | pager = object->pager; | |
7190 | assert(object->paging_in_progress); | |
7191 | vnode = vnode_pager_lookup_vnode(pager); | |
7192 | mo_offset = page->vmp_offset + object->paging_offset; | |
7193 | ||
7194 | /* verify the SHA1 hash for this page */ | |
7195 | validated = 0; | |
7196 | tainted = 0; | |
7197 | nx = 0; | |
7198 | cs_validate_page(vnode, | |
7199 | pager, | |
7200 | mo_offset, | |
7201 | (const void *)((const char *)kaddr), | |
7202 | &validated, | |
7203 | &tainted, | |
7204 | &nx); | |
7205 | ||
7206 | page->vmp_cs_validated |= validated; | |
7207 | page->vmp_cs_tainted |= tainted; | |
7208 | page->vmp_cs_nx |= nx; | |
7209 | ||
7210 | #if CHECK_CS_VALIDATION_BITMAP | |
7211 | if (page->vmp_cs_validated == VMP_CS_ALL_TRUE && | |
7212 | page->vmp_cs_tainted == VMP_CS_ALL_FALSE) { | |
7213 | vnode_pager_cs_check_validation_bitmap(object->pager, | |
7214 | mo_offset, | |
7215 | CS_BITMAP_SET); | |
7216 | } | |
7217 | #endif /* CHECK_CS_VALIDATION_BITMAP */ | |
7218 | } | |
7219 | ||
7220 | void | |
7221 | vm_page_validate_cs_mapped( | |
7222 | vm_page_t page, | |
7223 | vm_map_size_t fault_page_size, | |
7224 | vm_map_offset_t fault_phys_offset, | |
7225 | const void *kaddr) | |
7226 | { | |
7227 | if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) { | |
7228 | vm_page_validate_cs_mapped_slow(page, kaddr); | |
7229 | } | |
7230 | } | |
7231 | ||
7232 | void | |
7233 | vm_page_validate_cs( | |
7234 | vm_page_t page, | |
7235 | vm_map_size_t fault_page_size, | |
7236 | vm_map_offset_t fault_phys_offset) | |
7237 | { | |
7238 | vm_object_t object; | |
7239 | vm_object_offset_t offset; | |
7240 | vm_map_offset_t koffset; | |
7241 | vm_map_size_t ksize; | |
7242 | vm_offset_t kaddr; | |
7243 | kern_return_t kr; | |
7244 | boolean_t busy_page; | |
7245 | boolean_t need_unmap; | |
7246 | ||
7247 | object = VM_PAGE_OBJECT(page); | |
7248 | vm_object_lock_assert_held(object); | |
7249 | ||
7250 | if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) { | |
7251 | return; | |
7252 | } | |
7253 | vm_object_lock_assert_exclusive(object); | |
7254 | ||
7255 | assert(object->code_signed); | |
7256 | offset = page->vmp_offset; | |
7257 | ||
7258 | busy_page = page->vmp_busy; | |
7259 | if (!busy_page) { | |
7260 | /* keep page busy while we map (and unlock) the VM object */ | |
7261 | page->vmp_busy = TRUE; | |
7262 | } | |
7263 | ||
7264 | /* | |
7265 | * Take a paging reference on the VM object | |
7266 | * to protect it from collapse or bypass, | |
7267 | * and keep it from disappearing too. | |
7268 | */ | |
7269 | vm_object_paging_begin(object); | |
7270 | ||
7271 | /* map the page in the kernel address space */ | |
7272 | ksize = PAGE_SIZE_64; | |
7273 | koffset = 0; | |
7274 | need_unmap = FALSE; | |
7275 | kr = vm_paging_map_object(page, | |
7276 | object, | |
7277 | offset, | |
7278 | VM_PROT_READ, | |
7279 | FALSE, /* can't unlock object ! */ | |
7280 | &ksize, | |
7281 | &koffset, | |
7282 | &need_unmap); | |
7283 | if (kr != KERN_SUCCESS) { | |
7284 | panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr); | |
7285 | } | |
7286 | kaddr = CAST_DOWN(vm_offset_t, koffset); | |
7287 | ||
7288 | /* validate the mapped page */ | |
7289 | vm_page_validate_cs_mapped_slow(page, (const void *) kaddr); | |
7290 | ||
7291 | assert(page->vmp_busy); | |
7292 | assert(object == VM_PAGE_OBJECT(page)); | |
7293 | vm_object_lock_assert_exclusive(object); | |
7294 | ||
7295 | if (!busy_page) { | |
7296 | PAGE_WAKEUP_DONE(page); | |
7297 | } | |
7298 | if (need_unmap) { | |
7299 | /* unmap the page from the kernel address space */ | |
7300 | vm_paging_unmap_object(object, koffset, koffset + ksize); | |
7301 | koffset = 0; | |
7302 | ksize = 0; | |
7303 | kaddr = 0; | |
7304 | } | |
7305 | vm_object_paging_end(object); | |
7306 | } | |
7307 | ||
7308 | void | |
7309 | vm_page_validate_cs_mapped_chunk( | |
7310 | vm_page_t page, | |
7311 | const void *kaddr, | |
7312 | vm_offset_t chunk_offset, | |
7313 | vm_size_t chunk_size, | |
7314 | boolean_t *validated_p, | |
7315 | unsigned *tainted_p) | |
7316 | { | |
7317 | vm_object_t object; | |
7318 | vm_object_offset_t offset, offset_in_page; | |
7319 | memory_object_t pager; | |
7320 | struct vnode *vnode; | |
7321 | boolean_t validated; | |
7322 | unsigned tainted; | |
7323 | ||
7324 | *validated_p = FALSE; | |
7325 | *tainted_p = 0; | |
7326 | ||
7327 | assert(page->vmp_busy); | |
7328 | object = VM_PAGE_OBJECT(page); | |
7329 | vm_object_lock_assert_exclusive(object); | |
7330 | ||
7331 | assert(object->code_signed); | |
7332 | offset = page->vmp_offset; | |
7333 | ||
7334 | if (!object->alive || object->terminating || object->pager == NULL) { | |
7335 | /* | |
7336 | * The object is terminating and we don't have its pager | |
7337 | * so we can't validate the data... | |
7338 | */ | |
7339 | return; | |
7340 | } | |
7341 | /* | |
7342 | * Since we get here to validate a page that was brought in by | |
7343 | * the pager, we know that this pager is all set up and ready | |
7344 | * by now. | |
7345 | */ | |
7346 | assert(!object->internal); | |
7347 | assert(object->pager != NULL); | |
7348 | assert(object->pager_ready); | |
7349 | ||
7350 | pager = object->pager; | |
7351 | assert(object->paging_in_progress); | |
7352 | vnode = vnode_pager_lookup_vnode(pager); | |
7353 | ||
7354 | /* verify the signature for this chunk */ | |
7355 | offset_in_page = chunk_offset; | |
7356 | assert(offset_in_page < PAGE_SIZE); | |
7357 | ||
7358 | tainted = 0; | |
7359 | validated = cs_validate_range(vnode, | |
7360 | pager, | |
7361 | (object->paging_offset + | |
7362 | offset + | |
7363 | offset_in_page), | |
7364 | (const void *)((const char *)kaddr | |
7365 | + offset_in_page), | |
7366 | chunk_size, | |
7367 | &tainted); | |
7368 | if (validated) { | |
7369 | *validated_p = TRUE; | |
7370 | } | |
7371 | if (tainted) { | |
7372 | *tainted_p = tainted; | |
7373 | } | |
7374 | } | |
7375 | ||
7376 | static void | |
7377 | vm_rtfrecord_lock(void) | |
7378 | { | |
7379 | lck_spin_lock(&vm_rtfr_slock); | |
7380 | } | |
7381 | ||
7382 | static void | |
7383 | vm_rtfrecord_unlock(void) | |
7384 | { | |
7385 | lck_spin_unlock(&vm_rtfr_slock); | |
7386 | } | |
7387 | ||
7388 | unsigned int | |
7389 | vmrtfaultinfo_bufsz(void) | |
7390 | { | |
7391 | return vmrtf_num_records * sizeof(vm_rtfault_record_t); | |
7392 | } | |
7393 | ||
7394 | #include <kern/backtrace.h> | |
7395 | ||
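/*
 * vm_record_rtfault:
 *	Append one vm_rtfault_record_t (start time, duration, faulting
 *	address, user PC, fault type, pid and tid) to a fixed-size ring
 *	buffer protected by the vm_rtfr_slock spinlock; once vmrtfr_curi
 *	moves past vmrtfr_maxi it wraps back to 0, overwriting the
 *	oldest entries.
 */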
7396 | __attribute__((noinline)) | |
7397 | static void | |
7398 | vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault) | |
7399 | { | |
7400 | uint64_t fend = mach_continuous_time(); | |
7401 | ||
7402 | uint64_t cfpc = 0; | |
7403 | uint64_t ctid = cthread->thread_id; | |
7404 | uint64_t cupid = get_current_unique_pid(); | |
7405 | ||
7406 | uintptr_t bpc = 0; | |
7407 | int btr = 0; | |
7408 | bool u64 = false; | |
7409 | ||
7410 | /* Capture a single-frame backtrace; this extracts just the program | |
7411 | * counter at the point of the fault into "bpc", and should perform no | |
7412 | * further user stack traversals, thus avoiding copyin()s and further | |
7413 | * faults. | |
7414 | */ | |
7415 | unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL, false); | |
7416 | ||
7417 | if ((btr == 0) && (bfrs > 0)) { | |
7418 | cfpc = bpc; | |
7419 | } | |
7420 | ||
7421 | assert((fstart != 0) && fend >= fstart); | |
7422 | vm_rtfrecord_lock(); | |
7423 | assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi); | |
7424 | ||
7425 | vmrtfrs.vmrtf_total++; | |
7426 | vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++]; | |
7427 | ||
7428 | cvmr->rtfabstime = fstart; | |
7429 | cvmr->rtfduration = fend - fstart; | |
7430 | cvmr->rtfaddr = fault_vaddr; | |
7431 | cvmr->rtfpc = cfpc; | |
7432 | cvmr->rtftype = type_of_fault; | |
7433 | cvmr->rtfupid = cupid; | |
7434 | cvmr->rtftid = ctid; | |
7435 | ||
7436 | if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) { | |
7437 | vmrtfrs.vmrtfr_curi = 0; | |
7438 | } | |
7439 | ||
7440 | vm_rtfrecord_unlock(); | |
7441 | } | |
7442 | ||
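/*
 * vmrtf_extract:
 *	Copy the recorded fault records belonging to the process
 *	identified by "cupid" (or all records, for a root caller on
 *	DEVELOPMENT/DEBUG kernels) into the caller-supplied buffer
 *	"vrecords" of "vrecordsz" bytes. The number of records copied is
 *	returned through "vmrtfrv"; the return value is non-zero if the
 *	walk stopped early because the buffer filled up.
 */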
7443 | int | |
7444 | vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv) | |
7445 | { | |
7446 | vm_rtfault_record_t *cvmrd = vrecords; | |
7447 | size_t residue = vrecordsz; | |
7448 | size_t numextracted = 0; | |
7449 | boolean_t early_exit = FALSE; | |
7450 | ||
7451 | vm_rtfrecord_lock(); | |
7452 | ||
7453 | for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) { | |
7454 | if (residue < sizeof(vm_rtfault_record_t)) { | |
7455 | early_exit = TRUE; | |
7456 | break; | |
7457 | } | |
7458 | ||
7459 | if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) { | |
7460 | #if DEVELOPMENT || DEBUG | |
7461 | if (isroot == FALSE) { | |
7462 | continue; | |
7463 | } | |
7464 | #else | |
7465 | continue; | |
7466 | #endif /* DEVELOPMENT || DEBUG */ | |
7467 | } | |
7468 | ||
7469 | *cvmrd = vmrtfrs.vm_rtf_records[vmfi]; | |
7470 | cvmrd++; | |
7471 | residue -= sizeof(vm_rtfault_record_t); | |
7472 | numextracted++; | |
7473 | } | |
7474 | ||
7475 | vm_rtfrecord_unlock(); | |
7476 | ||
7477 | *vmrtfrv = numextracted; | |
7478 | return early_exit; | |
7479 | } |