1 | /* | |
2 | * Copyright (c) 2000-2020 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | /* | |
29 | * @OSF_COPYRIGHT@ | |
30 | */ | |
31 | /* | |
32 | * Mach Operating System | |
33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University | |
34 | * All Rights Reserved. | |
35 | * | |
36 | * Permission to use, copy, modify and distribute this software and its | |
37 | * documentation is hereby granted, provided that both the copyright | |
38 | * notice and this permission notice appear in all copies of the | |
39 | * software, derivative works or modified versions, and any portions | |
40 | * thereof, and that both notices appear in supporting documentation. | |
41 | * | |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" | |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR | |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. | |
45 | * | |
46 | * Carnegie Mellon requests users of this software to return to | |
47 | * | |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU | |
49 | * School of Computer Science | |
50 | * Carnegie Mellon University | |
51 | * Pittsburgh PA 15213-3890 | |
52 | * | |
53 | * any improvements or extensions that they make and grant Carnegie Mellon | |
54 | * the rights to redistribute these changes. | |
55 | */ | |
56 | /* | |
57 | */ | |
58 | /* | |
59 | * File: vm_fault.c | |
60 | * Author: Avadis Tevanian, Jr., Michael Wayne Young | |
61 | * | |
62 | * Page fault handling module. | |
63 | */ | |
64 | ||
65 | #include <mach_cluster_stats.h> | |
66 | #include <mach_pagemap.h> | |
67 | #include <libkern/OSAtomic.h> | |
68 | ||
69 | #include <mach/mach_types.h> | |
70 | #include <mach/kern_return.h> | |
71 | #include <mach/message.h> /* for error codes */ | |
72 | #include <mach/vm_param.h> | |
73 | #include <mach/vm_behavior.h> | |
74 | #include <mach/memory_object.h> | |
75 | /* For memory_object_data_{request,unlock} */ | |
76 | #include <mach/sdt.h> | |
77 | ||
78 | #include <kern/kern_types.h> | |
79 | #include <kern/host_statistics.h> | |
80 | #include <kern/counter.h> | |
81 | #include <kern/task.h> | |
82 | #include <kern/thread.h> | |
83 | #include <kern/sched_prim.h> | |
84 | #include <kern/host.h> | |
85 | #include <kern/mach_param.h> | |
86 | #include <kern/macro_help.h> | |
87 | #include <kern/zalloc.h> | |
88 | #include <kern/misc_protos.h> | |
89 | #include <kern/policy_internal.h> | |
90 | ||
91 | #include <vm/vm_compressor.h> | |
92 | #include <vm/vm_compressor_pager.h> | |
93 | #include <vm/vm_fault.h> | |
94 | #include <vm/vm_map.h> | |
95 | #include <vm/vm_object.h> | |
96 | #include <vm/vm_page.h> | |
97 | #include <vm/vm_kern.h> | |
98 | #include <vm/pmap.h> | |
99 | #include <vm/vm_pageout.h> | |
100 | #include <vm/vm_protos.h> | |
101 | #include <vm/vm_external.h> | |
102 | #include <vm/memory_object.h> | |
103 | #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */ | |
104 | #include <vm/vm_shared_region.h> | |
105 | ||
106 | #include <sys/codesign.h> | |
107 | #include <sys/reason.h> | |
108 | #include <sys/signalvar.h> | |
109 | ||
110 | #include <san/kasan.h> | |
111 | ||
112 | #define VM_FAULT_CLASSIFY 0 | |
113 | ||
114 | #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ | |
115 | ||
116 | int vm_protect_privileged_from_untrusted = 1; | |
117 | ||
118 | unsigned int vm_object_pagein_throttle = 16; | |
119 | ||
/*
 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control;
 * it kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 * keep the UI active so that the user has a chance to kill the offending task before the system
 * completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by giving the thread that's trying to demand-zero a page a
 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 */
133 | ||
134 | extern void throttle_lowpri_io(int); | |
135 | ||
136 | extern struct vnode *vnode_pager_lookup_vnode(memory_object_t); | |
137 | ||
138 | uint64_t vm_hard_throttle_threshold; | |
139 | ||
140 | #if DEBUG || DEVELOPMENT | |
141 | static bool vmtc_panic_instead = false; | |
142 | #endif /* DEBUG || DEVELOPMENT */ | |
143 | ||
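/*
 * Returns TRUE when the current task should be hard-throttled: either the VM
 * system has explicitly asked for this task to be throttled, or free pages are
 * below vm_page_throttle_limit (or the hard-throttle limit has been reached)
 * while the current thread's effective I/O policy is already at or above
 * THROTTLE_LEVEL_THROTTLED.
 */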
144 | OS_ALWAYS_INLINE | |
145 | boolean_t | |
146 | NEED_TO_HARD_THROTTLE_THIS_TASK(void) | |
147 | { | |
148 | return vm_wants_task_throttled(current_task()) || | |
149 | ((vm_page_free_count < vm_page_throttle_limit || | |
150 | HARD_THROTTLE_LIMIT_REACHED()) && | |
151 | proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED); | |
152 | } | |
153 | ||
154 | #define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ | |
155 | #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */ | |
156 | ||
157 | #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6 | |
158 | #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000 | |
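/*
 * Taken together, these two values mean a thread only becomes a throttling
 * candidate once it has created more than 6 s * 20000 pages/s = 120,000 pages;
 * it is then throttled while its observed creation rate (pages created divided
 * by elapsed seconds) stays at or above the per-second rate, as computed in
 * vm_page_throttled() below.
 */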
159 | ||
160 | ||
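/*
 * Account a decompression both in the global statistics and against the
 * current thread.
 */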
161 | #define VM_STAT_DECOMPRESSIONS() \ | |
162 | MACRO_BEGIN \ | |
163 | counter_inc(&vm_statistics_decompressions); \ | |
164 | current_thread()->decompressions++; \ | |
165 | MACRO_END | |
166 | ||
167 | boolean_t current_thread_aborted(void); | |
168 | ||
169 | /* Forward declarations of internal routines. */ | |
170 | static kern_return_t vm_fault_wire_fast( | |
171 | vm_map_t map, | |
172 | vm_map_offset_t va, | |
173 | vm_prot_t prot, | |
174 | vm_tag_t wire_tag, | |
175 | vm_map_entry_t entry, | |
176 | pmap_t pmap, | |
177 | vm_map_offset_t pmap_addr, | |
178 | ppnum_t *physpage_p); | |
179 | ||
180 | static kern_return_t vm_fault_internal( | |
181 | vm_map_t map, | |
182 | vm_map_offset_t vaddr, | |
183 | vm_prot_t caller_prot, | |
184 | boolean_t change_wiring, | |
185 | vm_tag_t wire_tag, | |
186 | int interruptible, | |
187 | pmap_t pmap, | |
188 | vm_map_offset_t pmap_addr, | |
189 | ppnum_t *physpage_p); | |
190 | ||
191 | static void vm_fault_copy_cleanup( | |
192 | vm_page_t page, | |
193 | vm_page_t top_page); | |
194 | ||
195 | static void vm_fault_copy_dst_cleanup( | |
196 | vm_page_t page); | |
197 | ||
198 | #if VM_FAULT_CLASSIFY | |
199 | extern void vm_fault_classify(vm_object_t object, | |
200 | vm_object_offset_t offset, | |
201 | vm_prot_t fault_type); | |
202 | ||
203 | extern void vm_fault_classify_init(void); | |
204 | #endif | |
205 | ||
206 | unsigned long vm_pmap_enter_blocked = 0; | |
207 | unsigned long vm_pmap_enter_retried = 0; | |
208 | ||
209 | unsigned long vm_cs_validates = 0; | |
210 | unsigned long vm_cs_revalidates = 0; | |
211 | unsigned long vm_cs_query_modified = 0; | |
212 | unsigned long vm_cs_validated_dirtied = 0; | |
213 | unsigned long vm_cs_bitmap_validated = 0; | |
214 | ||
215 | void vm_pre_fault(vm_map_offset_t, vm_prot_t); | |
216 | ||
217 | extern char *kdp_compressor_decompressed_page; | |
218 | extern addr64_t kdp_compressor_decompressed_page_paddr; | |
219 | extern ppnum_t kdp_compressor_decompressed_page_ppnum; | |
220 | ||
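/*
 * Bookkeeping for real-time fault records: vmrtfr_curi is the current index
 * into the record buffer, vmrtfr_maxi the last valid index (vmrtf_num_records - 1),
 * and vmrtf_total a running count of recorded faults.  The buffer itself is
 * sized by the "vm_rtfault_records" boot-arg and allocated in vm_rtfault_record_init().
 */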
221 | struct vmrtfr { | |
222 | int vmrtfr_maxi; | |
223 | int vmrtfr_curi; | |
224 | int64_t vmrtf_total; | |
225 | vm_rtfault_record_t *vm_rtf_records; | |
226 | } vmrtfrs; | |
227 | #define VMRTF_DEFAULT_BUFSIZE (4096) | |
228 | #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t)) | |
229 | TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT); | |
230 | ||
231 | static void vm_rtfrecord_lock(void); | |
232 | static void vm_rtfrecord_unlock(void); | |
233 | static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int); | |
234 | ||
235 | extern lck_grp_t vm_page_lck_grp_bucket; | |
236 | extern lck_attr_t vm_page_lck_attr; | |
237 | LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); | |
238 | ||
239 | /* | |
240 | * Routine: vm_fault_init | |
241 | * Purpose: | |
242 | * Initialize our private data structures. | |
243 | */ | |
244 | __startup_func | |
245 | void | |
246 | vm_fault_init(void) | |
247 | { | |
248 | int i, vm_compressor_temp; | |
249 | boolean_t need_default_val = TRUE; | |
250 | /* | |
251 | * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is | |
252 | * computed as a percentage of available memory, and the percentage used is scaled inversely with | |
253 | * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems | |
254 | * and reduce the value down to 10% for very large memory configurations. This helps give us a | |
255 | * definition of a memory hog that makes more sense relative to the amount of ram in the machine. | |
256 | * The formula here simply uses the number of gigabytes of ram to adjust the percentage. | |
257 | */ | |
258 | ||
259 | vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100; | |
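/*
 * A rough illustration of the formula above, taking sane_size as the amount
 * of usable RAM in bytes:
 *     1 GB of RAM -> (35 - 1)%  = 34% of RAM, ~348 MB
 *     8 GB of RAM -> (35 - 8)%  = 27% of RAM, ~2.2 GB
 *    32 GB of RAM -> (35 - 25)% = 10% of RAM, ~3.2 GB   (the GB term is capped at 25)
 */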
260 | ||
261 | /* | |
262 | * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry. | |
263 | */ | |
264 | ||
265 | if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) { | |
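/*
 * The boot-arg is intended to select exactly one of the supported pager modes;
 * a value that doesn't match a single mode bit is ignored and the default
 * path below is taken instead.
 */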
266 | for (i = 0; i < VM_PAGER_MAX_MODES; i++) { | |
267 | if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) { | |
268 | need_default_val = FALSE; | |
269 | vm_compressor_mode = vm_compressor_temp; | |
270 | break; | |
271 | } | |
272 | } | |
273 | if (need_default_val) { | |
274 | printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp); | |
275 | } | |
276 | } | |
277 | if (need_default_val) { | |
278 | /* If no boot arg or incorrect boot arg, try device tree. */ | |
279 | PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode)); | |
280 | } | |
281 | printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode); | |
282 | ||
283 | PE_parse_boot_argn("vm_protect_privileged_from_untrusted", | |
284 | &vm_protect_privileged_from_untrusted, | |
285 | sizeof(vm_protect_privileged_from_untrusted)); | |
286 | ||
287 | #if DEBUG || DEVELOPMENT | |
288 | (void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead)); | |
289 | #endif /* DEBUG || DEVELOPMENT */ | |
290 | } | |
291 | ||
292 | __startup_func | |
293 | static void | |
294 | vm_rtfault_record_init(void) | |
295 | { | |
296 | size_t size; | |
297 | ||
298 | vmrtf_num_records = MAX(vmrtf_num_records, 1); | |
299 | size = vmrtf_num_records * sizeof(vm_rtfault_record_t); | |
300 | vmrtfrs.vm_rtf_records = zalloc_permanent(size, | |
301 | ZALIGN(vm_rtfault_record_t)); | |
302 | vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1; | |
303 | } | |
304 | STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init); | |
305 | ||
306 | /* | |
307 | * Routine: vm_fault_cleanup | |
308 | * Purpose: | |
309 | * Clean up the result of vm_fault_page. | |
310 | * Results: | |
311 | * The paging reference for "object" is released. | |
312 | * "object" is unlocked. | |
313 | * If "top_page" is not null, "top_page" is | |
314 | * freed and the paging reference for the object | |
315 | * containing it is released. | |
316 | * | |
317 | * In/out conditions: | |
318 | * "object" must be locked. | |
319 | */ | |
320 | void | |
321 | vm_fault_cleanup( | |
322 | vm_object_t object, | |
323 | vm_page_t top_page) | |
324 | { | |
325 | vm_object_paging_end(object); | |
326 | vm_object_unlock(object); | |
327 | ||
328 | if (top_page != VM_PAGE_NULL) { | |
329 | object = VM_PAGE_OBJECT(top_page); | |
330 | ||
331 | vm_object_lock(object); | |
332 | VM_PAGE_FREE(top_page); | |
333 | vm_object_paging_end(object); | |
334 | vm_object_unlock(object); | |
335 | } | |
336 | } | |
337 | ||
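/* ALIGNED(x) is TRUE when "x" falls on a system page boundary */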
338 | #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) | |
339 | ||
340 | ||
341 | boolean_t vm_page_deactivate_behind = TRUE; | |
342 | /* | |
343 | * default sizes given VM_BEHAVIOR_DEFAULT reference behavior | |
344 | */ | |
345 | #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128 | |
346 | #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */ | |
347 | /* we use it to size an array on the stack */ | |
348 | ||
349 | int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW; | |
350 | ||
351 | #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) | |
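/*
 * object->sequential tracks the length, in bytes, of the sequential run of
 * faults detected so far: positive for a forward run, negative for a reverse
 * run, clamped to +/- MAX_SEQUENTIAL_RUN and reset to 0 whenever the pattern
 * breaks (see vm_fault_is_sequential() below).
 */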
352 | ||
/*
 * vm_fault_is_sequential
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.
 * Update state to indicate current access pattern.
 *
 * object must have at least the shared lock held
 */
362 | static | |
363 | void | |
364 | vm_fault_is_sequential( | |
365 | vm_object_t object, | |
366 | vm_object_offset_t offset, | |
367 | vm_behavior_t behavior) | |
368 | { | |
369 | vm_object_offset_t last_alloc; | |
370 | int sequential; | |
371 | int orig_sequential; | |
372 | ||
373 | last_alloc = object->last_alloc; | |
374 | sequential = object->sequential; | |
375 | orig_sequential = sequential; | |
376 | ||
377 | offset = vm_object_trunc_page(offset); | |
378 | if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) { | |
379 | /* re-faulting in the same page: no change in behavior */ | |
380 | return; | |
381 | } | |
382 | ||
383 | switch (behavior) { | |
384 | case VM_BEHAVIOR_RANDOM: | |
385 | /* | |
386 | * reset indicator of sequential behavior | |
387 | */ | |
388 | sequential = 0; | |
389 | break; | |
390 | ||
391 | case VM_BEHAVIOR_SEQUENTIAL: | |
392 | if (offset && last_alloc == offset - PAGE_SIZE_64) { | |
393 | /* | |
394 | * advance indicator of sequential behavior | |
395 | */ | |
396 | if (sequential < MAX_SEQUENTIAL_RUN) { | |
397 | sequential += PAGE_SIZE; | |
398 | } | |
399 | } else { | |
400 | /* | |
401 | * reset indicator of sequential behavior | |
402 | */ | |
403 | sequential = 0; | |
404 | } | |
405 | break; | |
406 | ||
407 | case VM_BEHAVIOR_RSEQNTL: | |
408 | if (last_alloc && last_alloc == offset + PAGE_SIZE_64) { | |
409 | /* | |
410 | * advance indicator of sequential behavior | |
411 | */ | |
412 | if (sequential > -MAX_SEQUENTIAL_RUN) { | |
413 | sequential -= PAGE_SIZE; | |
414 | } | |
415 | } else { | |
416 | /* | |
417 | * reset indicator of sequential behavior | |
418 | */ | |
419 | sequential = 0; | |
420 | } | |
421 | break; | |
422 | ||
423 | case VM_BEHAVIOR_DEFAULT: | |
424 | default: | |
425 | if (offset && last_alloc == (offset - PAGE_SIZE_64)) { | |
426 | /* | |
427 | * advance indicator of sequential behavior | |
428 | */ | |
429 | if (sequential < 0) { | |
430 | sequential = 0; | |
431 | } | |
432 | if (sequential < MAX_SEQUENTIAL_RUN) { | |
433 | sequential += PAGE_SIZE; | |
434 | } | |
435 | } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) { | |
436 | /* | |
437 | * advance indicator of sequential behavior | |
438 | */ | |
439 | if (sequential > 0) { | |
440 | sequential = 0; | |
441 | } | |
442 | if (sequential > -MAX_SEQUENTIAL_RUN) { | |
443 | sequential -= PAGE_SIZE; | |
444 | } | |
445 | } else { | |
446 | /* | |
447 | * reset indicator of sequential behavior | |
448 | */ | |
449 | sequential = 0; | |
450 | } | |
451 | break; | |
452 | } | |
453 | if (sequential != orig_sequential) { | |
454 | if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) { | |
455 | /* | |
456 | * if someone else has already updated object->sequential | |
457 | * don't bother trying to update it or object->last_alloc | |
458 | */ | |
459 | return; | |
460 | } | |
461 | } | |
/*
 * I'd like to do this with an OSCompareAndSwap64, but that
 * doesn't exist for PPC...  however, it shouldn't matter
 * that much... last_alloc is maintained so that we can determine
 * if a sequential access pattern is taking place... if only
 * one thread is banging on this object, no problem with the unprotected
 * update... if 2 or more threads are banging away, we run the risk of
 * someone seeing a mangled update... however, in the face of multiple
 * accesses, no sequential access pattern can develop anyway, so we
 * haven't lost any real info.
 */
473 | object->last_alloc = offset; | |
474 | } | |
475 | ||
476 | ||
477 | int vm_page_deactivate_behind_count = 0; | |
478 | ||
479 | /* | |
480 | * vm_page_deactivate_behind | |
481 | * | |
482 | * Determine if sequential access is in progress | |
483 | * in accordance with the behavior specified. If | |
484 | * so, compute a potential page to deactivate and | |
485 | * deactivate it. | |
486 | * | |
487 | * object must be locked. | |
488 | * | |
489 | * return TRUE if we actually deactivate a page | |
490 | */ | |
491 | static | |
492 | boolean_t | |
493 | vm_fault_deactivate_behind( | |
494 | vm_object_t object, | |
495 | vm_object_offset_t offset, | |
496 | vm_behavior_t behavior) | |
497 | { | |
498 | int n; | |
499 | int pages_in_run = 0; | |
500 | int max_pages_in_run = 0; | |
501 | int sequential_run; | |
502 | int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; | |
503 | vm_object_offset_t run_offset = 0; | |
504 | vm_object_offset_t pg_offset = 0; | |
505 | vm_page_t m; | |
506 | vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; | |
507 | ||
508 | pages_in_run = 0; | |
509 | #if TRACEFAULTPAGE | |
510 | dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ | |
511 | #endif | |
512 | if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) { | |
/*
 * Do not deactivate pages from the kernel object: they
 * are not intended to become pageable;
 * or we've disabled the deactivate-behind mechanism;
 * or we are dealing with an offset that is not aligned to
 * the system's PAGE_SIZE, because in that case we will
 * handle the deactivation on the aligned offset and, thus,
 * the full PAGE_SIZE page only once.  This helps us avoid
 * redundant deactivates and extra faults.
 */
523 | return FALSE; | |
524 | } | |
525 | if ((sequential_run = object->sequential)) { | |
526 | if (sequential_run < 0) { | |
527 | sequential_behavior = VM_BEHAVIOR_RSEQNTL; | |
528 | sequential_run = 0 - sequential_run; | |
529 | } else { | |
530 | sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; | |
531 | } | |
532 | } | |
533 | switch (behavior) { | |
534 | case VM_BEHAVIOR_RANDOM: | |
535 | break; | |
536 | case VM_BEHAVIOR_SEQUENTIAL: | |
537 | if (sequential_run >= (int)PAGE_SIZE) { | |
538 | run_offset = 0 - PAGE_SIZE_64; | |
539 | max_pages_in_run = 1; | |
540 | } | |
541 | break; | |
542 | case VM_BEHAVIOR_RSEQNTL: | |
543 | if (sequential_run >= (int)PAGE_SIZE) { | |
544 | run_offset = PAGE_SIZE_64; | |
545 | max_pages_in_run = 1; | |
546 | } | |
547 | break; | |
548 | case VM_BEHAVIOR_DEFAULT: | |
549 | default: | |
550 | { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64; | |
551 | ||
/*
 * determine if the run of sequential access has been
 * long enough on an object with default access behavior
 * to consider it for deactivation
 */
557 | if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) { | |
558 | /* | |
559 | * the comparisons between offset and behind are done | |
560 | * in this kind of odd fashion in order to prevent wrap around | |
561 | * at the end points | |
562 | */ | |
563 | if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { | |
564 | if (offset >= behind) { | |
565 | run_offset = 0 - behind; | |
566 | pg_offset = PAGE_SIZE_64; | |
567 | max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; | |
568 | } | |
569 | } else { | |
570 | if (offset < -behind) { | |
571 | run_offset = behind; | |
572 | pg_offset = 0 - PAGE_SIZE_64; | |
573 | max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; | |
574 | } | |
575 | } | |
576 | } | |
577 | break;} | |
578 | } | |
579 | for (n = 0; n < max_pages_in_run; n++) { | |
580 | m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); | |
581 | ||
582 | if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) { | |
583 | page_run[pages_in_run++] = m; | |
584 | ||
/*
 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 *
 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 * new reference happens.  If no further references happen on the page after that remote TLB flushes,
 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 * by pageout_scan, which is just fine since the last reference would have happened quite far
 * in the past (TLB caches don't hang around for very long), and of course could just as easily
 * have happened before we did the deactivate_behind.
 */
596 | pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); | |
597 | } | |
598 | } | |
599 | if (pages_in_run) { | |
600 | vm_page_lockspin_queues(); | |
601 | ||
602 | for (n = 0; n < pages_in_run; n++) { | |
603 | m = page_run[n]; | |
604 | ||
605 | vm_page_deactivate_internal(m, FALSE); | |
606 | ||
607 | vm_page_deactivate_behind_count++; | |
608 | #if TRACEFAULTPAGE | |
609 | dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ | |
610 | #endif | |
611 | } | |
612 | vm_page_unlock_queues(); | |
613 | ||
614 | return TRUE; | |
615 | } | |
616 | return FALSE; | |
617 | } | |
618 | ||
619 | ||
620 | #if (DEVELOPMENT || DEBUG) | |
621 | uint32_t vm_page_creation_throttled_hard = 0; | |
622 | uint32_t vm_page_creation_throttled_soft = 0; | |
623 | uint64_t vm_page_creation_throttle_avoided = 0; | |
624 | #endif /* DEVELOPMENT || DEBUG */ | |
625 | ||
626 | static int | |
627 | vm_page_throttled(boolean_t page_kept) | |
628 | { | |
629 | clock_sec_t elapsed_sec; | |
630 | clock_sec_t tv_sec; | |
631 | clock_usec_t tv_usec; | |
632 | ||
633 | thread_t thread = current_thread(); | |
634 | ||
635 | if (thread->options & TH_OPT_VMPRIV) { | |
636 | return 0; | |
637 | } | |
638 | ||
639 | if (thread->t_page_creation_throttled) { | |
640 | thread->t_page_creation_throttled = 0; | |
641 | ||
642 | if (page_kept == FALSE) { | |
643 | goto no_throttle; | |
644 | } | |
645 | } | |
646 | if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { | |
647 | #if (DEVELOPMENT || DEBUG) | |
648 | thread->t_page_creation_throttled_hard++; | |
649 | OSAddAtomic(1, &vm_page_creation_throttled_hard); | |
650 | #endif /* DEVELOPMENT || DEBUG */ | |
651 | return HARD_THROTTLE_DELAY; | |
652 | } | |
653 | ||
654 | if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) && | |
655 | thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) { | |
656 | if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) { | |
657 | #if (DEVELOPMENT || DEBUG) | |
658 | OSAddAtomic64(1, &vm_page_creation_throttle_avoided); | |
659 | #endif | |
660 | goto no_throttle; | |
661 | } | |
662 | clock_get_system_microtime(&tv_sec, &tv_usec); | |
663 | ||
664 | elapsed_sec = tv_sec - thread->t_page_creation_time; | |
665 | ||
666 | if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS || | |
667 | (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) { | |
668 | if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) { | |
669 | /* | |
670 | * we'll reset our stats to give a well behaved app | |
671 | * that was unlucky enough to accumulate a bunch of pages | |
672 | * over a long period of time a chance to get out of | |
673 | * the throttled state... we reset the counter and timestamp | |
674 | * so that if it stays under the rate limit for the next second | |
675 | * it will be back in our good graces... if it exceeds it, it | |
676 | * will remain in the throttled state | |
677 | */ | |
678 | thread->t_page_creation_time = tv_sec; | |
679 | thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1); | |
680 | } | |
681 | VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1); | |
682 | ||
683 | thread->t_page_creation_throttled = 1; | |
684 | ||
685 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) { | |
686 | #if (DEVELOPMENT || DEBUG) | |
687 | thread->t_page_creation_throttled_hard++; | |
688 | OSAddAtomic(1, &vm_page_creation_throttled_hard); | |
689 | #endif /* DEVELOPMENT || DEBUG */ | |
690 | return HARD_THROTTLE_DELAY; | |
691 | } else { | |
692 | #if (DEVELOPMENT || DEBUG) | |
693 | thread->t_page_creation_throttled_soft++; | |
694 | OSAddAtomic(1, &vm_page_creation_throttled_soft); | |
695 | #endif /* DEVELOPMENT || DEBUG */ | |
696 | return SOFT_THROTTLE_DELAY; | |
697 | } | |
698 | } | |
699 | thread->t_page_creation_time = tv_sec; | |
700 | thread->t_page_creation_count = 0; | |
701 | } | |
702 | no_throttle: | |
703 | thread->t_page_creation_count++; | |
704 | ||
705 | return 0; | |
706 | } | |
707 | ||
708 | ||
709 | /* | |
710 | * check for various conditions that would | |
711 | * prevent us from creating a ZF page... | |
712 | * cleanup is based on being called from vm_fault_page | |
713 | * | |
714 | * object must be locked | |
715 | * object == m->vmp_object | |
716 | */ | |
717 | static vm_fault_return_t | |
718 | vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle) | |
719 | { | |
720 | int throttle_delay; | |
721 | ||
722 | if (object->shadow_severed || | |
723 | VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { | |
724 | /* | |
725 | * Either: | |
726 | * 1. the shadow chain was severed, | |
727 | * 2. the purgeable object is volatile or empty and is marked | |
728 | * to fault on access while volatile. | |
729 | * Just have to return an error at this point | |
730 | */ | |
731 | if (m != VM_PAGE_NULL) { | |
732 | VM_PAGE_FREE(m); | |
733 | } | |
734 | vm_fault_cleanup(object, first_m); | |
735 | ||
736 | thread_interrupt_level(interruptible_state); | |
737 | ||
738 | return VM_FAULT_MEMORY_ERROR; | |
739 | } | |
740 | if (page_throttle == TRUE) { | |
741 | if ((throttle_delay = vm_page_throttled(FALSE))) { | |
742 | /* | |
743 | * we're throttling zero-fills... | |
744 | * treat this as if we couldn't grab a page | |
745 | */ | |
746 | if (m != VM_PAGE_NULL) { | |
747 | VM_PAGE_FREE(m); | |
748 | } | |
749 | vm_fault_cleanup(object, first_m); | |
750 | ||
751 | VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
752 | ||
753 | delay(throttle_delay); | |
754 | ||
755 | if (current_thread_aborted()) { | |
756 | thread_interrupt_level(interruptible_state); | |
757 | return VM_FAULT_INTERRUPTED; | |
758 | } | |
759 | thread_interrupt_level(interruptible_state); | |
760 | ||
761 | return VM_FAULT_MEMORY_SHORTAGE; | |
762 | } | |
763 | } | |
764 | return VM_FAULT_SUCCESS; | |
765 | } | |
766 | ||
767 | /* | |
768 | * Clear the code signing bits on the given page_t | |
769 | */ | |
770 | static void | |
771 | vm_fault_cs_clear(vm_page_t m) | |
772 | { | |
773 | m->vmp_cs_validated = VMP_CS_ALL_FALSE; | |
774 | m->vmp_cs_tainted = VMP_CS_ALL_FALSE; | |
775 | m->vmp_cs_nx = VMP_CS_ALL_FALSE; | |
776 | } | |
777 | ||
778 | /* | |
779 | * Enqueues the given page on the throttled queue. | |
780 | * The caller must hold the vm_page_queue_lock and it will be held on return. | |
781 | */ | |
782 | static void | |
783 | vm_fault_enqueue_throttled_locked(vm_page_t m) | |
784 | { | |
785 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); | |
786 | assert(!VM_PAGE_WIRED(m)); | |
787 | ||
788 | /* | |
789 | * can't be on the pageout queue since we don't | |
790 | * have a pager to try and clean to | |
791 | */ | |
792 | vm_page_queues_remove(m, TRUE); | |
793 | vm_page_check_pageable_safe(m); | |
794 | vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq); | |
795 | m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; | |
796 | vm_page_throttled_count++; | |
797 | } | |
798 | ||
799 | /* | |
800 | * do the work to zero fill a page and | |
801 | * inject it into the correct paging queue | |
802 | * | |
803 | * m->vmp_object must be locked | |
804 | * page queue lock must NOT be held | |
805 | */ | |
806 | static int | |
807 | vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) | |
808 | { | |
809 | int my_fault = DBG_ZERO_FILL_FAULT; | |
810 | vm_object_t object; | |
811 | ||
812 | object = VM_PAGE_OBJECT(m); | |
813 | ||
/*
 * This is a zero-fill page fault...
 *
 * Checking the page lock is a waste of
 * time; this page was absent, so
 * it can't be page locked by a pager.
 *
 * we also consider it undefined
 * with respect to instruction
 * execution. i.e. it is the responsibility
 * of higher layers to call for an instruction
 * sync after changing the contents and before
 * sending a program into this area.  We
 * choose this approach for performance
 */
829 | vm_fault_cs_clear(m); | |
830 | m->vmp_pmapped = TRUE; | |
831 | ||
832 | if (no_zero_fill == TRUE) { | |
833 | my_fault = DBG_NZF_PAGE_FAULT; | |
834 | ||
835 | if (m->vmp_absent && m->vmp_busy) { | |
836 | return my_fault; | |
837 | } | |
838 | } else { | |
839 | vm_page_zero_fill(m); | |
840 | ||
841 | counter_inc(&vm_statistics_zero_fill_count); | |
842 | DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); | |
843 | } | |
844 | assert(!m->vmp_laundry); | |
845 | assert(object != kernel_object); | |
846 | //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); | |
847 | if (!VM_DYNAMIC_PAGING_ENABLED() && | |
848 | (object->purgable == VM_PURGABLE_DENY || | |
849 | object->purgable == VM_PURGABLE_NONVOLATILE || | |
850 | object->purgable == VM_PURGABLE_VOLATILE)) { | |
851 | vm_page_lockspin_queues(); | |
852 | if (!VM_DYNAMIC_PAGING_ENABLED()) { | |
853 | vm_fault_enqueue_throttled_locked(m); | |
854 | } | |
855 | vm_page_unlock_queues(); | |
856 | } | |
857 | return my_fault; | |
858 | } | |
859 | ||
860 | ||
861 | /* | |
862 | * Routine: vm_fault_page | |
863 | * Purpose: | |
864 | * Find the resident page for the virtual memory | |
865 | * specified by the given virtual memory object | |
866 | * and offset. | |
867 | * Additional arguments: | |
 * The required permissions for the page are given
 * in "fault_type".  Desired permissions are included
 * in "protection".
871 | * fault_info is passed along to determine pagein cluster | |
872 | * limits... it contains the expected reference pattern, | |
873 | * cluster size if available, etc... | |
874 | * | |
875 | * If the desired page is known to be resident (for | |
876 | * example, because it was previously wired down), asserting | |
877 | * the "unwiring" parameter will speed the search. | |
878 | * | |
879 | * If the operation can be interrupted (by thread_abort | |
880 | * or thread_terminate), then the "interruptible" | |
881 | * parameter should be asserted. | |
882 | * | |
883 | * Results: | |
884 | * The page containing the proper data is returned | |
885 | * in "result_page". | |
886 | * | |
887 | * In/out conditions: | |
888 | * The source object must be locked and referenced, | |
889 | * and must donate one paging reference. The reference | |
890 | * is not affected. The paging reference and lock are | |
891 | * consumed. | |
892 | * | |
893 | * If the call succeeds, the object in which "result_page" | |
894 | * resides is left locked and holding a paging reference. | |
895 | * If this is not the original object, a busy page in the | |
896 | * original object is returned in "top_page", to prevent other | |
897 | * callers from pursuing this same data, along with a paging | |
898 | * reference for the original object. The "top_page" should | |
899 | * be destroyed when this guarantee is no longer required. | |
900 | * The "result_page" is also left busy. It is not removed | |
901 | * from the pageout queues. | |
902 | * Special Case: | |
903 | * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the | |
904 | * fault succeeded but there's no VM page (i.e. the VM object | |
905 | * does not actually hold VM pages, but device memory or | |
906 | * large pages). The object is still locked and we still hold a | |
907 | * paging_in_progress reference. | |
908 | */ | |
909 | unsigned int vm_fault_page_blocked_access = 0; | |
910 | unsigned int vm_fault_page_forced_retry = 0; | |
911 | ||
912 | vm_fault_return_t | |
913 | vm_fault_page( | |
914 | /* Arguments: */ | |
915 | vm_object_t first_object, /* Object to begin search */ | |
916 | vm_object_offset_t first_offset, /* Offset into object */ | |
917 | vm_prot_t fault_type, /* What access is requested */ | |
918 | boolean_t must_be_resident,/* Must page be resident? */ | |
919 | boolean_t caller_lookup, /* caller looked up page */ | |
920 | /* Modifies in place: */ | |
921 | vm_prot_t *protection, /* Protection for mapping */ | |
922 | vm_page_t *result_page, /* Page found, if successful */ | |
923 | /* Returns: */ | |
924 | vm_page_t *top_page, /* Page in top object, if | |
925 | * not result_page. */ | |
926 | int *type_of_fault, /* if non-null, fill in with type of fault | |
927 | * COW, zero-fill, etc... returned in trace point */ | |
928 | /* More arguments: */ | |
929 | kern_return_t *error_code, /* code if page is in error */ | |
930 | boolean_t no_zero_fill, /* don't zero fill absent pages */ | |
931 | boolean_t data_supply, /* treat as data_supply if | |
932 | * it is a write fault and a full | |
933 | * page is provided */ | |
934 | vm_object_fault_info_t fault_info) | |
935 | { | |
936 | vm_page_t m; | |
937 | vm_object_t object; | |
938 | vm_object_offset_t offset; | |
939 | vm_page_t first_m; | |
940 | vm_object_t next_object; | |
941 | vm_object_t copy_object; | |
942 | boolean_t look_for_page; | |
943 | boolean_t force_fault_retry = FALSE; | |
944 | vm_prot_t access_required = fault_type; | |
945 | vm_prot_t wants_copy_flag; | |
946 | kern_return_t wait_result; | |
947 | wait_interrupt_t interruptible_state; | |
948 | boolean_t data_already_requested = FALSE; | |
949 | vm_behavior_t orig_behavior; | |
950 | vm_size_t orig_cluster_size; | |
951 | vm_fault_return_t error; | |
952 | int my_fault; | |
953 | uint32_t try_failed_count; | |
954 | int interruptible; /* how may fault be interrupted? */ | |
955 | int external_state = VM_EXTERNAL_STATE_UNKNOWN; | |
956 | memory_object_t pager; | |
957 | vm_fault_return_t retval; | |
958 | int grab_options; | |
959 | ||
960 | /* | |
961 | * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is | |
962 | * marked as paged out in the compressor pager or the pager doesn't exist. | |
963 | * Note also that if the pager for an internal object | |
964 | * has not been created, the pager is not invoked regardless of the value | |
965 | * of MUST_ASK_PAGER(). | |
966 | * | |
967 | * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset | |
968 | * is marked as paged out in the compressor pager. | |
969 | * PAGED_OUT() is used to determine if a page has already been pushed | |
970 | * into a copy object in order to avoid a redundant page out operation. | |
971 | */ | |
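/*
 * Note that MUST_ASK_PAGER() also stores the compressor pager state it looked
 * up into "s", so the caller can reuse it without a second lookup.
 */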
972 | #define MUST_ASK_PAGER(o, f, s) \ | |
973 | ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT) | |
974 | ||
975 | #define PAGED_OUT(o, f) \ | |
976 | (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS) | |
977 | ||
978 | /* | |
979 | * Recovery actions | |
980 | */ | |
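/*
 * RELEASE_PAGE() wakes up any threads waiting on "m" and, if the page is not
 * already on a pageable queue, deactivates it when the compressor is active,
 * otherwise activates it.
 */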
981 | #define RELEASE_PAGE(m) \ | |
982 | MACRO_BEGIN \ | |
983 | PAGE_WAKEUP_DONE(m); \ | |
984 | if ( !VM_PAGE_PAGEABLE(m)) { \ | |
985 | vm_page_lockspin_queues(); \ | |
986 | if ( !VM_PAGE_PAGEABLE(m)) { \ | |
987 | if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \ | |
988 | vm_page_deactivate(m); \ | |
989 | else \ | |
990 | vm_page_activate(m); \ | |
991 | } \ | |
992 | vm_page_unlock_queues(); \ | |
993 | } \ | |
994 | MACRO_END | |
995 | ||
996 | #if TRACEFAULTPAGE | |
997 | dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */ | |
998 | #endif | |
999 | ||
1000 | interruptible = fault_info->interruptible; | |
1001 | interruptible_state = thread_interrupt_level(interruptible); | |
1002 | ||
1003 | /* | |
1004 | * INVARIANTS (through entire routine): | |
1005 | * | |
1006 | * 1) At all times, we must either have the object | |
1007 | * lock or a busy page in some object to prevent | |
1008 | * some other thread from trying to bring in | |
1009 | * the same page. | |
1010 | * | |
1011 | * Note that we cannot hold any locks during the | |
1012 | * pager access or when waiting for memory, so | |
1013 | * we use a busy page then. | |
1014 | * | |
1015 | * 2) To prevent another thread from racing us down the | |
1016 | * shadow chain and entering a new page in the top | |
1017 | * object before we do, we must keep a busy page in | |
1018 | * the top object while following the shadow chain. | |
1019 | * | |
1020 | * 3) We must increment paging_in_progress on any object | |
1021 | * for which we have a busy page before dropping | |
1022 | * the object lock | |
1023 | * | |
1024 | * 4) We leave busy pages on the pageout queues. | |
1025 | * If the pageout daemon comes across a busy page, | |
1026 | * it will remove the page from the pageout queues. | |
1027 | */ | |
1028 | ||
1029 | object = first_object; | |
1030 | offset = first_offset; | |
1031 | first_m = VM_PAGE_NULL; | |
1032 | access_required = fault_type; | |
1033 | ||
1034 | /* | |
1035 | * default type of fault | |
1036 | */ | |
1037 | my_fault = DBG_CACHE_HIT_FAULT; | |
1038 | ||
1039 | while (TRUE) { | |
1040 | #if TRACEFAULTPAGE | |
1041 | dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1042 | #endif | |
1043 | ||
1044 | grab_options = 0; | |
1045 | #if CONFIG_SECLUDED_MEMORY | |
1046 | if (object->can_grab_secluded) { | |
1047 | grab_options |= VM_PAGE_GRAB_SECLUDED; | |
1048 | } | |
1049 | #endif /* CONFIG_SECLUDED_MEMORY */ | |
1050 | ||
1051 | if (!object->alive) { | |
1052 | /* | |
1053 | * object is no longer valid | |
1054 | * clean up and return error | |
1055 | */ | |
1056 | vm_fault_cleanup(object, first_m); | |
1057 | thread_interrupt_level(interruptible_state); | |
1058 | ||
1059 | return VM_FAULT_MEMORY_ERROR; | |
1060 | } | |
1061 | ||
1062 | if (!object->pager_created && object->phys_contiguous) { | |
1063 | /* | |
1064 | * A physically-contiguous object without a pager: | |
1065 | * must be a "large page" object. We do not deal | |
1066 | * with VM pages for this object. | |
1067 | */ | |
1068 | caller_lookup = FALSE; | |
1069 | m = VM_PAGE_NULL; | |
1070 | goto phys_contig_object; | |
1071 | } | |
1072 | ||
1073 | if (object->blocked_access) { | |
1074 | /* | |
1075 | * Access to this VM object has been blocked. | |
1076 | * Replace our "paging_in_progress" reference with | |
1077 | * a "activity_in_progress" reference and wait for | |
1078 | * access to be unblocked. | |
1079 | */ | |
1080 | caller_lookup = FALSE; /* no longer valid after sleep */ | |
1081 | vm_object_activity_begin(object); | |
1082 | vm_object_paging_end(object); | |
1083 | while (object->blocked_access) { | |
1084 | vm_object_sleep(object, | |
1085 | VM_OBJECT_EVENT_UNBLOCKED, | |
1086 | THREAD_UNINT); | |
1087 | } | |
1088 | vm_fault_page_blocked_access++; | |
1089 | vm_object_paging_begin(object); | |
1090 | vm_object_activity_end(object); | |
1091 | } | |
1092 | ||
1093 | /* | |
1094 | * See whether the page at 'offset' is resident | |
1095 | */ | |
1096 | if (caller_lookup == TRUE) { | |
1097 | /* | |
1098 | * The caller has already looked up the page | |
1099 | * and gave us the result in "result_page". | |
1100 | * We can use this for the first lookup but | |
1101 | * it loses its validity as soon as we unlock | |
1102 | * the object. | |
1103 | */ | |
1104 | m = *result_page; | |
1105 | caller_lookup = FALSE; /* no longer valid after that */ | |
1106 | } else { | |
1107 | m = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
1108 | } | |
1109 | #if TRACEFAULTPAGE | |
1110 | dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ | |
1111 | #endif | |
1112 | if (m != VM_PAGE_NULL) { | |
1113 | if (m->vmp_busy) { | |
1114 | /* | |
1115 | * The page is being brought in, | |
1116 | * wait for it and then retry. | |
1117 | */ | |
1118 | #if TRACEFAULTPAGE | |
1119 | dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1120 | #endif | |
1121 | wait_result = PAGE_SLEEP(object, m, interruptible); | |
1122 | ||
1123 | if (wait_result != THREAD_AWAKENED) { | |
1124 | vm_fault_cleanup(object, first_m); | |
1125 | thread_interrupt_level(interruptible_state); | |
1126 | ||
1127 | if (wait_result == THREAD_RESTART) { | |
1128 | return VM_FAULT_RETRY; | |
1129 | } else { | |
1130 | return VM_FAULT_INTERRUPTED; | |
1131 | } | |
1132 | } | |
1133 | continue; | |
1134 | } | |
1135 | if (m->vmp_laundry) { | |
1136 | m->vmp_free_when_done = FALSE; | |
1137 | ||
1138 | if (!m->vmp_cleaning) { | |
1139 | vm_pageout_steal_laundry(m, FALSE); | |
1140 | } | |
1141 | } | |
1142 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
1143 | /* | |
1144 | * Guard page: off limits ! | |
1145 | */ | |
1146 | if (fault_type == VM_PROT_NONE) { | |
1147 | /* | |
1148 | * The fault is not requesting any | |
1149 | * access to the guard page, so it must | |
1150 | * be just to wire or unwire it. | |
1151 | * Let's pretend it succeeded... | |
1152 | */ | |
1153 | m->vmp_busy = TRUE; | |
1154 | *result_page = m; | |
1155 | assert(first_m == VM_PAGE_NULL); | |
1156 | *top_page = first_m; | |
1157 | if (type_of_fault) { | |
1158 | *type_of_fault = DBG_GUARD_FAULT; | |
1159 | } | |
1160 | thread_interrupt_level(interruptible_state); | |
1161 | return VM_FAULT_SUCCESS; | |
1162 | } else { | |
1163 | /* | |
1164 | * The fault requests access to the | |
1165 | * guard page: let's deny that ! | |
1166 | */ | |
1167 | vm_fault_cleanup(object, first_m); | |
1168 | thread_interrupt_level(interruptible_state); | |
1169 | return VM_FAULT_MEMORY_ERROR; | |
1170 | } | |
1171 | } | |
1172 | ||
1173 | if (m->vmp_error) { | |
1174 | /* | |
1175 | * The page is in error, give up now. | |
1176 | */ | |
1177 | #if TRACEFAULTPAGE | |
1178 | dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */ | |
1179 | #endif | |
1180 | if (error_code) { | |
1181 | *error_code = KERN_MEMORY_ERROR; | |
1182 | } | |
1183 | VM_PAGE_FREE(m); | |
1184 | ||
1185 | vm_fault_cleanup(object, first_m); | |
1186 | thread_interrupt_level(interruptible_state); | |
1187 | ||
1188 | return VM_FAULT_MEMORY_ERROR; | |
1189 | } | |
1190 | if (m->vmp_restart) { | |
1191 | /* | |
1192 | * The pager wants us to restart | |
1193 | * at the top of the chain, | |
1194 | * typically because it has moved the | |
1195 | * page to another pager, then do so. | |
1196 | */ | |
1197 | #if TRACEFAULTPAGE | |
1198 | dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1199 | #endif | |
1200 | VM_PAGE_FREE(m); | |
1201 | ||
1202 | vm_fault_cleanup(object, first_m); | |
1203 | thread_interrupt_level(interruptible_state); | |
1204 | ||
1205 | return VM_FAULT_RETRY; | |
1206 | } | |
1207 | if (m->vmp_absent) { | |
1208 | /* | |
1209 | * The page isn't busy, but is absent, | |
1210 | * therefore it's deemed "unavailable". | |
1211 | * | |
1212 | * Remove the non-existent page (unless it's | |
1213 | * in the top object) and move on down to the | |
1214 | * next object (if there is one). | |
1215 | */ | |
1216 | #if TRACEFAULTPAGE | |
1217 | dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */ | |
1218 | #endif | |
1219 | next_object = object->shadow; | |
1220 | ||
1221 | if (next_object == VM_OBJECT_NULL) { | |
1222 | /* | |
1223 | * Absent page at bottom of shadow | |
1224 | * chain; zero fill the page we left | |
1225 | * busy in the first object, and free | |
1226 | * the absent page. | |
1227 | */ | |
1228 | assert(!must_be_resident); | |
1229 | ||
1230 | /* | |
1231 | * check for any conditions that prevent | |
1232 | * us from creating a new zero-fill page | |
1233 | * vm_fault_check will do all of the | |
1234 | * fault cleanup in the case of an error condition | |
1235 | * including resetting the thread_interrupt_level | |
1236 | */ | |
1237 | error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); | |
1238 | ||
1239 | if (error != VM_FAULT_SUCCESS) { | |
1240 | return error; | |
1241 | } | |
1242 | ||
1243 | if (object != first_object) { | |
1244 | /* | |
1245 | * free the absent page we just found | |
1246 | */ | |
1247 | VM_PAGE_FREE(m); | |
1248 | ||
1249 | /* | |
1250 | * drop reference and lock on current object | |
1251 | */ | |
1252 | vm_object_paging_end(object); | |
1253 | vm_object_unlock(object); | |
1254 | ||
1255 | /* | |
1256 | * grab the original page we | |
1257 | * 'soldered' in place and | |
1258 | * retake lock on 'first_object' | |
1259 | */ | |
1260 | m = first_m; | |
1261 | first_m = VM_PAGE_NULL; | |
1262 | ||
1263 | object = first_object; | |
1264 | offset = first_offset; | |
1265 | ||
1266 | vm_object_lock(object); | |
1267 | } else { | |
1268 | /* | |
1269 | * we're going to use the absent page we just found | |
1270 | * so convert it to a 'busy' page | |
1271 | */ | |
1272 | m->vmp_absent = FALSE; | |
1273 | m->vmp_busy = TRUE; | |
1274 | } | |
1275 | if (fault_info->mark_zf_absent && no_zero_fill == TRUE) { | |
1276 | m->vmp_absent = TRUE; | |
1277 | } | |
1278 | /* | |
1279 | * zero-fill the page and put it on | |
1280 | * the correct paging queue | |
1281 | */ | |
1282 | my_fault = vm_fault_zero_page(m, no_zero_fill); | |
1283 | ||
1284 | break; | |
1285 | } else { | |
1286 | if (must_be_resident) { | |
1287 | vm_object_paging_end(object); | |
1288 | } else if (object != first_object) { | |
1289 | vm_object_paging_end(object); | |
1290 | VM_PAGE_FREE(m); | |
1291 | } else { | |
1292 | first_m = m; | |
1293 | m->vmp_absent = FALSE; | |
1294 | m->vmp_busy = TRUE; | |
1295 | ||
1296 | vm_page_lockspin_queues(); | |
1297 | vm_page_queues_remove(m, FALSE); | |
1298 | vm_page_unlock_queues(); | |
1299 | } | |
1300 | ||
1301 | offset += object->vo_shadow_offset; | |
1302 | fault_info->lo_offset += object->vo_shadow_offset; | |
1303 | fault_info->hi_offset += object->vo_shadow_offset; | |
1304 | access_required = VM_PROT_READ; | |
1305 | ||
1306 | vm_object_lock(next_object); | |
1307 | vm_object_unlock(object); | |
1308 | object = next_object; | |
1309 | vm_object_paging_begin(object); | |
1310 | ||
1311 | /* | |
1312 | * reset to default type of fault | |
1313 | */ | |
1314 | my_fault = DBG_CACHE_HIT_FAULT; | |
1315 | ||
1316 | continue; | |
1317 | } | |
1318 | } | |
1319 | if ((m->vmp_cleaning) | |
1320 | && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) | |
1321 | && (fault_type & VM_PROT_WRITE)) { | |
1322 | /* | |
1323 | * This is a copy-on-write fault that will | |
1324 | * cause us to revoke access to this page, but | |
1325 | * this page is in the process of being cleaned | |
1326 | * in a clustered pageout. We must wait until | |
1327 | * the cleaning operation completes before | |
1328 | * revoking access to the original page, | |
1329 | * otherwise we might attempt to remove a | |
1330 | * wired mapping. | |
1331 | */ | |
1332 | #if TRACEFAULTPAGE | |
1333 | dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */ | |
1334 | #endif | |
1335 | /* | |
1336 | * take an extra ref so that object won't die | |
1337 | */ | |
1338 | vm_object_reference_locked(object); | |
1339 | ||
1340 | vm_fault_cleanup(object, first_m); | |
1341 | ||
1342 | vm_object_lock(object); | |
1343 | assert(object->ref_count > 0); | |
1344 | ||
1345 | m = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
1346 | ||
1347 | if (m != VM_PAGE_NULL && m->vmp_cleaning) { | |
1348 | PAGE_ASSERT_WAIT(m, interruptible); | |
1349 | ||
1350 | vm_object_unlock(object); | |
1351 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
1352 | vm_object_deallocate(object); | |
1353 | ||
1354 | goto backoff; | |
1355 | } else { | |
1356 | vm_object_unlock(object); | |
1357 | ||
1358 | vm_object_deallocate(object); | |
1359 | thread_interrupt_level(interruptible_state); | |
1360 | ||
1361 | return VM_FAULT_RETRY; | |
1362 | } | |
1363 | } | |
1364 | if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) && | |
1365 | !(fault_info != NULL && fault_info->stealth)) { | |
/*
 * If we were passed a non-NULL pointer for
 * "type_of_fault", then we came from
 * vm_fault... we'll let it deal with
 * this condition, since it
 * needs to see m->vmp_speculative to correctly
 * account the pageins, otherwise...
 * take it off the speculative queue, we'll
 * let the caller of vm_fault_page deal
 * with getting it onto the correct queue
 *
 * If the caller specified in fault_info that
 * it wants a "stealth" fault, we also leave
 * the page in the speculative queue.
 */
1381 | vm_page_lockspin_queues(); | |
1382 | if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) { | |
1383 | vm_page_queues_remove(m, FALSE); | |
1384 | } | |
1385 | vm_page_unlock_queues(); | |
1386 | } | |
1387 | assert(object == VM_PAGE_OBJECT(m)); | |
1388 | ||
1389 | if (object->code_signed) { | |
1390 | /* | |
1391 | * CODE SIGNING: | |
1392 | * We just paged in a page from a signed | |
1393 | * memory object but we don't need to | |
 * validate it now.  We'll validate it if/when
 * it gets mapped into a user address
1396 | * space for the first time or when the page | |
1397 | * gets copied to another object as a result | |
1398 | * of a copy-on-write. | |
1399 | */ | |
1400 | } | |
1401 | ||
/*
 * We mark the page busy and leave it on
 * the pageout queues.  If the pageout
 * daemon comes across it, then it will
 * remove the page from the queue, but not the object
 */
1408 | #if TRACEFAULTPAGE | |
1409 | dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1410 | #endif | |
1411 | assert(!m->vmp_busy); | |
1412 | assert(!m->vmp_absent); | |
1413 | ||
1414 | m->vmp_busy = TRUE; | |
1415 | break; | |
1416 | } | |
1417 | ||
1418 | ||
1419 | /* | |
1420 | * we get here when there is no page present in the object at | |
1421 | * the offset we're interested in... we'll allocate a page | |
1422 | * at this point if the pager associated with | |
1423 | * this object can provide the data or we're the top object... | |
1424 | * object is locked; m == NULL | |
1425 | */ | |
1426 | ||
1427 | if (must_be_resident) { | |
1428 | if (fault_type == VM_PROT_NONE && | |
1429 | object == kernel_object) { | |
1430 | /* | |
1431 | * We've been called from vm_fault_unwire() | |
1432 | * while removing a map entry that was allocated | |
1433 | * with KMA_KOBJECT and KMA_VAONLY. This page | |
1434 | * is not present and there's nothing more to | |
1435 | * do here (nothing to unwire). | |
1436 | */ | |
1437 | vm_fault_cleanup(object, first_m); | |
1438 | thread_interrupt_level(interruptible_state); | |
1439 | ||
1440 | return VM_FAULT_MEMORY_ERROR; | |
1441 | } | |
1442 | ||
1443 | goto dont_look_for_page; | |
1444 | } | |
1445 | ||
1446 | /* Don't expect to fault pages into the kernel object. */ | |
1447 | assert(object != kernel_object); | |
1448 | ||
1449 | data_supply = FALSE; | |
1450 | ||
1451 | look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply); | |
1452 | ||
1453 | #if TRACEFAULTPAGE | |
1454 | dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ | |
1455 | #endif | |
1456 | if (!look_for_page && object == first_object && !object->phys_contiguous) { | |
1457 | /* | |
1458 | * Allocate a new page for this object/offset pair as a placeholder | |
1459 | */ | |
1460 | m = vm_page_grab_options(grab_options); | |
1461 | #if TRACEFAULTPAGE | |
1462 | dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ | |
1463 | #endif | |
1464 | if (m == VM_PAGE_NULL) { | |
1465 | vm_fault_cleanup(object, first_m); | |
1466 | thread_interrupt_level(interruptible_state); | |
1467 | ||
1468 | return VM_FAULT_MEMORY_SHORTAGE; | |
1469 | } | |
1470 | ||
1471 | if (fault_info && fault_info->batch_pmap_op == TRUE) { | |
1472 | vm_page_insert_internal(m, object, | |
1473 | vm_object_trunc_page(offset), | |
1474 | VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); | |
1475 | } else { | |
1476 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
1477 | } | |
1478 | } | |
1479 | if (look_for_page) { | |
1480 | kern_return_t rc; | |
1481 | int my_fault_type; | |
1482 | ||
1483 | /* | |
1484 | * If the memory manager is not ready, we | |
1485 | * cannot make requests. | |
1486 | */ | |
1487 | if (!object->pager_ready) { | |
1488 | #if TRACEFAULTPAGE | |
1489 | dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1490 | #endif | |
1491 | if (m != VM_PAGE_NULL) { | |
1492 | VM_PAGE_FREE(m); | |
1493 | } | |
1494 | ||
1495 | /* | |
1496 | * take an extra ref so object won't die | |
1497 | */ | |
1498 | vm_object_reference_locked(object); | |
1499 | vm_fault_cleanup(object, first_m); | |
1500 | ||
1501 | vm_object_lock(object); | |
1502 | assert(object->ref_count > 0); | |
1503 | ||
1504 | if (!object->pager_ready) { | |
1505 | wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); | |
1506 | ||
1507 | vm_object_unlock(object); | |
1508 | if (wait_result == THREAD_WAITING) { | |
1509 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
1510 | } | |
1511 | vm_object_deallocate(object); | |
1512 | ||
1513 | goto backoff; | |
1514 | } else { | |
1515 | vm_object_unlock(object); | |
1516 | vm_object_deallocate(object); | |
1517 | thread_interrupt_level(interruptible_state); | |
1518 | ||
1519 | return VM_FAULT_RETRY; | |
1520 | } | |
1521 | } | |
1522 | if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) { | |
1523 | /* | |
1524 | * If there are too many outstanding page | |
1525 | * requests pending on this external object, we | |
1526 | * wait for them to be resolved now. | |
1527 | */ | |
1528 | #if TRACEFAULTPAGE | |
1529 | dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1530 | #endif | |
1531 | if (m != VM_PAGE_NULL) { | |
1532 | VM_PAGE_FREE(m); | |
1533 | } | |
1534 | /* | |
1535 | * take an extra ref so object won't die | |
1536 | */ | |
1537 | vm_object_reference_locked(object); | |
1538 | ||
1539 | vm_fault_cleanup(object, first_m); | |
1540 | ||
1541 | vm_object_lock(object); | |
1542 | assert(object->ref_count > 0); | |
1543 | ||
1544 | if (object->paging_in_progress >= vm_object_pagein_throttle) { | |
1545 | vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible); | |
1546 | ||
1547 | vm_object_unlock(object); | |
1548 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
1549 | vm_object_deallocate(object); | |
1550 | ||
1551 | goto backoff; | |
1552 | } else { | |
1553 | vm_object_unlock(object); | |
1554 | vm_object_deallocate(object); | |
1555 | thread_interrupt_level(interruptible_state); | |
1556 | ||
1557 | return VM_FAULT_RETRY; | |
1558 | } | |
1559 | } | |
1560 | if (object->internal) { | |
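| /* | |
| * Internal (anonymous) object: the data, if any, lives in the | |
| * compressor. Grab a placeholder page to decompress into and | |
| * fetch the data synchronously via the compressor pager below, | |
| * instead of issuing a memory_object_data_request. | |
| */ | |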
1561 | int compressed_count_delta; | |
1562 | ||
1563 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); | |
1564 | ||
1565 | if (m == VM_PAGE_NULL) { | |
1566 | /* | |
1567 | * Allocate a new page for this object/offset pair as a placeholder | |
1568 | */ | |
1569 | m = vm_page_grab_options(grab_options); | |
1570 | #if TRACEFAULTPAGE | |
1571 | dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ | |
1572 | #endif | |
1573 | if (m == VM_PAGE_NULL) { | |
1574 | vm_fault_cleanup(object, first_m); | |
1575 | thread_interrupt_level(interruptible_state); | |
1576 | ||
1577 | return VM_FAULT_MEMORY_SHORTAGE; | |
1578 | } | |
1579 | ||
1580 | m->vmp_absent = TRUE; | |
1581 | if (fault_info && fault_info->batch_pmap_op == TRUE) { | |
1582 | vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); | |
1583 | } else { | |
1584 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
1585 | } | |
1586 | } | |
1587 | assert(m->vmp_busy); | |
1588 | ||
1589 | m->vmp_absent = TRUE; | |
1590 | pager = object->pager; | |
1591 | ||
1592 | assert(object->paging_in_progress > 0); | |
1593 | vm_object_unlock(object); | |
1594 | ||
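| /* | |
| * Ask the compressor to decompress directly into the physical | |
| * page we just inserted. The object lock is dropped across the | |
| * call; compressed_count_delta reports how the pager's count of | |
| * compressed pages changed, so that accounting can be updated | |
| * once the object lock is re-taken below. | |
| */ | |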
1595 | rc = vm_compressor_pager_get( | |
1596 | pager, | |
1597 | offset + object->paging_offset, | |
1598 | VM_PAGE_GET_PHYS_PAGE(m), | |
1599 | &my_fault_type, | |
1600 | 0, | |
1601 | &compressed_count_delta); | |
1602 | ||
1603 | if (type_of_fault == NULL) { | |
1604 | int throttle_delay; | |
1605 | ||
1606 | /* | |
1607 | * we weren't called from vm_fault, so we | |
1608 | * need to apply page creation throttling; | |
1609 | * do it before we re-acquire any locks | |
1610 | */ | |
1611 | if (my_fault_type == DBG_COMPRESSOR_FAULT) { | |
1612 | if ((throttle_delay = vm_page_throttled(TRUE))) { | |
1613 | VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0); | |
1614 | delay(throttle_delay); | |
1615 | } | |
1616 | } | |
1617 | } | |
1618 | vm_object_lock(object); | |
1619 | assert(object->paging_in_progress > 0); | |
1620 | ||
1621 | vm_compressor_pager_count( | |
1622 | pager, | |
1623 | compressed_count_delta, | |
1624 | FALSE, /* shared_lock */ | |
1625 | object); | |
1626 | ||
1627 | switch (rc) { | |
1628 | case KERN_SUCCESS: | |
1629 | m->vmp_absent = FALSE; | |
1630 | m->vmp_dirty = TRUE; | |
1631 | if ((object->wimg_bits & | |
1632 | VM_WIMG_MASK) != | |
1633 | VM_WIMG_USE_DEFAULT) { | |
1634 | /* | |
1635 | * If the page is not cacheable, | |
1636 | * we can't let its contents | |
1637 | * linger in the data cache | |
1638 | * after the decompression. | |
1639 | */ | |
1640 | pmap_sync_page_attributes_phys( | |
1641 | VM_PAGE_GET_PHYS_PAGE(m)); | |
1642 | } else { | |
1643 | m->vmp_written_by_kernel = TRUE; | |
1644 | } | |
1645 | ||
1646 | /* | |
1647 | * If the object is purgeable, its | |
1648 | * owner's purgeable ledgers have been | |
1649 | * updated in vm_page_insert() but the | |
1650 | * page was also accounted for in a | |
1651 | * "compressed purgeable" ledger, so | |
1652 | * update that now. | |
1653 | */ | |
1654 | if (((object->purgable != | |
1655 | VM_PURGABLE_DENY) || | |
1656 | object->vo_ledger_tag) && | |
1657 | (object->vo_owner != | |
1658 | NULL)) { | |
1659 | /* | |
1660 | * One less compressed | |
1661 | * purgeable/tagged page. | |
1662 | */ | |
1663 | vm_object_owner_compressed_update( | |
1664 | object, | |
1665 | -1); | |
1666 | } | |
1667 | ||
1668 | break; | |
1669 | case KERN_MEMORY_FAILURE: | |
1670 | m->vmp_unusual = TRUE; | |
1671 | m->vmp_error = TRUE; | |
1672 | m->vmp_absent = FALSE; | |
1673 | break; | |
1674 | case KERN_MEMORY_ERROR: | |
1675 | assert(m->vmp_absent); | |
1676 | break; | |
1677 | default: | |
1678 | panic("vm_fault_page(): unexpected " | |
1679 | "error %d from " | |
1680 | "vm_compressor_pager_get()\n", | |
1681 | rc); | |
1682 | } | |
1683 | PAGE_WAKEUP_DONE(m); | |
1684 | ||
1685 | rc = KERN_SUCCESS; | |
1686 | goto data_requested; | |
1687 | } | |
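| /* | |
| * External (pager-backed) object: any placeholder page is freed | |
| * here because the pager will supply the data itself via the | |
| * memory_object_data_request issued further down. | |
| */ | |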
1688 | my_fault_type = DBG_PAGEIN_FAULT; | |
1689 | ||
1690 | if (m != VM_PAGE_NULL) { | |
1691 | VM_PAGE_FREE(m); | |
1692 | m = VM_PAGE_NULL; | |
1693 | } | |
1694 | ||
1695 | #if TRACEFAULTPAGE | |
1696 | dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ | |
1697 | #endif | |
1698 | ||
1699 | /* | |
1700 | * It's possible someone called vm_object_destroy while we weren't | |
1701 | * holding the object lock. If that has happened, then bail out | |
1702 | * here. | |
1703 | */ | |
1704 | ||
1705 | pager = object->pager; | |
1706 | ||
1707 | if (pager == MEMORY_OBJECT_NULL) { | |
1708 | vm_fault_cleanup(object, first_m); | |
1709 | thread_interrupt_level(interruptible_state); | |
1710 | return VM_FAULT_MEMORY_ERROR; | |
1711 | } | |
1712 | ||
1713 | /* | |
1714 | * We have an absent page in place for the faulting offset, | |
1715 | * so we can release the object lock. | |
1716 | */ | |
1717 | ||
1718 | if (object->object_is_shared_cache) { | |
1719 | set_thread_rwlock_boost(); | |
1720 | } | |
1721 | ||
1722 | vm_object_unlock(object); | |
1723 | ||
1724 | /* | |
1725 | * If this object uses a copy_call strategy, | |
1726 | * and we are interested in a copy of this object | |
1727 | * (having gotten here only by following a | |
1728 | * shadow chain), then tell the memory manager | |
1729 | * via a flag added to the desired_access | |
1730 | * parameter, so that it can detect a race | |
1731 | * between our walking down the shadow chain | |
1732 | * and its pushing pages up into a copy of | |
1733 | * the object that it manages. | |
1734 | */ | |
1735 | if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) { | |
1736 | wants_copy_flag = VM_PROT_WANTS_COPY; | |
1737 | } else { | |
1738 | wants_copy_flag = VM_PROT_NONE; | |
1739 | } | |
1740 | ||
1741 | if (object->copy == first_object) { | |
1742 | /* | |
1743 | * if we issue the memory_object_data_request in | |
1744 | * this state, we are subject to a deadlock with | |
1745 | * the underlying filesystem if it is trying to | |
1746 | * shrink the file resulting in a push of pages | |
1747 | * into the copy object... that push will stall | |
1748 | * on the placeholder page, and if the pushing thread | |
1749 | * is holding a lock that is required on the pagein | |
1750 | * path (such as a truncate lock), we'll deadlock... | |
1751 | * to avoid this potential deadlock, we throw away | |
1752 | * our placeholder page before calling memory_object_data_request | |
1753 | * and force this thread to retry the vm_fault_page after | |
1754 | * we have issued the I/O. the second time through this path | |
1755 | * we will find the page already in the cache (presumably still | |
1756 | * busy waiting for the I/O to complete) and then complete | |
1757 | * the fault w/o having to go through memory_object_data_request again | |
1758 | */ | |
1759 | assert(first_m != VM_PAGE_NULL); | |
1760 | assert(VM_PAGE_OBJECT(first_m) == first_object); | |
1761 | ||
1762 | vm_object_lock(first_object); | |
1763 | VM_PAGE_FREE(first_m); | |
1764 | vm_object_paging_end(first_object); | |
1765 | vm_object_unlock(first_object); | |
1766 | ||
1767 | first_m = VM_PAGE_NULL; | |
1768 | force_fault_retry = TRUE; | |
1769 | ||
1770 | vm_fault_page_forced_retry++; | |
1771 | } | |
1772 | ||
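| /* | |
| * If an earlier pass through this fault already issued a data | |
| * request, narrow any further requests to a single page with | |
| * random behavior, and restore the caller's settings afterwards. | |
| */ | |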
1773 | if (data_already_requested == TRUE) { | |
1774 | orig_behavior = fault_info->behavior; | |
1775 | orig_cluster_size = fault_info->cluster_size; | |
1776 | ||
1777 | fault_info->behavior = VM_BEHAVIOR_RANDOM; | |
1778 | fault_info->cluster_size = PAGE_SIZE; | |
1779 | } | |
1780 | /* | |
1781 | * Call the memory manager to retrieve the data. | |
1782 | */ | |
1783 | rc = memory_object_data_request( | |
1784 | pager, | |
1785 | vm_object_trunc_page(offset) + object->paging_offset, | |
1786 | PAGE_SIZE, | |
1787 | access_required | wants_copy_flag, | |
1788 | (memory_object_fault_info_t)fault_info); | |
1789 | ||
1790 | if (data_already_requested == TRUE) { | |
1791 | fault_info->behavior = orig_behavior; | |
1792 | fault_info->cluster_size = orig_cluster_size; | |
1793 | } else { | |
1794 | data_already_requested = TRUE; | |
1795 | } | |
1796 | ||
1797 | DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL); | |
1798 | #if TRACEFAULTPAGE | |
1799 | dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ | |
1800 | #endif | |
1801 | vm_object_lock(object); | |
1802 | ||
1803 | if (object->object_is_shared_cache) { | |
1804 | clear_thread_rwlock_boost(); | |
1805 | } | |
1806 | ||
1807 | data_requested: | |
1808 | if (rc != KERN_SUCCESS) { | |
1809 | vm_fault_cleanup(object, first_m); | |
1810 | thread_interrupt_level(interruptible_state); | |
1811 | ||
1812 | return (rc == MACH_SEND_INTERRUPTED) ? | |
1813 | VM_FAULT_INTERRUPTED : | |
1814 | VM_FAULT_MEMORY_ERROR; | |
1815 | } else { | |
1816 | clock_sec_t tv_sec; | |
1817 | clock_usec_t tv_usec; | |
1818 | ||
1819 | if (my_fault_type == DBG_PAGEIN_FAULT) { | |
1820 | clock_get_system_microtime(&tv_sec, &tv_usec); | |
1821 | current_thread()->t_page_creation_time = tv_sec; | |
1822 | current_thread()->t_page_creation_count = 0; | |
1823 | } | |
1824 | } | |
1825 | if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) { | |
1826 | vm_fault_cleanup(object, first_m); | |
1827 | thread_interrupt_level(interruptible_state); | |
1828 | ||
1829 | return VM_FAULT_INTERRUPTED; | |
1830 | } | |
1831 | if (force_fault_retry == TRUE) { | |
1832 | vm_fault_cleanup(object, first_m); | |
1833 | thread_interrupt_level(interruptible_state); | |
1834 | ||
1835 | return VM_FAULT_RETRY; | |
1836 | } | |
1837 | if (m == VM_PAGE_NULL && object->phys_contiguous) { | |
1838 | /* | |
1839 | * No page here means that the object we | |
1840 | * initially looked up was "physically | |
1841 | * contiguous" (i.e. device memory). However, | |
1842 | * with Virtual VRAM, the object might not | |
1843 | * be backed by that device memory anymore, | |
1844 | * so we're done here only if the object is | |
1845 | * still "phys_contiguous". | |
1846 | * Otherwise, if the object is no longer | |
1847 | * "phys_contiguous", we need to retry the | |
1848 | * page fault against the object's new backing | |
1849 | * store (different memory object). | |
1850 | */ | |
1851 | phys_contig_object: | |
1852 | goto done; | |
1853 | } | |
1854 | /* | |
1855 | * potentially a pagein fault; | |
1856 | * if we make it through the state checks | |
1857 | * above, then we'll count it as such | |
1858 | */ | |
1859 | my_fault = my_fault_type; | |
1860 | ||
1861 | /* | |
1862 | * Retry with same object/offset, since new data may | |
1863 | * be in a different page (i.e., m is meaningless at | |
1864 | * this point). | |
1865 | */ | |
1866 | continue; | |
1867 | } | |
1868 | dont_look_for_page: | |
1869 | /* | |
1870 | * We get here if the object has no pager, or an existence map | |
1871 | * exists and indicates the page isn't present on the pager | |
1872 | * or we're unwiring a page. If a pager exists, but there | |
1873 | * is no existence map, then the m->vmp_absent case above handles | |
1874 | * the ZF case when the pager can't provide the page | |
1875 | */ | |
1876 | #if TRACEFAULTPAGE | |
1877 | dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ | |
1878 | #endif | |
1879 | if (object == first_object) { | |
1880 | first_m = m; | |
1881 | } else { | |
1882 | assert(m == VM_PAGE_NULL); | |
1883 | } | |
1884 | ||
1885 | next_object = object->shadow; | |
1886 | ||
1887 | if (next_object == VM_OBJECT_NULL) { | |
1888 | /* | |
1889 | * we've hit the bottom of the shadow chain; | |
1890 | * fill the page in the top object with zeros. | |
1891 | */ | |
1892 | assert(!must_be_resident); | |
1893 | ||
1894 | if (object != first_object) { | |
1895 | vm_object_paging_end(object); | |
1896 | vm_object_unlock(object); | |
1897 | ||
1898 | object = first_object; | |
1899 | offset = first_offset; | |
1900 | vm_object_lock(object); | |
1901 | } | |
1902 | m = first_m; | |
1903 | assert(VM_PAGE_OBJECT(m) == object); | |
1904 | first_m = VM_PAGE_NULL; | |
1905 | ||
1906 | /* | |
1907 | * check for any conditions that prevent | |
1908 | * us from creating a new zero-fill page; | |
1909 | * vm_fault_check will do all of the | |
1910 | * fault cleanup in the case of an error condition, | |
1911 | * including resetting the thread_interrupt_level. | |
1912 | */ | |
1913 | error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); | |
1914 | ||
1915 | if (error != VM_FAULT_SUCCESS) { | |
1916 | return error; | |
1917 | } | |
1918 | ||
1919 | if (m == VM_PAGE_NULL) { | |
1920 | m = vm_page_grab_options(grab_options); | |
1921 | ||
1922 | if (m == VM_PAGE_NULL) { | |
1923 | vm_fault_cleanup(object, VM_PAGE_NULL); | |
1924 | thread_interrupt_level(interruptible_state); | |
1925 | ||
1926 | return VM_FAULT_MEMORY_SHORTAGE; | |
1927 | } | |
1928 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
1929 | } | |
1930 | if (fault_info->mark_zf_absent && no_zero_fill == TRUE) { | |
1931 | m->vmp_absent = TRUE; | |
1932 | } | |
1933 | ||
1934 | my_fault = vm_fault_zero_page(m, no_zero_fill); | |
1935 | ||
1936 | break; | |
1937 | } else { | |
1938 | /* | |
1939 | * Move on to the next object. Lock the next | |
1940 | * object before unlocking the current one. | |
1941 | */ | |
1942 | if ((object != first_object) || must_be_resident) { | |
1943 | vm_object_paging_end(object); | |
1944 | } | |
1945 | ||
1946 | offset += object->vo_shadow_offset; | |
1947 | fault_info->lo_offset += object->vo_shadow_offset; | |
1948 | fault_info->hi_offset += object->vo_shadow_offset; | |
1949 | access_required = VM_PROT_READ; | |
1950 | ||
1951 | vm_object_lock(next_object); | |
1952 | vm_object_unlock(object); | |
1953 | ||
1954 | object = next_object; | |
1955 | vm_object_paging_begin(object); | |
1956 | } | |
1957 | } | |
1958 | ||
1959 | /* | |
1960 | * PAGE HAS BEEN FOUND. | |
1961 | * | |
1962 | * This page (m) is: | |
1963 | * busy, so that we can play with it; | |
1964 | * not absent, so that nobody else will fill it; | |
1965 | * possibly eligible for pageout; | |
1966 | * | |
1967 | * The top-level page (first_m) is: | |
1968 | * VM_PAGE_NULL if the page was found in the | |
1969 | * top-level object; | |
1970 | * busy, not absent, and ineligible for pageout. | |
1971 | * | |
1972 | * The current object (object) is locked. A paging | |
1973 | * reference is held for the current and top-level | |
1974 | * objects. | |
1975 | */ | |
1976 | ||
1977 | #if TRACEFAULTPAGE | |
1978 | dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ | |
1979 | #endif | |
1980 | #if EXTRA_ASSERTIONS | |
1981 | assert(m->vmp_busy && !m->vmp_absent); | |
1982 | assert((first_m == VM_PAGE_NULL) || | |
1983 | (first_m->vmp_busy && !first_m->vmp_absent && | |
1984 | !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded)); | |
1985 | #endif /* EXTRA_ASSERTIONS */ | |
1986 | ||
1987 | /* | |
1988 | * If the page is being written, but isn't | |
1989 | * already owned by the top-level object, | |
1990 | * we have to copy it into a new page owned | |
1991 | * by the top-level object. | |
1992 | */ | |
1993 | if (object != first_object) { | |
1994 | #if TRACEFAULTPAGE | |
1995 | dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ | |
1996 | #endif | |
1997 | if (fault_type & VM_PROT_WRITE) { | |
1998 | vm_page_t copy_m; | |
1999 | ||
2000 | /* | |
2001 | * We only really need to copy if we | |
2002 | * want to write it. | |
2003 | */ | |
2004 | assert(!must_be_resident); | |
2005 | ||
2006 | /* | |
2007 | * If we try to collapse first_object at this | |
2008 | * point, we may deadlock when we try to get | |
2009 | * the lock on an intermediate object (since we | |
2010 | * have the bottom object locked). We can't | |
2011 | * unlock the bottom object, because the page | |
2012 | * we found may move (by collapse) if we do. | |
2013 | * | |
2014 | * Instead, we first copy the page. Then, when | |
2015 | * we have no more use for the bottom object, | |
2016 | * we unlock it and try to collapse. | |
2017 | * | |
2018 | * Note that we copy the page even if we didn't | |
2019 | * need to... that's the breaks. | |
2020 | */ | |
2021 | ||
2022 | /* | |
2023 | * Allocate a page for the copy | |
2024 | */ | |
2025 | copy_m = vm_page_grab_options(grab_options); | |
2026 | ||
2027 | if (copy_m == VM_PAGE_NULL) { | |
2028 | RELEASE_PAGE(m); | |
2029 | ||
2030 | vm_fault_cleanup(object, first_m); | |
2031 | thread_interrupt_level(interruptible_state); | |
2032 | ||
2033 | return VM_FAULT_MEMORY_SHORTAGE; | |
2034 | } | |
2035 | ||
2036 | vm_page_copy(m, copy_m); | |
2037 | ||
2038 | /* | |
2039 | * If another map is truly sharing this | |
2040 | * page with us, we have to flush all | |
2041 | * uses of the original page, since we | |
2042 | * can't distinguish those which want the | |
2043 | * original from those which need the | |
2044 | * new copy. | |
2045 | * | |
2046 | * XXXO If we know that only one map has | |
2047 | * access to this page, then we could | |
2048 | * avoid the pmap_disconnect() call. | |
2049 | */ | |
2050 | if (m->vmp_pmapped) { | |
2051 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); | |
2052 | } | |
2053 | ||
2054 | if (m->vmp_clustered) { | |
2055 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2056 | VM_PAGE_CONSUME_CLUSTERED(m); | |
2057 | } | |
2058 | assert(!m->vmp_cleaning); | |
2059 | ||
2060 | /* | |
2061 | * We no longer need the old page or object. | |
2062 | */ | |
2063 | RELEASE_PAGE(m); | |
2064 | ||
2065 | /* | |
2066 | * This check helps with marking the object as having a sequential pattern. | |
2067 | * Normally we'd miss doing this below because this fault is a COW into | |
2068 | * the first_object, i.e. we bring the page in from disk and push it to the | |
2069 | * object above, but don't update the file object's sequential pattern. | |
2070 | */ | |
2071 | if (object->internal == FALSE) { | |
2072 | vm_fault_is_sequential(object, offset, fault_info->behavior); | |
2073 | } | |
2074 | ||
2075 | vm_object_paging_end(object); | |
2076 | vm_object_unlock(object); | |
2077 | ||
2078 | my_fault = DBG_COW_FAULT; | |
2079 | counter_inc(&vm_statistics_cow_faults); | |
2080 | DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); | |
2081 | current_task()->cow_faults++; | |
2082 | ||
2083 | object = first_object; | |
2084 | offset = first_offset; | |
2085 | ||
2086 | vm_object_lock(object); | |
2087 | /* | |
2088 | * get rid of the placeholder | |
2089 | * page that we soldered in earlier | |
2090 | */ | |
2091 | VM_PAGE_FREE(first_m); | |
2092 | first_m = VM_PAGE_NULL; | |
2093 | ||
2094 | /* | |
2095 | * and replace it with the | |
2096 | * page we just copied into | |
2097 | */ | |
2098 | assert(copy_m->vmp_busy); | |
2099 | vm_page_insert(copy_m, object, vm_object_trunc_page(offset)); | |
2100 | SET_PAGE_DIRTY(copy_m, TRUE); | |
2101 | ||
2102 | m = copy_m; | |
2103 | /* | |
2104 | * Now that we've gotten the copy out of the | |
2105 | * way, let's try to collapse the top object. | |
2106 | * But we have to play ugly games with | |
2107 | * paging_in_progress to do that... | |
2108 | */ | |
2109 | vm_object_paging_end(object); | |
2110 | vm_object_collapse(object, vm_object_trunc_page(offset), TRUE); | |
2111 | vm_object_paging_begin(object); | |
2112 | } else { | |
2113 | *protection &= (~VM_PROT_WRITE); | |
2114 | } | |
2115 | } | |
2116 | /* | |
2117 | * Now check whether the page needs to be pushed into the | |
2118 | * copy object. The use of asymmetric copy on write for | |
2119 | * shared temporary objects means that we may do two copies to | |
2120 | * satisfy the fault; one above to get the page from a | |
2121 | * shadowed object, and one here to push it into the copy. | |
2122 | */ | |
2123 | try_failed_count = 0; | |
2124 | ||
2125 | while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { | |
2126 | vm_object_offset_t copy_offset; | |
2127 | vm_page_t copy_m; | |
2128 | ||
2129 | #if TRACEFAULTPAGE | |
2130 | dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */ | |
2131 | #endif | |
2132 | /* | |
2133 | * If the page is being written, but hasn't been | |
2134 | * copied to the copy-object, we have to copy it there. | |
2135 | */ | |
2136 | if ((fault_type & VM_PROT_WRITE) == 0) { | |
2137 | *protection &= ~VM_PROT_WRITE; | |
2138 | break; | |
2139 | } | |
2140 | ||
2141 | /* | |
2142 | * If the page was guaranteed to be resident, | |
2143 | * we must have already performed the copy. | |
2144 | */ | |
2145 | if (must_be_resident) { | |
2146 | break; | |
2147 | } | |
2148 | ||
2149 | /* | |
2150 | * Try to get the lock on the copy_object. | |
2151 | */ | |
2152 | if (!vm_object_lock_try(copy_object)) { | |
2153 | vm_object_unlock(object); | |
2154 | try_failed_count++; | |
2155 | ||
2156 | mutex_pause(try_failed_count); /* wait a bit */ | |
2157 | vm_object_lock(object); | |
2158 | ||
2159 | continue; | |
2160 | } | |
2161 | try_failed_count = 0; | |
2162 | ||
2163 | /* | |
2164 | * Make another reference to the copy-object, | |
2165 | * to keep it from disappearing during the | |
2166 | * copy. | |
2167 | */ | |
2168 | vm_object_reference_locked(copy_object); | |
2169 | ||
2170 | /* | |
2171 | * Does the page exist in the copy? | |
2172 | */ | |
2173 | copy_offset = first_offset - copy_object->vo_shadow_offset; | |
2174 | copy_offset = vm_object_trunc_page(copy_offset); | |
2175 | ||
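| /* | |
| * Three cases: the copy object doesn't cover this offset at all, | |
| * the page already exists in the copy object (possibly busy), or | |
| * the page may need to be pushed into the copy object below. | |
| */ | |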
2176 | if (copy_object->vo_size <= copy_offset) { | |
2177 | /* | |
2178 | * Copy object doesn't cover this page -- do nothing. | |
2179 | */ | |
2180 | ; | |
2181 | } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { | |
2182 | /* | |
2183 | * Page currently exists in the copy object | |
2184 | */ | |
2185 | if (copy_m->vmp_busy) { | |
2186 | /* | |
2187 | * If the page is being brought | |
2188 | * in, wait for it and then retry. | |
2189 | */ | |
2190 | RELEASE_PAGE(m); | |
2191 | ||
2192 | /* | |
2193 | * take an extra ref so object won't die | |
2194 | */ | |
2195 | vm_object_reference_locked(copy_object); | |
2196 | vm_object_unlock(copy_object); | |
2197 | vm_fault_cleanup(object, first_m); | |
2198 | ||
2199 | vm_object_lock(copy_object); | |
2200 | assert(copy_object->ref_count > 0); | |
2201 | vm_object_lock_assert_exclusive(copy_object); | |
2202 | copy_object->ref_count--; | |
2203 | assert(copy_object->ref_count > 0); | |
2204 | copy_m = vm_page_lookup(copy_object, copy_offset); | |
2205 | ||
2206 | if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) { | |
2207 | PAGE_ASSERT_WAIT(copy_m, interruptible); | |
2208 | ||
2209 | vm_object_unlock(copy_object); | |
2210 | wait_result = thread_block(THREAD_CONTINUE_NULL); | |
2211 | vm_object_deallocate(copy_object); | |
2212 | ||
2213 | goto backoff; | |
2214 | } else { | |
2215 | vm_object_unlock(copy_object); | |
2216 | vm_object_deallocate(copy_object); | |
2217 | thread_interrupt_level(interruptible_state); | |
2218 | ||
2219 | return VM_FAULT_RETRY; | |
2220 | } | |
2221 | } | |
2222 | } else if (!PAGED_OUT(copy_object, copy_offset)) { | |
2223 | /* | |
2224 | * If PAGED_OUT is TRUE, then the page used to exist | |
2225 | * in the copy-object, and has already been paged out. | |
2226 | * We don't need to repeat this. If PAGED_OUT is | |
2227 | * FALSE, then either we don't know (!pager_created, | |
2228 | * for example) or it hasn't been paged out. | |
2229 | * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT) | |
2230 | * We must copy the page to the copy object. | |
2231 | * | |
2232 | * Allocate a page for the copy | |
2233 | */ | |
2234 | copy_m = vm_page_alloc(copy_object, copy_offset); | |
2235 | ||
2236 | if (copy_m == VM_PAGE_NULL) { | |
2237 | RELEASE_PAGE(m); | |
2238 | ||
2239 | vm_object_lock_assert_exclusive(copy_object); | |
2240 | copy_object->ref_count--; | |
2241 | assert(copy_object->ref_count > 0); | |
2242 | ||
2243 | vm_object_unlock(copy_object); | |
2244 | vm_fault_cleanup(object, first_m); | |
2245 | thread_interrupt_level(interruptible_state); | |
2246 | ||
2247 | return VM_FAULT_MEMORY_SHORTAGE; | |
2248 | } | |
2249 | /* | |
2250 | * Must copy page into copy-object. | |
2251 | */ | |
2252 | vm_page_copy(m, copy_m); | |
2253 | ||
2254 | /* | |
2255 | * If the old page was in use by any users | |
2256 | * of the copy-object, it must be removed | |
2257 | * from all pmaps. (We can't know which | |
2258 | * pmaps use it.) | |
2259 | */ | |
2260 | if (m->vmp_pmapped) { | |
2261 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); | |
2262 | } | |
2263 | ||
2264 | if (m->vmp_clustered) { | |
2265 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2266 | VM_PAGE_CONSUME_CLUSTERED(m); | |
2267 | } | |
2268 | /* | |
2269 | * If there's a pager, then immediately | |
2270 | * page out this page, using the "initialize" | |
2271 | * option. Else, we use the copy. | |
2272 | */ | |
2273 | if ((!copy_object->pager_ready) | |
2274 | || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT | |
2275 | ) { | |
2276 | vm_page_lockspin_queues(); | |
2277 | assert(!m->vmp_cleaning); | |
2278 | vm_page_activate(copy_m); | |
2279 | vm_page_unlock_queues(); | |
2280 | ||
2281 | SET_PAGE_DIRTY(copy_m, TRUE); | |
2282 | PAGE_WAKEUP_DONE(copy_m); | |
2283 | } else { | |
2284 | assert(copy_m->vmp_busy == TRUE); | |
2285 | assert(!m->vmp_cleaning); | |
2286 | ||
2287 | /* | |
2288 | * dirty is protected by the object lock | |
2289 | */ | |
2290 | SET_PAGE_DIRTY(copy_m, TRUE); | |
2291 | ||
2292 | /* | |
2293 | * The page is already ready for pageout: | |
2294 | * not on pageout queues and busy. | |
2295 | * Unlock everything except the | |
2296 | * copy_object itself. | |
2297 | */ | |
2298 | vm_object_unlock(object); | |
2299 | ||
2300 | /* | |
2301 | * Write the page to the copy-object, | |
2302 | * flushing it from the kernel. | |
2303 | */ | |
2304 | vm_pageout_initialize_page(copy_m); | |
2305 | ||
2306 | /* | |
2307 | * Since the pageout may have | |
2308 | * temporarily dropped the | |
2309 | * copy_object's lock, we | |
2310 | * check whether we'll have | |
2311 | * to deallocate the hard way. | |
2312 | */ | |
2313 | if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { | |
2314 | vm_object_unlock(copy_object); | |
2315 | vm_object_deallocate(copy_object); | |
2316 | vm_object_lock(object); | |
2317 | ||
2318 | continue; | |
2319 | } | |
2320 | /* | |
2321 | * Pick back up the old object's | |
2322 | * lock. [It is safe to do so, | |
2323 | * since it must be deeper in the | |
2324 | * object tree.] | |
2325 | */ | |
2326 | vm_object_lock(object); | |
2327 | } | |
2328 | ||
2329 | /* | |
2330 | * Because we're pushing a page upward | |
2331 | * in the object tree, we must restart | |
2332 | * any faults that are waiting here. | |
2333 | * [Note that this is an expansion of | |
2334 | * PAGE_WAKEUP that uses the THREAD_RESTART | |
2335 | * wait result]. Can't turn off the page's | |
2336 | * busy bit because we're not done with it. | |
2337 | */ | |
2338 | if (m->vmp_wanted) { | |
2339 | m->vmp_wanted = FALSE; | |
2340 | thread_wakeup_with_result((event_t) m, THREAD_RESTART); | |
2341 | } | |
2342 | } | |
2343 | /* | |
2344 | * The reference count on copy_object must be | |
2345 | * at least 2: one for our extra reference, | |
2346 | * and at least one from the outside world | |
2347 | * (we checked that when we last locked | |
2348 | * copy_object). | |
2349 | */ | |
2350 | vm_object_lock_assert_exclusive(copy_object); | |
2351 | copy_object->ref_count--; | |
2352 | assert(copy_object->ref_count > 0); | |
2353 | ||
2354 | vm_object_unlock(copy_object); | |
2355 | ||
2356 | break; | |
2357 | } | |
2358 | ||
2359 | done: | |
2360 | *result_page = m; | |
2361 | *top_page = first_m; | |
2362 | ||
2363 | if (m != VM_PAGE_NULL) { | |
2364 | assert(VM_PAGE_OBJECT(m) == object); | |
2365 | ||
2366 | retval = VM_FAULT_SUCCESS; | |
2367 | ||
2368 | if (my_fault == DBG_PAGEIN_FAULT) { | |
2369 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2370 | ||
2371 | if (object->internal) { | |
2372 | my_fault = DBG_PAGEIND_FAULT; | |
2373 | } else { | |
2374 | my_fault = DBG_PAGEINV_FAULT; | |
2375 | } | |
2376 | ||
2377 | /* | |
2378 | * evaluate access pattern and update state; | |
2379 | * vm_fault_deactivate_behind depends on the | |
2380 | * state being up to date | |
2381 | */ | |
2382 | vm_fault_is_sequential(object, offset, fault_info->behavior); | |
2383 | vm_fault_deactivate_behind(object, offset, fault_info->behavior); | |
2384 | } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) { | |
2385 | /* | |
2386 | * we weren't called from vm_fault, so handle the | |
2387 | * accounting here for hits in the cache | |
2388 | */ | |
2389 | if (m->vmp_clustered) { | |
2390 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
2391 | VM_PAGE_CONSUME_CLUSTERED(m); | |
2392 | } | |
2393 | vm_fault_is_sequential(object, offset, fault_info->behavior); | |
2394 | vm_fault_deactivate_behind(object, offset, fault_info->behavior); | |
2395 | } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) { | |
2396 | VM_STAT_DECOMPRESSIONS(); | |
2397 | } | |
2398 | if (type_of_fault) { | |
2399 | *type_of_fault = my_fault; | |
2400 | } | |
2401 | } else { | |
2402 | retval = VM_FAULT_SUCCESS_NO_VM_PAGE; | |
2403 | assert(first_m == VM_PAGE_NULL); | |
2404 | assert(object == first_object); | |
2405 | } | |
2406 | ||
2407 | thread_interrupt_level(interruptible_state); | |
2408 | ||
2409 | #if TRACEFAULTPAGE | |
2410 | dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ | |
2411 | #endif | |
2412 | return retval; | |
2413 | ||
2414 | backoff: | |
2415 | thread_interrupt_level(interruptible_state); | |
2416 | ||
2417 | if (wait_result == THREAD_INTERRUPTED) { | |
2418 | return VM_FAULT_INTERRUPTED; | |
2419 | } | |
2420 | return VM_FAULT_RETRY; | |
2421 | ||
2422 | #undef RELEASE_PAGE | |
2423 | } | |
2424 | ||
2425 | ||
2426 | extern int panic_on_cs_killed; | |
2427 | extern int proc_selfpid(void); | |
2428 | extern char *proc_name_address(void *p); | |
2429 | unsigned long cs_enter_tainted_rejected = 0; | |
2430 | unsigned long cs_enter_tainted_accepted = 0; | |
2431 | ||
2432 | /* | |
2433 | * CODE SIGNING: | |
2434 | * When soft faulting a page, we have to validate the page if: | |
2435 | * 1. the page is being mapped in user space | |
2436 | * 2. the page hasn't already been found to be "tainted" | |
2437 | * 3. the page belongs to a code-signed object | |
2438 | * 4. the page has not been validated yet or has been mapped for write. | |
2439 | */ | |
2440 | static bool | |
2441 | vm_fault_cs_need_validation( | |
2442 | pmap_t pmap, | |
2443 | vm_page_t page, | |
2444 | vm_object_t page_obj, | |
2445 | vm_map_size_t fault_page_size, | |
2446 | vm_map_offset_t fault_phys_offset) | |
2447 | { | |
2448 | if (pmap == kernel_pmap) { | |
2449 | /* 1 - not user space */ | |
2450 | return false; | |
2451 | } | |
2452 | if (!page_obj->code_signed) { | |
2453 | /* 3 - page does not belong to a code-signed object */ | |
2454 | return false; | |
2455 | } | |
2456 | if (fault_page_size == PAGE_SIZE) { | |
2457 | /* looking at the whole page */ | |
2458 | assertf(fault_phys_offset == 0, | |
2459 | "fault_page_size 0x%llx fault_phys_offset 0x%llx\n", | |
2460 | (uint64_t)fault_page_size, | |
2461 | (uint64_t)fault_phys_offset); | |
2462 | if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) { | |
2463 | /* 2 - page is all tainted */ | |
2464 | return false; | |
2465 | } | |
2466 | if (page->vmp_cs_validated == VMP_CS_ALL_TRUE && | |
2467 | !page->vmp_wpmapped) { | |
2468 | /* 4 - already fully validated and never mapped writable */ | |
2469 | return false; | |
2470 | } | |
2471 | } else { | |
2472 | /* looking at a specific sub-page */ | |
2473 | if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) { | |
2474 | /* 2 - sub-page was already marked as tainted */ | |
2475 | return false; | |
2476 | } | |
2477 | if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) && | |
2478 | !page->vmp_wpmapped) { | |
2479 | /* 4 - already validated and never mapped writable */ | |
2480 | return false; | |
2481 | } | |
2482 | } | |
2483 | /* page needs to be validated */ | |
2484 | return true; | |
2485 | } | |
2486 | ||
2487 | ||
2488 | static bool | |
2489 | vm_fault_cs_page_immutable( | |
2490 | vm_page_t m, | |
2491 | vm_map_size_t fault_page_size, | |
2492 | vm_map_offset_t fault_phys_offset, | |
2493 | vm_prot_t prot __unused) | |
2494 | { | |
2495 | if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) | |
2496 | /*&& ((prot) & VM_PROT_EXECUTE)*/) { | |
2497 | return true; | |
2498 | } | |
2499 | return false; | |
2500 | } | |
2501 | ||
2502 | static bool | |
2503 | vm_fault_cs_page_nx( | |
2504 | vm_page_t m, | |
2505 | vm_map_size_t fault_page_size, | |
2506 | vm_map_offset_t fault_phys_offset) | |
2507 | { | |
2508 | return VMP_CS_NX(m, fault_page_size, fault_phys_offset); | |
2509 | } | |
2510 | ||
2511 | /* | |
2512 | * Check if the page being entered into the pmap violates code signing. | |
2513 | */ | |
2514 | static kern_return_t | |
2515 | vm_fault_cs_check_violation( | |
2516 | bool cs_bypass, | |
2517 | vm_object_t object, | |
2518 | vm_page_t m, | |
2519 | pmap_t pmap, | |
2520 | vm_prot_t prot, | |
2521 | vm_prot_t caller_prot, | |
2522 | vm_map_size_t fault_page_size, | |
2523 | vm_map_offset_t fault_phys_offset, | |
2524 | vm_object_fault_info_t fault_info, | |
2525 | bool map_is_switched, | |
2526 | bool map_is_switch_protected, | |
2527 | bool *cs_violation) | |
2528 | { | |
2529 | #if !PMAP_CS | |
2530 | #pragma unused(caller_prot) | |
2531 | #pragma unused(fault_info) | |
2532 | #endif /* !PMAP_CS */ | |
2533 | int cs_enforcement_enabled; | |
2534 | if (!cs_bypass && | |
2535 | vm_fault_cs_need_validation(pmap, m, object, | |
2536 | fault_page_size, fault_phys_offset)) { | |
2537 | vm_object_lock_assert_exclusive(object); | |
2538 | ||
2539 | if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) { | |
2540 | vm_cs_revalidates++; | |
2541 | } | |
2542 | ||
2543 | /* VM map is locked, so 1 ref will remain on VM object - | |
2544 | * so no harm if vm_page_validate_cs drops the object lock */ | |
2545 | ||
2546 | vm_page_validate_cs(m, fault_page_size, fault_phys_offset); | |
2547 | } | |
2548 | ||
2549 | /* If the map is switched, and is switch-protected, we must protect | |
2550 | * some pages from being write-faulted: immutable pages because by | |
2551 | * definition they may not be written, and executable pages because that | |
2552 | * would provide a way to inject unsigned code. | |
2553 | * If the page is immutable, we can simply return. However, we can't | |
2554 | * immediately determine whether a page is executable anywhere. But, | |
2555 | * we can disconnect it everywhere and remove the executable protection | |
2556 | * from the current map. We do that below right before we do the | |
2557 | * PMAP_ENTER. | |
2558 | */ | |
2559 | if (pmap == kernel_pmap) { | |
2560 | /* kernel fault: cs_enforcement does not apply */ | |
2561 | cs_enforcement_enabled = 0; | |
2562 | } else { | |
2563 | cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap); | |
2564 | } | |
2565 | ||
2566 | if (cs_enforcement_enabled && map_is_switched && | |
2567 | map_is_switch_protected && | |
2568 | vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) && | |
2569 | (prot & VM_PROT_WRITE)) { | |
2570 | return KERN_CODESIGN_ERROR; | |
2571 | } | |
2572 | ||
2573 | if (cs_enforcement_enabled && | |
2574 | vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) && | |
2575 | (prot & VM_PROT_EXECUTE)) { | |
2576 | if (cs_debug) { | |
2577 | printf("page marked to be NX, not letting it be mapped EXEC\n"); | |
2578 | } | |
2579 | return KERN_CODESIGN_ERROR; | |
2580 | } | |
2581 | ||
2582 | /* A page could be tainted, or pose a risk of being tainted later. | |
2583 | * Check whether the receiving process wants it, and make it feel | |
2584 | * the consequences (that happens in cs_invalid_page()). | |
2585 | * For CS Enforcement, two other conditions will | |
2586 | * cause that page to be tainted as well: | |
2587 | * - pmapping an unsigned page executable - this means unsigned code; | |
2588 | * - writeable mapping of a validated page - the content of that page | |
2589 | * can be changed without the kernel noticing, therefore unsigned | |
2590 | * code can be created | |
2591 | */ | |
2592 | if (cs_bypass) { | |
2593 | /* code-signing is bypassed */ | |
2594 | *cs_violation = FALSE; | |
2595 | } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) { | |
2596 | /* tainted page */ | |
2597 | *cs_violation = TRUE; | |
2598 | } else if (!cs_enforcement_enabled) { | |
2599 | /* no further code-signing enforcement */ | |
2600 | *cs_violation = FALSE; | |
2601 | } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) && | |
2602 | ((prot & VM_PROT_WRITE) || | |
2603 | m->vmp_wpmapped)) { | |
2604 | /* | |
2605 | * The page should be immutable, but is in danger of being | |
2606 | * modified. | |
2607 | * This is the case where we want policy from the code | |
2608 | * directory - is the page immutable or not? For now we have | |
2609 | * to assume that code pages will be immutable, data pages not. | |
2610 | * We'll assume a page is a code page if it has a code directory | |
2611 | * and we fault for execution. | |
2612 | * That is good enough since if we faulted the code page for | |
2613 | * writing in another map before, it is wpmapped; if we fault | |
2614 | * it for writing in this map later it will also be faulted for | |
2615 | * executing at the same time; and if we fault for writing in | |
2616 | * another map later, we will disconnect it from this pmap so | |
2617 | * we'll notice the change. | |
2618 | */ | |
2619 | *cs_violation = TRUE; | |
2620 | } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) && | |
2621 | (prot & VM_PROT_EXECUTE) | |
2622 | ) { | |
2623 | *cs_violation = TRUE; | |
2624 | } else { | |
2625 | *cs_violation = FALSE; | |
2626 | } | |
2627 | return KERN_SUCCESS; | |
2628 | } | |
2629 | ||
2630 | /* | |
2631 | * Handles a code signing violation by either rejecting the page or forcing a disconnect. | |
2632 | * @param must_disconnect This value will be set to true if the caller must disconnect | |
2633 | * this page. | |
2634 | * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault. | |
2635 | */ | |
2636 | static kern_return_t | |
2637 | vm_fault_cs_handle_violation( | |
2638 | vm_object_t object, | |
2639 | vm_page_t m, | |
2640 | pmap_t pmap, | |
2641 | vm_prot_t prot, | |
2642 | vm_map_offset_t vaddr, | |
2643 | vm_map_size_t fault_page_size, | |
2644 | vm_map_offset_t fault_phys_offset, | |
2645 | bool map_is_switched, | |
2646 | bool map_is_switch_protected, | |
2647 | bool *must_disconnect) | |
2648 | { | |
2649 | #if !MACH_ASSERT | |
2650 | #pragma unused(pmap) | |
2651 | #pragma unused(map_is_switch_protected) | |
2652 | #endif /* !MACH_ASSERT */ | |
2653 | /* | |
2654 | * We will have a tainted page. Have to handle the special case | |
2655 | * of a switched map now. If the map is not switched, standard | |
2656 | * procedure applies - call cs_invalid_page(). | |
2657 | * If the map is switched, the real owner is invalid already. | |
2658 | * There is no point in invalidating the switching process since | |
2659 | * it will not be executing from the map. So we don't call | |
2660 | * cs_invalid_page() in that case. | |
2661 | */ | |
2662 | boolean_t reject_page, cs_killed; | |
2663 | kern_return_t kr; | |
2664 | if (map_is_switched) { | |
2665 | assert(pmap == vm_map_pmap(current_thread()->map)); | |
2666 | assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE)); | |
2667 | reject_page = FALSE; | |
2668 | } else { | |
2669 | if (cs_debug > 5) { | |
2670 | printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n", | |
2671 | object->code_signed ? "yes" : "no", | |
2672 | VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no", | |
2673 | VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no", | |
2674 | m->vmp_wpmapped ? "yes" : "no", | |
2675 | (int)prot); | |
2676 | } | |
2677 | reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed); | |
2678 | } | |
2679 | ||
2680 | if (reject_page) { | |
2681 | /* reject the invalid page: abort the page fault */ | |
2682 | int pid; | |
2683 | const char *procname; | |
2684 | task_t task; | |
2685 | vm_object_t file_object, shadow; | |
2686 | vm_object_offset_t file_offset; | |
2687 | char *pathname, *filename; | |
2688 | vm_size_t pathname_len, filename_len; | |
2689 | boolean_t truncated_path; | |
2690 | #define __PATH_MAX 1024 | |
2691 | struct timespec mtime, cs_mtime; | |
2692 | int shadow_depth; | |
2693 | os_reason_t codesigning_exit_reason = OS_REASON_NULL; | |
2694 | ||
2695 | kr = KERN_CODESIGN_ERROR; | |
2696 | cs_enter_tainted_rejected++; | |
2697 | ||
2698 | /* get process name and pid */ | |
2699 | procname = "?"; | |
2700 | task = current_task(); | |
2701 | pid = proc_selfpid(); | |
2702 | if (task->bsd_info != NULL) { | |
2703 | procname = proc_name_address(task->bsd_info); | |
2704 | } | |
2705 | ||
2706 | /* get file's VM object */ | |
2707 | file_object = object; | |
2708 | file_offset = m->vmp_offset; | |
2709 | for (shadow = file_object->shadow, | |
2710 | shadow_depth = 0; | |
2711 | shadow != VM_OBJECT_NULL; | |
2712 | shadow = file_object->shadow, | |
2713 | shadow_depth++) { | |
2714 | vm_object_lock_shared(shadow); | |
2715 | if (file_object != object) { | |
2716 | vm_object_unlock(file_object); | |
2717 | } | |
2718 | file_offset += file_object->vo_shadow_offset; | |
2719 | file_object = shadow; | |
2720 | } | |
2721 | ||
2722 | mtime.tv_sec = 0; | |
2723 | mtime.tv_nsec = 0; | |
2724 | cs_mtime.tv_sec = 0; | |
2725 | cs_mtime.tv_nsec = 0; | |
2726 | ||
2727 | /* get file's pathname and/or filename */ | |
2728 | pathname = NULL; | |
2729 | filename = NULL; | |
2730 | pathname_len = 0; | |
2731 | filename_len = 0; | |
2732 | truncated_path = FALSE; | |
2733 | /* no pager -> no file -> no pathname, use "<nil>" in that case */ | |
2734 | if (file_object->pager != NULL) { | |
2735 | pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK); | |
2736 | if (pathname) { | |
2737 | pathname[0] = '\0'; | |
2738 | pathname_len = __PATH_MAX; | |
2739 | filename = pathname + pathname_len; | |
2740 | filename_len = __PATH_MAX; | |
2741 | ||
2742 | if (vnode_pager_get_object_name(file_object->pager, | |
2743 | pathname, | |
2744 | pathname_len, | |
2745 | filename, | |
2746 | filename_len, | |
2747 | &truncated_path) == KERN_SUCCESS) { | |
2748 | /* safety first... */ | |
2749 | pathname[__PATH_MAX - 1] = '\0'; | |
2750 | filename[__PATH_MAX - 1] = '\0'; | |
2751 | ||
2752 | vnode_pager_get_object_mtime(file_object->pager, | |
2753 | &mtime, | |
2754 | &cs_mtime); | |
2755 | } else { | |
2756 | kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2); | |
2757 | pathname = NULL; | |
2758 | filename = NULL; | |
2759 | pathname_len = 0; | |
2760 | filename_len = 0; | |
2761 | truncated_path = FALSE; | |
2762 | } | |
2763 | } | |
2764 | } | |
2765 | printf("CODE SIGNING: process %d[%s]: " | |
2766 | "rejecting invalid page at address 0x%llx " | |
2767 | "from offset 0x%llx in file \"%s%s%s\" " | |
2768 | "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " | |
2769 | "(signed:%d validated:%d tainted:%d nx:%d " | |
2770 | "wpmapped:%d dirty:%d depth:%d)\n", | |
2771 | pid, procname, (addr64_t) vaddr, | |
2772 | file_offset, | |
2773 | (pathname ? pathname : "<nil>"), | |
2774 | (truncated_path ? "/.../" : ""), | |
2775 | (truncated_path ? filename : ""), | |
2776 | cs_mtime.tv_sec, cs_mtime.tv_nsec, | |
2777 | ((cs_mtime.tv_sec == mtime.tv_sec && | |
2778 | cs_mtime.tv_nsec == mtime.tv_nsec) | |
2779 | ? "==" | |
2780 | : "!="), | |
2781 | mtime.tv_sec, mtime.tv_nsec, | |
2782 | object->code_signed, | |
2783 | VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset), | |
2784 | VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset), | |
2785 | VMP_CS_NX(m, fault_page_size, fault_phys_offset), | |
2786 | m->vmp_wpmapped, | |
2787 | m->vmp_dirty, | |
2788 | shadow_depth); | |
2789 | ||
2790 | /* | |
2791 | * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page | |
2792 | * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the | |
2793 | * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler | |
2794 | * will deal with the segmentation fault. | |
2795 | */ | |
2796 | if (cs_killed) { | |
2797 | KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, | |
2798 | pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0); | |
2799 | ||
2800 | codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE); | |
2801 | if (codesigning_exit_reason == NULL) { | |
2802 | printf("vm_fault_enter: failed to allocate codesigning exit reason\n"); | |
2803 | } else { | |
2804 | mach_vm_address_t data_addr = 0; | |
2805 | struct codesigning_exit_reason_info *ceri = NULL; | |
2806 | uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri)); | |
2807 | ||
2808 | if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) { | |
2809 | printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n"); | |
2810 | } else { | |
2811 | if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor, | |
2812 | EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) { | |
2813 | ceri = (struct codesigning_exit_reason_info *)data_addr; | |
2814 | static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname)); | |
2815 | ||
2816 | ceri->ceri_virt_addr = vaddr; | |
2817 | ceri->ceri_file_offset = file_offset; | |
2818 | if (pathname) { | |
2819 | strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname)); | |
2820 | } else { | |
2821 | ceri->ceri_pathname[0] = '\0'; | |
2822 | } | |
2823 | if (filename) { | |
2824 | strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename)); | |
2825 | } else { | |
2826 | ceri->ceri_filename[0] = '\0'; | |
2827 | } | |
2828 | ceri->ceri_path_truncated = (truncated_path ? 1 : 0); | |
2829 | ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec; | |
2830 | ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec; | |
2831 | ceri->ceri_page_modtime_secs = mtime.tv_sec; | |
2832 | ceri->ceri_page_modtime_nsecs = mtime.tv_nsec; | |
2833 | ceri->ceri_object_codesigned = (object->code_signed); | |
2834 | ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset); | |
2835 | ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset); | |
2836 | ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset); | |
2837 | ceri->ceri_page_wpmapped = (m->vmp_wpmapped); | |
2838 | ceri->ceri_page_slid = 0; | |
2839 | ceri->ceri_page_dirty = (m->vmp_dirty); | |
2840 | ceri->ceri_page_shadow_depth = shadow_depth; | |
2841 | } else { | |
2842 | #if DEBUG || DEVELOPMENT | |
2843 | panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason"); | |
2844 | #else | |
2845 | printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n"); | |
2846 | #endif /* DEBUG || DEVELOPMENT */ | |
2847 | /* Free the buffer */ | |
2848 | os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0); | |
2849 | } | |
2850 | } | |
2851 | } | |
2852 | ||
2853 | set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE); | |
2854 | } | |
2855 | if (panic_on_cs_killed && | |
2856 | object->object_is_shared_cache) { | |
2857 | char *tainted_contents; | |
2858 | vm_map_offset_t src_vaddr; | |
2859 | src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT); | |
2860 | tainted_contents = kalloc(PAGE_SIZE); | |
2861 | bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE); | |
2862 | printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents); | |
2863 | panic("CODE SIGNING: process %d[%s]: " | |
2864 | "rejecting invalid page (phys#0x%x) at address 0x%llx " | |
2865 | "from offset 0x%llx in file \"%s%s%s\" " | |
2866 | "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " | |
2867 | "(signed:%d validated:%d tainted:%d nx:%d" | |
2868 | "wpmapped:%d dirty:%d depth:%d)\n", | |
2869 | pid, procname, | |
2870 | VM_PAGE_GET_PHYS_PAGE(m), | |
2871 | (addr64_t) vaddr, | |
2872 | file_offset, | |
2873 | (pathname ? pathname : "<nil>"), | |
2874 | (truncated_path ? "/.../" : ""), | |
2875 | (truncated_path ? filename : ""), | |
2876 | cs_mtime.tv_sec, cs_mtime.tv_nsec, | |
2877 | ((cs_mtime.tv_sec == mtime.tv_sec && | |
2878 | cs_mtime.tv_nsec == mtime.tv_nsec) | |
2879 | ? "==" | |
2880 | : "!="), | |
2881 | mtime.tv_sec, mtime.tv_nsec, | |
2882 | object->code_signed, | |
2883 | VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset), | |
2884 | VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset), | |
2885 | VMP_CS_NX(m, fault_page_size, fault_phys_offset), | |
2886 | m->vmp_wpmapped, | |
2887 | m->vmp_dirty, | |
2888 | shadow_depth); | |
2889 | } | |
2890 | ||
2891 | if (file_object != object) { | |
2892 | vm_object_unlock(file_object); | |
2893 | } | |
2894 | if (pathname_len != 0) { | |
2895 | kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2); | |
2896 | pathname = NULL; | |
2897 | filename = NULL; | |
2898 | } | |
2899 | } else { | |
2900 | /* proceed with the invalid page */ | |
2901 | kr = KERN_SUCCESS; | |
2902 | if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) && | |
2903 | !object->code_signed) { | |
2904 | /* | |
2905 | * This page has not been (fully) validated but | |
2906 | * does not belong to a code-signed object | |
2907 | * so it should not be forcefully considered | |
2908 | * as tainted. | |
2909 | * We're just concerned about it here because | |
2910 | * we've been asked to "execute" it but that | |
2911 | * does not mean that it should cause other | |
2912 | * accesses to fail. | |
2913 | * This happens when a debugger sets a | |
2914 | * breakpoint and we then execute code in | |
2915 | * that page. Marking the page as "tainted" | |
2916 | * would cause any inspection tool ("leaks", | |
2917 | * "vmmap", "CrashReporter", ...) to get killed | |
2918 | * due to code-signing violation on that page, | |
2919 | * even though they're just reading it and not | |
2920 | * executing from it. | |
2921 | */ | |
2922 | } else { | |
2923 | /* | |
2924 | * Page might have been tainted before or not; | |
2925 | * now it definitively is. If the page wasn't | |
2926 | * tainted, we must disconnect it from all | |
2927 | * pmaps later, to force existing mappings | |
2928 | * through that code path for re-consideration | |
2929 | * of the validity of that page. | |
2930 | */ | |
2931 | if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) { | |
2932 | *must_disconnect = TRUE; | |
2933 | VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE); | |
2934 | } | |
2935 | } | |
2936 | cs_enter_tainted_accepted++; | |
2937 | } | |
2938 | if (kr != KERN_SUCCESS) { | |
2939 | if (cs_debug) { | |
2940 | printf("CODESIGNING: vm_fault_enter(0x%llx): " | |
2941 | "*** INVALID PAGE ***\n", | |
2942 | (long long)vaddr); | |
2943 | } | |
2944 | #if !SECURE_KERNEL | |
2945 | if (cs_enforcement_panic) { | |
2946 | panic("CODESIGNING: panicking on invalid page\n"); | |
2947 | } | |
2948 | #endif | |
2949 | } | |
2950 | return kr; | |
2951 | } | |
2952 | ||
2953 | /* | |
2954 | * Check that the code signature is valid for the given page being inserted into | |
2955 | * the pmap. | |
2956 | * | |
2957 | * @param must_disconnect This value will be set to true if the caller must disconnect | |
2958 | * this page. | |
2959 | * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault. | |
2960 | */ | |
2961 | static kern_return_t | |
2962 | vm_fault_validate_cs( | |
2963 | bool cs_bypass, | |
2964 | vm_object_t object, | |
2965 | vm_page_t m, | |
2966 | pmap_t pmap, | |
2967 | vm_map_offset_t vaddr, | |
2968 | vm_prot_t prot, | |
2969 | vm_prot_t caller_prot, | |
2970 | vm_map_size_t fault_page_size, | |
2971 | vm_map_offset_t fault_phys_offset, | |
2972 | vm_object_fault_info_t fault_info, | |
2973 | bool *must_disconnect) | |
2974 | { | |
2975 | bool map_is_switched, map_is_switch_protected, cs_violation; | |
2976 | kern_return_t kr; | |
2977 | /* Validate code signature if necessary. */ | |
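| /* | |
| * The map is considered "switched" when the faulting thread is | |
| * currently running on a pmap that belongs to the thread's map | |
| * but not to its task's map. | |
| */ | |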
2978 | map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) && | |
2979 | (pmap == vm_map_pmap(current_thread()->map))); | |
2980 | map_is_switch_protected = current_thread()->map->switch_protect; | |
2981 | kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap, | |
2982 | prot, caller_prot, fault_page_size, fault_phys_offset, fault_info, | |
2983 | map_is_switched, map_is_switch_protected, &cs_violation); | |
2984 | if (kr != KERN_SUCCESS) { | |
2985 | return kr; | |
2986 | } | |
2987 | if (cs_violation) { | |
2988 | kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr, | |
2989 | fault_page_size, fault_phys_offset, | |
2990 | map_is_switched, map_is_switch_protected, must_disconnect); | |
2991 | } | |
2992 | return kr; | |
2993 | } | |
2994 | ||
2995 | /* | |
2996 | * Enqueue the page on the appropriate paging queue. | |
2997 | */ | |
2998 | static void | |
2999 | vm_fault_enqueue_page( | |
3000 | vm_object_t object, | |
3001 | vm_page_t m, | |
3002 | bool wired, | |
3003 | bool change_wiring, | |
3004 | vm_tag_t wire_tag, | |
3005 | bool no_cache, | |
3006 | int *type_of_fault, | |
3007 | kern_return_t kr) | |
3008 | { | |
3009 | assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object); | |
3010 | boolean_t page_queues_locked = FALSE; | |
3011 | boolean_t previously_pmapped = m->vmp_pmapped; | |
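| /* | |
| * Helper macros to take and drop the page-queues spinlock lazily, | |
| * so it is only acquired on the paths below that actually need it. | |
| */ | |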
3012 | #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \ | |
3013 | MACRO_BEGIN \ | |
3014 | if (! page_queues_locked) { \ | |
3015 | page_queues_locked = TRUE; \ | |
3016 | vm_page_lockspin_queues(); \ | |
3017 | } \ | |
3018 | MACRO_END | |
3019 | #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \ | |
3020 | MACRO_BEGIN \ | |
3021 | if (page_queues_locked) { \ | |
3022 | page_queues_locked = FALSE; \ | |
3023 | vm_page_unlock_queues(); \ | |
3024 | } \ | |
3025 | MACRO_END | |
3026 | ||
3027 | #if CONFIG_BACKGROUND_QUEUE | |
3028 | vm_page_update_background_state(m); | |
3029 | #endif | |
3030 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { | |
3031 | /* | |
3032 | * Compressor pages are neither wired | |
3033 | * nor pageable and should never change. | |
3034 | */ | |
3035 | assert(object == compressor_object); | |
3036 | } else if (change_wiring) { | |
3037 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); | |
3038 | ||
3039 | if (wired) { | |
3040 | if (kr == KERN_SUCCESS) { | |
3041 | vm_page_wire(m, wire_tag, TRUE); | |
3042 | } | |
3043 | } else { | |
3044 | vm_page_unwire(m, TRUE); | |
3045 | } | |
3046 | /* we keep the page queues lock, if we need it later */ | |
3047 | } else { | |
3048 | if (object->internal == TRUE) { | |
3049 | /* | |
3050 | * don't allow anonymous pages on | |
3051 | * the speculative queues | |
3052 | */ | |
3053 | no_cache = FALSE; | |
3054 | } | |
3055 | if (kr != KERN_SUCCESS) { | |
3056 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); | |
3057 | vm_page_deactivate(m); | |
3058 | /* we keep the page queues lock, if we need it later */ | |
3059 | } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || | |
3060 | (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) || | |
3061 | (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || | |
3062 | ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) && | |
3063 | !VM_PAGE_WIRED(m)) { | |
3064 | if (vm_page_local_q && | |
3065 | (*type_of_fault == DBG_COW_FAULT || | |
3066 | *type_of_fault == DBG_ZERO_FILL_FAULT)) { | |
3067 | struct vpl *lq; | |
3068 | uint32_t lid; | |
3069 | ||
3070 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); | |
3071 | ||
3072 | __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); | |
3073 | vm_object_lock_assert_exclusive(object); | |
3074 | ||
3075 | /* | |
3076 | * we got a local queue to stuff this | |
3077 | * new page on... | |
3078 | * it's safe to manipulate local and | |
3079 | * local_id at this point since we're | |
3080 | * behind an exclusive object lock and | |
3081 | * the page is not on any global queue. | |
3082 | * | |
3083 | * we'll use the current cpu number to | |
3084 | * select the queue... note that we don't | |
3085 | * need to disable preemption... we're | |
3086 | * going to be behind the local queue's | |
3087 | * lock to do the real work | |
3088 | */ | |
3089 | lid = cpu_number(); | |
3090 | ||
3091 | lq = zpercpu_get_cpu(vm_page_local_q, lid); | |
3092 | ||
3093 | VPL_LOCK(&lq->vpl_lock); | |
3094 | ||
3095 | vm_page_check_pageable_safe(m); | |
3096 | vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq); | |
3097 | m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; | |
3098 | m->vmp_local_id = lid; | |
3099 | lq->vpl_count++; | |
3100 | ||
3101 | if (object->internal) { | |
3102 | lq->vpl_internal_count++; | |
3103 | } else { | |
3104 | lq->vpl_external_count++; | |
3105 | } | |
3106 | ||
3107 | VPL_UNLOCK(&lq->vpl_lock); | |
3108 | ||
3109 | if (lq->vpl_count > vm_page_local_q_soft_limit) { | |
3110 | /* | |
3111 | * we're beyond the soft limit | |
3112 | * for the local queue | |
3113 | * vm_page_reactivate_local will | |
3114 | * 'try' to take the global page | |
3115 | * queue lock... if it can't | |
3116 | * that's ok... we'll let the | |
3117 | * queue continue to grow up | |
3118 | * to the hard limit... at that | |
3119 | * point we'll wait for the | |
3120 | * lock... once we've got the | |
3121 | * lock, we'll transfer all of | |
3122 | * the pages from the local | |
3123 | * queue to the global active | |
3124 | * queue | |
3125 | */ | |
3126 | vm_page_reactivate_local(lid, FALSE, FALSE); | |
3127 | } | |
3128 | } else { | |
3129 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); | |
3130 | ||
3131 | /* | |
3132 | * test again now that we hold the | |
3133 | * page queue lock | |
3134 | */ | |
3135 | if (!VM_PAGE_WIRED(m)) { | |
3136 | if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { | |
3137 | vm_page_queues_remove(m, FALSE); | |
3138 | ||
3139 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); | |
3140 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1); | |
3141 | } | |
3142 | ||
3143 | if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) || | |
3144 | no_cache) { | |
3145 | /* | |
3146 | * If this is a no_cache mapping | |
3147 | * and the page has never been | |
3148 | * mapped before or was | |
3149 | * previously a no_cache page, | |
3150 | * then we want to leave pages | |
3151 | * in the speculative state so | |
3152 | * that they can be readily | |
3153 | * recycled if free memory runs | |
3154 | * low. Otherwise the page is | |
3155 | * activated as normal. | |
3156 | */ | |
3157 | ||
3158 | if (no_cache && | |
3159 | (!previously_pmapped || | |
3160 | m->vmp_no_cache)) { | |
3161 | m->vmp_no_cache = TRUE; | |
3162 | ||
3163 | if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) { | |
3164 | vm_page_speculate(m, FALSE); | |
3165 | } | |
3166 | } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) { | |
3167 | vm_page_activate(m); | |
3168 | } | |
3169 | } | |
3170 | } | |
3171 | /* we keep the page queues lock, if we need it later */ | |
3172 | } | |
3173 | } | |
3174 | } | |
3175 | /* we're done with the page queues lock, if we ever took it */ | |
3176 | __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); | |
3177 | } | |
3178 | ||
3179 | /* | |
3180 | * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting. | |
3181 | * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys | |
3182 | * before being inserted into the pmap. | |
3183 | */ | |
3184 | static bool | |
3185 | vm_fault_enter_set_mapped( | |
3186 | vm_object_t object, | |
3187 | vm_page_t m, | |
3188 | vm_prot_t prot, | |
3189 | vm_prot_t fault_type) | |
3190 | { | |
3191 | bool page_needs_sync = false; | |
3192 | /* | |
3193 | * NOTE: we may only hold the vm_object lock SHARED | |
3194 | * at this point, so we need the phys_page lock to | |
3195 | * properly serialize updating the pmapped and | |
3196 | * xpmapped bits | |
3197 | */ | |
3198 | if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) { | |
3199 | ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); | |
3200 | ||
3201 | pmap_lock_phys_page(phys_page); | |
3202 | m->vmp_pmapped = TRUE; | |
3203 | ||
3204 | if (!m->vmp_xpmapped) { | |
3205 | m->vmp_xpmapped = TRUE; | |
3206 | ||
3207 | pmap_unlock_phys_page(phys_page); | |
3208 | ||
3209 | if (!object->internal) { | |
3210 | OSAddAtomic(1, &vm_page_xpmapped_external_count); | |
3211 | } | |
3212 | ||
3213 | #if defined(__arm__) || defined(__arm64__) | |
3214 | page_needs_sync = true; | |
3215 | #else | |
3216 | if (object->internal && | |
3217 | object->pager != NULL) { | |
3218 | /* | |
3219 | * This page could have been | |
3220 | * uncompressed by the | |
3221 | * compressor pager and its | |
3222 | * contents might be only in | |
3223 | * the data cache. | |
3224 | * Since it's being mapped for | |
3225 | * "execute" for the first time, | |
3226 | * make sure the icache is in | |
3227 | * sync. | |
3228 | */ | |
3229 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); | |
3230 | page_needs_sync = true; | |
3231 | } | |
3232 | #endif | |
3233 | } else { | |
3234 | pmap_unlock_phys_page(phys_page); | |
3235 | } | |
3236 | } else { | |
3237 | if (m->vmp_pmapped == FALSE) { | |
3238 | ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); | |
3239 | ||
3240 | pmap_lock_phys_page(phys_page); | |
3241 | m->vmp_pmapped = TRUE; | |
3242 | pmap_unlock_phys_page(phys_page); | |
3243 | } | |
3244 | } | |
3245 | ||
3246 | if (fault_type & VM_PROT_WRITE) { | |
3247 | if (m->vmp_wpmapped == FALSE) { | |
3248 | vm_object_lock_assert_exclusive(object); | |
3249 | if (!object->internal && object->pager) { | |
3250 | task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager)); | |
3251 | } | |
3252 | m->vmp_wpmapped = TRUE; | |
3253 | } | |
3254 | } | |
3255 | return page_needs_sync; | |
3256 | } | |
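| /* | |
|  * Minimal sketch of the intended caller pattern (illustrative; it | |
|  * mirrors what vm_fault_enter() below does with the return value): | |
|  * when this returns true, the physical page is flushed before it is | |
|  * entered into the pmap: | |
|  * | |
|  *	if (vm_fault_enter_set_mapped(object, m, prot, fault_type)) { | |
|  *		pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
|  *	} | |
|  */ | |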
3257 | ||
3258 | /* | |
3259 | * Try to enter the given page into the pmap. | |
3260 | * Will retry without execute permission iff PMAP_CS is enabled and we encounter | |
3261 | * a codesigning failure on a non-execute fault. | |
3262 | */ | |
3263 | static kern_return_t | |
3264 | vm_fault_attempt_pmap_enter( | |
3265 | pmap_t pmap, | |
3266 | vm_map_offset_t vaddr, | |
3267 | vm_map_size_t fault_page_size, | |
3268 | vm_map_offset_t fault_phys_offset, | |
3269 | vm_page_t m, | |
3270 | vm_prot_t *prot, | |
3271 | vm_prot_t caller_prot, | |
3272 | vm_prot_t fault_type, | |
3273 | bool wired, | |
3274 | int pmap_options) | |
3275 | { | |
3276 | #if !PMAP_CS | |
3277 | #pragma unused(caller_prot) | |
3278 | #endif /* !PMAP_CS */ | |
3279 | kern_return_t kr; | |
3280 | if (fault_page_size != PAGE_SIZE) { | |
3281 | DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type); | |
3282 | assertf((!(fault_phys_offset & FOURK_PAGE_MASK) && | |
3283 | fault_phys_offset < PAGE_SIZE), | |
3284 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
3285 | } else { | |
3286 | assertf(fault_phys_offset == 0, | |
3287 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
3288 | } | |
3289 | ||
3290 | PMAP_ENTER_OPTIONS(pmap, vaddr, | |
3291 | fault_phys_offset, | |
3292 | m, *prot, fault_type, 0, | |
3293 | wired, | |
3294 | pmap_options, | |
3295 | kr); | |
3296 | return kr; | |
3297 | } | |
3298 | ||
3299 | /* | |
3300 | * Enter the given page into the pmap. | |
3301 | * The map must be locked shared. | |
3302 | * The vm object must NOT be locked. | |
3303 | * | |
3304 | * @param need_retry if not null, avoid making a (potentially) blocking call into | |
3305 | * the pmap layer. When such a call would be necessary, return true in this boolean instead. | |
3306 | */ | |
3307 | static kern_return_t | |
3308 | vm_fault_pmap_enter( | |
3309 | pmap_t pmap, | |
3310 | vm_map_offset_t vaddr, | |
3311 | vm_map_size_t fault_page_size, | |
3312 | vm_map_offset_t fault_phys_offset, | |
3313 | vm_page_t m, | |
3314 | vm_prot_t *prot, | |
3315 | vm_prot_t caller_prot, | |
3316 | vm_prot_t fault_type, | |
3317 | bool wired, | |
3318 | int pmap_options, | |
3319 | boolean_t *need_retry) | |
3320 | { | |
3321 | kern_return_t kr; | |
3322 | if (need_retry != NULL) { | |
3323 | /* | |
3324 | * Although we don't hold a lock on this object, we hold a lock | |
3325 | * on the top object in the chain. To prevent a deadlock, we | |
3326 | * can't allow the pmap layer to block. | |
3327 | */ | |
3328 | pmap_options |= PMAP_OPTIONS_NOWAIT; | |
3329 | } | |
3330 | kr = vm_fault_attempt_pmap_enter(pmap, vaddr, | |
3331 | fault_page_size, fault_phys_offset, | |
3332 | m, prot, caller_prot, fault_type, wired, pmap_options); | |
3333 | if (kr == KERN_RESOURCE_SHORTAGE) { | |
3334 | if (need_retry) { | |
3335 | /* | |
3336 | * There's nothing we can do here since we hold the | |
3337 | * lock on the top object in the chain. The caller | |
3338 | * will need to deal with this by dropping that lock and retrying. | |
3339 | */ | |
3340 | *need_retry = TRUE; | |
3341 | vm_pmap_enter_retried++; | |
3342 | } | |
3343 | } | |
3344 | return kr; | |
3345 | } | |
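| /* | |
|  * Sketch of the expected "need_retry" handling (illustrative only; it | |
|  * mirrors how vm_fault_internal() below reacts to the same condition): | |
|  * when the pmap layer reports KERN_RESOURCE_SHORTAGE and need_retry | |
|  * comes back TRUE, the caller drops its locks, asks the pmap layer to | |
|  * pre-expand the page table, and re-drives the fault: | |
|  * | |
|  *	boolean_t need_retry = FALSE; | |
|  * | |
|  *	kr = vm_fault_pmap_enter(pmap, vaddr, fault_page_size, | |
|  *	    fault_phys_offset, m, &prot, caller_prot, fault_type, | |
|  *	    wired, pmap_options, &need_retry); | |
|  *	if (need_retry) { | |
|  *		(drop the object and map locks here) | |
|  *		(void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, | |
|  *		    PMAP_OPTIONS_NOENTER, NULL); | |
|  *		(retry the fault from the top) | |
|  *	} | |
|  */ | |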
3346 | ||
3347 | /* | |
3348 | * Enter the given page into the pmap. | |
3349 | * The vm map must be locked shared. | |
3350 | * The vm object must be locked exclusive, unless this is a soft fault. | |
3351 | * For a soft fault, the object must be locked shared or exclusive. | |
3352 | * | |
3353 | * @param need_retry if not null, avoid making a (potentially) blocking call into | |
3354 | * the pmap layer. When such a call would be necessary, return true in this boolean instead. | |
3355 | */ | |
3356 | static kern_return_t | |
3357 | vm_fault_pmap_enter_with_object_lock( | |
3358 | vm_object_t object, | |
3359 | pmap_t pmap, | |
3360 | vm_map_offset_t vaddr, | |
3361 | vm_map_size_t fault_page_size, | |
3362 | vm_map_offset_t fault_phys_offset, | |
3363 | vm_page_t m, | |
3364 | vm_prot_t *prot, | |
3365 | vm_prot_t caller_prot, | |
3366 | vm_prot_t fault_type, | |
3367 | bool wired, | |
3368 | int pmap_options, | |
3369 | boolean_t *need_retry) | |
3370 | { | |
3371 | kern_return_t kr; | |
3372 | /* | |
3373 | * Prevent a deadlock by not | |
3374 | * holding the object lock if we need to wait for a page in | |
3375 | * pmap_enter() - <rdar://problem/7138958> | |
3376 | */ | |
3377 | kr = vm_fault_attempt_pmap_enter(pmap, vaddr, | |
3378 | fault_page_size, fault_phys_offset, | |
3379 | m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT); | |
3380 | #if __x86_64__ | |
3381 | if (kr == KERN_INVALID_ARGUMENT && | |
3382 | pmap == PMAP_NULL && | |
3383 | wired) { | |
3384 | /* | |
3385 | * Wiring a page in a pmap-less VM map: | |
3386 | * VMware's "vmmon" kernel extension does this | |
3387 | * to grab pages. | |
3388 | * Let it proceed even though the PMAP_ENTER() failed. | |
3389 | */ | |
3390 | kr = KERN_SUCCESS; | |
3391 | } | |
3392 | #endif /* __x86_64__ */ | |
3393 | ||
3394 | if (kr == KERN_RESOURCE_SHORTAGE) { | |
3395 | if (need_retry) { | |
3396 | /* | |
3397 | * this will be non-null in the case where we hold the lock | |
3398 | * on the top-object in this chain... we can't just drop | |
3399 | * the lock on the object we're inserting the page into | |
3400 | * and recall the PMAP_ENTER since we can still cause | |
3401 | * a deadlock if one of the critical paths tries to | |
3402 | * acquire the lock on the top-object and we're blocked | |
3403 | * in PMAP_ENTER waiting for memory... our only recourse | |
3404 | * is to deal with it at a higher level where we can | |
3405 | * drop both locks. | |
3406 | */ | |
3407 | *need_retry = TRUE; | |
3408 | vm_pmap_enter_retried++; | |
3409 | goto done; | |
3410 | } | |
3411 | /* | |
3412 | * The nonblocking version of pmap_enter did not succeed, | |
3413 | * and we don't need to drop other locks and retry | |
3414 | * at the level above us, so | |
3415 | * use the blocking version instead. This requires marking | |
3416 | * the page busy and unlocking the object. | |
3417 | */ | |
3418 | boolean_t was_busy = m->vmp_busy; | |
3419 | ||
3420 | vm_object_lock_assert_exclusive(object); | |
3421 | ||
3422 | m->vmp_busy = TRUE; | |
3423 | vm_object_unlock(object); | |
3424 | ||
3425 | PMAP_ENTER_OPTIONS(pmap, vaddr, | |
3426 | fault_phys_offset, | |
3427 | m, *prot, fault_type, | |
3428 | 0, wired, | |
3429 | pmap_options, kr); | |
3430 | ||
3431 | assert(VM_PAGE_OBJECT(m) == object); | |
3432 | ||
3433 | /* Take the object lock again. */ | |
3434 | vm_object_lock(object); | |
3435 | ||
3436 | /* If the page was busy, someone else will wake it up. | |
3437 | * Otherwise, we have to do it now. */ | |
3438 | assert(m->vmp_busy); | |
3439 | if (!was_busy) { | |
3440 | PAGE_WAKEUP_DONE(m); | |
3441 | } | |
3442 | vm_pmap_enter_blocked++; | |
3443 | } | |
3444 | ||
3445 | done: | |
3446 | return kr; | |
3447 | } | |
3448 | ||
3449 | /* | |
3450 | * Prepare to enter a page into the pmap by checking CS, protection bits, | |
3451 | * and setting mapped bits on the vm_page_t. | |
3452 | * Does not modify the page's paging queue. | |
3453 | * | |
3454 | * page queue lock must NOT be held | |
3455 | * m->vmp_object must be locked | |
3456 | * | |
3457 | * NOTE: m->vmp_object could be locked "shared" only if we are called | |
3458 | * from vm_fault() as part of a soft fault. | |
3459 | */ | |
3460 | static kern_return_t | |
3461 | vm_fault_enter_prepare( | |
3462 | vm_page_t m, | |
3463 | pmap_t pmap, | |
3464 | vm_map_offset_t vaddr, | |
3465 | vm_prot_t *prot, | |
3466 | vm_prot_t caller_prot, | |
3467 | vm_map_size_t fault_page_size, | |
3468 | vm_map_offset_t fault_phys_offset, | |
3469 | boolean_t change_wiring, | |
3470 | vm_prot_t fault_type, | |
3471 | vm_object_fault_info_t fault_info, | |
3472 | int *type_of_fault, | |
3473 | bool *page_needs_data_sync) | |
3474 | { | |
3475 | kern_return_t kr; | |
3476 | bool is_tainted = false; | |
3477 | vm_object_t object; | |
3478 | boolean_t cs_bypass = fault_info->cs_bypass; | |
3479 | ||
3480 | object = VM_PAGE_OBJECT(m); | |
3481 | ||
3482 | vm_object_lock_assert_held(object); | |
3483 | ||
3484 | #if KASAN | |
3485 | if (pmap == kernel_pmap) { | |
3486 | kasan_notify_address(vaddr, PAGE_SIZE); | |
3487 | } | |
3488 | #endif | |
3489 | ||
3490 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); | |
3491 | ||
3492 | if (*type_of_fault == DBG_ZERO_FILL_FAULT) { | |
3493 | vm_object_lock_assert_exclusive(object); | |
3494 | } else if ((fault_type & VM_PROT_WRITE) == 0 && | |
3495 | !change_wiring && | |
3496 | (!m->vmp_wpmapped | |
3497 | #if VM_OBJECT_ACCESS_TRACKING | |
3498 | || object->access_tracking | |
3499 | #endif /* VM_OBJECT_ACCESS_TRACKING */ | |
3500 | )) { | |
3501 | /* | |
3502 | * This is not a "write" fault, so we | |
3503 | * might not have taken the object lock | |
3504 | * exclusively and we might not be able | |
3505 | * to update the "wpmapped" bit in | |
3506 | * vm_fault_enter(). | |
3507 | * Let's just grant read access to | |
3508 | * the page for now and we'll | |
3509 | * soft-fault again if we need write | |
3510 | * access later... | |
3511 | */ | |
3512 | ||
3513 | /* This had better not be a JIT page. */ | |
3514 | if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) { | |
3515 | *prot &= ~VM_PROT_WRITE; | |
3516 | } else { | |
3517 | assert(cs_bypass); | |
3518 | } | |
3519 | } | |
3520 | if (m->vmp_pmapped == FALSE) { | |
3521 | if (m->vmp_clustered) { | |
3522 | if (*type_of_fault == DBG_CACHE_HIT_FAULT) { | |
3523 | /* | |
3524 | * found it in the cache, but this | |
3525 | * is the first fault-in of the page (m->vmp_pmapped == FALSE) | |
3526 | * so it must have come in as part of | |
3527 | * a cluster... account 1 pagein against it | |
3528 | */ | |
3529 | if (object->internal) { | |
3530 | *type_of_fault = DBG_PAGEIND_FAULT; | |
3531 | } else { | |
3532 | *type_of_fault = DBG_PAGEINV_FAULT; | |
3533 | } | |
3534 | ||
3535 | VM_PAGE_COUNT_AS_PAGEIN(m); | |
3536 | } | |
3537 | VM_PAGE_CONSUME_CLUSTERED(m); | |
3538 | } | |
3539 | } | |
3540 | ||
3541 | if (*type_of_fault != DBG_COW_FAULT) { | |
3542 | DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL); | |
3543 | ||
3544 | if (pmap == kernel_pmap) { | |
3545 | DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL); | |
3546 | } | |
3547 | } | |
3548 | ||
3549 | kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr, | |
3550 | *prot, caller_prot, fault_page_size, fault_phys_offset, | |
3551 | fault_info, &is_tainted); | |
3552 | if (kr == KERN_SUCCESS) { | |
3553 | /* | |
3554 | * We either have a good page, or a tainted page that has been accepted by the process. | |
3555 | * In both cases the page will be entered into the pmap. | |
3556 | */ | |
3557 | *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type); | |
3558 | if ((fault_type & VM_PROT_WRITE) && is_tainted) { | |
3559 | /* | |
3560 | * This page is tainted but we're inserting it anyway. | |
3561 | * Since it's writeable, we need to disconnect it from other pmaps | |
3562 | * now so those processes can take note. | |
3563 | */ | |
3564 | ||
3565 | /* | |
3566 | * We can only get here | |
3567 | * because of the CSE logic | |
3568 | */ | |
3569 | assert(pmap_get_vm_map_cs_enforced(pmap)); | |
3570 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); | |
3571 | /* | |
3572 | * If we are faulting for a write, we can clear | |
3573 | * the execute bit - that will ensure the page is | |
3574 | * checked again before being executable, which | |
3575 | * protects against a map switch. | |
3576 | * This only happens the first time the page | |
3577 | * gets tainted, so we won't get stuck here | |
3578 | * to make an already writeable page executable. | |
3579 | */ | |
3580 | if (!cs_bypass) { | |
3581 | assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)); | |
3582 | *prot &= ~VM_PROT_EXECUTE; | |
3583 | } | |
3584 | } | |
3585 | assert(VM_PAGE_OBJECT(m) == object); | |
3586 | ||
3587 | #if VM_OBJECT_ACCESS_TRACKING | |
3588 | if (object->access_tracking) { | |
3589 | DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type); | |
3590 | if (fault_type & VM_PROT_WRITE) { | |
3591 | object->access_tracking_writes++; | |
3592 | vm_object_access_tracking_writes++; | |
3593 | } else { | |
3594 | object->access_tracking_reads++; | |
3595 | vm_object_access_tracking_reads++; | |
3596 | } | |
3597 | } | |
3598 | #endif /* VM_OBJECT_ACCESS_TRACKING */ | |
3599 | } | |
3600 | ||
3601 | return kr; | |
3602 | } | |
3603 | ||
3604 | /* | |
3605 | * page queue lock must NOT be held | |
3606 | * m->vmp_object must be locked | |
3607 | * | |
3608 | * NOTE: m->vmp_object could be locked "shared" only if we are called | |
3609 | * from vm_fault() as part of a soft fault. If so, we must be | |
3610 | * careful not to modify the VM object in any way that is not | |
3611 | * legal under a shared lock... | |
3612 | */ | |
3613 | kern_return_t | |
3614 | vm_fault_enter( | |
3615 | vm_page_t m, | |
3616 | pmap_t pmap, | |
3617 | vm_map_offset_t vaddr, | |
3618 | vm_map_size_t fault_page_size, | |
3619 | vm_map_offset_t fault_phys_offset, | |
3620 | vm_prot_t prot, | |
3621 | vm_prot_t caller_prot, | |
3622 | boolean_t wired, | |
3623 | boolean_t change_wiring, | |
3624 | vm_tag_t wire_tag, | |
3625 | vm_object_fault_info_t fault_info, | |
3626 | boolean_t *need_retry, | |
3627 | int *type_of_fault) | |
3628 | { | |
3629 | kern_return_t kr; | |
3630 | vm_object_t object; | |
3631 | bool page_needs_data_sync; | |
3632 | vm_prot_t fault_type; | |
3633 | int pmap_options = fault_info->pmap_options; | |
3634 | ||
3635 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
3636 | assert(m->vmp_fictitious); | |
3637 | return KERN_SUCCESS; | |
3638 | } | |
3639 | ||
3640 | fault_type = change_wiring ? VM_PROT_NONE : caller_prot; | |
3641 | ||
3642 | kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot, | |
3643 | fault_page_size, fault_phys_offset, change_wiring, fault_type, | |
3644 | fault_info, type_of_fault, &page_needs_data_sync); | |
3645 | object = VM_PAGE_OBJECT(m); | |
3646 | ||
3647 | vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr); | |
3648 | ||
3649 | if (kr == KERN_SUCCESS) { | |
3650 | if (page_needs_data_sync) { | |
3651 | pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
3652 | } | |
3653 | ||
3654 | kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr, | |
3655 | fault_page_size, fault_phys_offset, m, | |
3656 | &prot, caller_prot, fault_type, wired, pmap_options, need_retry); | |
3657 | } | |
3658 | ||
3659 | return kr; | |
3660 | } | |
3661 | ||
3662 | void | |
3663 | vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot) | |
3664 | { | |
3665 | if (pmap_find_phys(current_map()->pmap, vaddr) == 0) { | |
3666 | vm_fault(current_map(), /* map */ | |
3667 | vaddr, /* vaddr */ | |
3668 | prot, /* fault_type */ | |
3669 | FALSE, /* change_wiring */ | |
3670 | VM_KERN_MEMORY_NONE, /* tag - not wiring */ | |
3671 | THREAD_UNINT, /* interruptible */ | |
3672 | NULL, /* caller_pmap */ | |
3673 | 0 /* caller_pmap_addr */); | |
3674 | } | |
3675 | } | |
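| /* | |
|  * Usage sketch (hypothetical, not from the original source): a caller | |
|  * that wants to warm up a buffer before touching it could pre-fault it | |
|  * one page at a time; "start" and "size" are assumed caller-supplied | |
|  * values: | |
|  * | |
|  *	vm_map_offset_t addr; | |
|  * | |
|  *	for (addr = vm_map_trunc_page(start, PAGE_MASK); | |
|  *	    addr < start + size; | |
|  *	    addr += PAGE_SIZE) { | |
|  *		vm_pre_fault(addr, VM_PROT_READ); | |
|  *	} | |
|  * | |
|  * The pmap_find_phys() check above makes each call a no-op when the | |
|  * page is already resident in the current pmap. | |
|  */ | |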
3676 | ||
3677 | ||
3678 | /* | |
3679 | * Routine: vm_fault | |
3680 | * Purpose: | |
3681 | * Handle page faults, including pseudo-faults | |
3682 | * used to change the wiring status of pages. | |
3683 | * Returns: | |
3684 | * Explicit continuations have been removed. | |
3685 | * Implementation: | |
3686 | * vm_fault and vm_fault_page save mucho state | |
3687 | * in the moral equivalent of a closure. The state | |
3688 | * structure is allocated when first entering vm_fault | |
3689 | * and deallocated when leaving vm_fault. | |
3690 | */ | |
3691 | ||
3692 | extern uint64_t get_current_unique_pid(void); | |
3693 | ||
3694 | unsigned long vm_fault_collapse_total = 0; | |
3695 | unsigned long vm_fault_collapse_skipped = 0; | |
3696 | ||
3697 | ||
3698 | kern_return_t | |
3699 | vm_fault_external( | |
3700 | vm_map_t map, | |
3701 | vm_map_offset_t vaddr, | |
3702 | vm_prot_t fault_type, | |
3703 | boolean_t change_wiring, | |
3704 | int interruptible, | |
3705 | pmap_t caller_pmap, | |
3706 | vm_map_offset_t caller_pmap_addr) | |
3707 | { | |
3708 | return vm_fault_internal(map, vaddr, fault_type, change_wiring, | |
3709 | change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE, | |
3710 | interruptible, caller_pmap, caller_pmap_addr, | |
3711 | NULL); | |
3712 | } | |
3713 | ||
3714 | kern_return_t | |
3715 | vm_fault( | |
3716 | vm_map_t map, | |
3717 | vm_map_offset_t vaddr, | |
3718 | vm_prot_t fault_type, | |
3719 | boolean_t change_wiring, | |
3720 | vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ | |
3721 | int interruptible, | |
3722 | pmap_t caller_pmap, | |
3723 | vm_map_offset_t caller_pmap_addr) | |
3724 | { | |
3725 | return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag, | |
3726 | interruptible, caller_pmap, caller_pmap_addr, | |
3727 | NULL); | |
3728 | } | |
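| /* | |
|  * Wiring-style usage sketch (an illustrative assumption, not from the | |
|  * original source): per the comment on "wire_tag" above, a caller that | |
|  * passes change_wiring == TRUE must supply a tag other than | |
|  * VM_KERN_MEMORY_NONE.  With a hypothetical tag choice of | |
|  * VM_KERN_MEMORY_DIAG, that would look like: | |
|  * | |
|  *	kr = vm_fault(map, vaddr, VM_PROT_READ | VM_PROT_WRITE, | |
|  *	    TRUE, VM_KERN_MEMORY_DIAG, THREAD_UNINT, NULL, 0); | |
|  * | |
|  * vm_fault_external() above instead derives the tag with vm_tag_bt() | |
|  * when wiring. | |
|  */ | |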
3729 | ||
3730 | static boolean_t | |
3731 | current_proc_is_privileged(void) | |
3732 | { | |
3733 | return csproc_get_platform_binary(current_proc()); | |
3734 | } | |
3735 | ||
3736 | uint64_t vm_copied_on_read = 0; | |
3737 | ||
3738 | /* | |
3739 | * Cleanup after a vm_fault_enter. | |
3740 | * At this point, the fault should either have failed (kr != KERN_SUCCESS) | |
3741 | * or the page should be in the pmap and on the correct paging queue. | |
3742 | * | |
3743 | * Precondition: | |
3744 | * map must be locked shared. | |
3745 | * m_object must be locked. | |
3746 | * If top_object != VM_OBJECT_NULL, it must be locked. | |
3747 | * real_map must be locked. | |
3748 | * | |
3749 | * Postcondition: | |
3750 | * map will be unlocked | |
3751 | * m_object will be unlocked | |
3752 | * top_object will be unlocked | |
3753 | * If real_map != map, it will be unlocked | |
3754 | */ | |
3755 | static void | |
3756 | vm_fault_complete( | |
3757 | vm_map_t map, | |
3758 | vm_map_t real_map, | |
3759 | vm_object_t object, | |
3760 | vm_object_t m_object, | |
3761 | vm_page_t m, | |
3762 | vm_map_offset_t offset, | |
3763 | vm_map_offset_t trace_real_vaddr, | |
3764 | vm_object_fault_info_t fault_info, | |
3765 | vm_prot_t caller_prot, | |
3766 | #if CONFIG_DTRACE | |
3767 | vm_map_offset_t real_vaddr, | |
3768 | #else | |
3769 | __unused vm_map_offset_t real_vaddr, | |
3770 | #endif /* CONFIG_DTRACE */ | |
3771 | int type_of_fault, | |
3772 | boolean_t need_retry, | |
3773 | kern_return_t kr, | |
3774 | ppnum_t *physpage_p, | |
3775 | vm_prot_t prot, | |
3776 | vm_object_t top_object, | |
3777 | boolean_t need_collapse, | |
3778 | vm_map_offset_t cur_offset, | |
3779 | vm_prot_t fault_type, | |
3780 | vm_object_t *written_on_object, | |
3781 | memory_object_t *written_on_pager, | |
3782 | vm_object_offset_t *written_on_offset) | |
3783 | { | |
3784 | int event_code = 0; | |
3785 | vm_map_lock_assert_shared(map); | |
3786 | vm_object_lock_assert_held(m_object); | |
3787 | if (top_object != VM_OBJECT_NULL) { | |
3788 | vm_object_lock_assert_held(top_object); | |
3789 | } | |
3790 | vm_map_lock_assert_held(real_map); | |
3791 | ||
3792 | if (m_object->internal) { | |
3793 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); | |
3794 | } else if (m_object->object_is_shared_cache) { | |
3795 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); | |
3796 | } else { | |
3797 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); | |
3798 | } | |
3799 | ||
3800 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0); | |
3801 | if (need_retry == FALSE) { | |
3802 | KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0); | |
3803 | } | |
3804 | DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag); | |
3805 | if (kr == KERN_SUCCESS && | |
3806 | physpage_p != NULL) { | |
3807 | /* for vm_map_wire_and_extract() */ | |
3808 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); | |
3809 | if (prot & VM_PROT_WRITE) { | |
3810 | vm_object_lock_assert_exclusive(m_object); | |
3811 | m->vmp_dirty = TRUE; | |
3812 | } | |
3813 | } | |
3814 | ||
3815 | if (top_object != VM_OBJECT_NULL) { | |
3816 | /* | |
3817 | * It's safe to drop the top object | |
3818 | * now that we've done our | |
3819 | * vm_fault_enter(). Any other fault | |
3820 | * in progress for that virtual | |
3821 | * address will either find our page | |
3822 | * and translation or put in a new page | |
3823 | * and translation. | |
3824 | */ | |
3825 | vm_object_unlock(top_object); | |
3826 | top_object = VM_OBJECT_NULL; | |
3827 | } | |
3828 | ||
3829 | if (need_collapse == TRUE) { | |
3830 | vm_object_collapse(object, vm_object_trunc_page(offset), TRUE); | |
3831 | } | |
3832 | ||
3833 | if (need_retry == FALSE && | |
3834 | (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) { | |
3835 | /* | |
3836 | * evaluate access pattern and update state; | |
3837 | * vm_fault_deactivate_behind depends on the | |
3838 | * state being up to date | |
3839 | */ | |
3840 | vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior); | |
3841 | ||
3842 | vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior); | |
3843 | } | |
3844 | /* | |
3845 | * That's it, clean up and return. | |
3846 | */ | |
3847 | if (m->vmp_busy) { | |
3848 | vm_object_lock_assert_exclusive(m_object); | |
3849 | PAGE_WAKEUP_DONE(m); | |
3850 | } | |
3851 | ||
3852 | if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) { | |
3853 | vm_object_paging_begin(m_object); | |
3854 | ||
3855 | assert(*written_on_object == VM_OBJECT_NULL); | |
3856 | *written_on_object = m_object; | |
3857 | *written_on_pager = m_object->pager; | |
3858 | *written_on_offset = m_object->paging_offset + m->vmp_offset; | |
3859 | } | |
3860 | vm_object_unlock(object); | |
3861 | ||
3862 | vm_map_unlock_read(map); | |
3863 | if (real_map != map) { | |
3864 | vm_map_unlock(real_map); | |
3865 | } | |
3866 | } | |
3867 | ||
3868 | static inline int | |
3869 | vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault) | |
3870 | { | |
3871 | if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) { | |
3872 | return DBG_COR_FAULT; | |
3873 | } | |
3874 | return type_of_fault; | |
3875 | } | |
3876 | ||
3877 | kern_return_t | |
3878 | vm_fault_internal( | |
3879 | vm_map_t map, | |
3880 | vm_map_offset_t vaddr, | |
3881 | vm_prot_t caller_prot, | |
3882 | boolean_t change_wiring, | |
3883 | vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ | |
3884 | int interruptible, | |
3885 | pmap_t caller_pmap, | |
3886 | vm_map_offset_t caller_pmap_addr, | |
3887 | ppnum_t *physpage_p) | |
3888 | { | |
3889 | vm_map_version_t version; /* Map version for verification */ | |
3890 | boolean_t wired; /* Should mapping be wired down? */ | |
3891 | vm_object_t object; /* Top-level object */ | |
3892 | vm_object_offset_t offset; /* Top-level offset */ | |
3893 | vm_prot_t prot; /* Protection for mapping */ | |
3894 | vm_object_t old_copy_object; /* Saved copy object */ | |
3895 | vm_page_t result_page; /* Result of vm_fault_page */ | |
3896 | vm_page_t top_page; /* Placeholder page */ | |
3897 | kern_return_t kr; | |
3898 | ||
3899 | vm_page_t m; /* Fast access to result_page */ | |
3900 | kern_return_t error_code; | |
3901 | vm_object_t cur_object; | |
3902 | vm_object_t m_object = NULL; | |
3903 | vm_object_offset_t cur_offset; | |
3904 | vm_page_t cur_m; | |
3905 | vm_object_t new_object; | |
3906 | int type_of_fault; | |
3907 | pmap_t pmap; | |
3908 | wait_interrupt_t interruptible_state; | |
3909 | vm_map_t real_map = map; | |
3910 | vm_map_t original_map = map; | |
3911 | bool object_locks_dropped = FALSE; | |
3912 | vm_prot_t fault_type; | |
3913 | vm_prot_t original_fault_type; | |
3914 | struct vm_object_fault_info fault_info = {}; | |
3915 | bool need_collapse = FALSE; | |
3916 | boolean_t need_retry = FALSE; | |
3917 | boolean_t *need_retry_ptr = NULL; | |
3918 | uint8_t object_lock_type = 0; | |
3919 | uint8_t cur_object_lock_type; | |
3920 | vm_object_t top_object = VM_OBJECT_NULL; | |
3921 | vm_object_t written_on_object = VM_OBJECT_NULL; | |
3922 | memory_object_t written_on_pager = NULL; | |
3923 | vm_object_offset_t written_on_offset = 0; | |
3924 | int throttle_delay; | |
3925 | int compressed_count_delta; | |
3926 | uint8_t grab_options; | |
3927 | bool need_copy; | |
3928 | bool need_copy_on_read; | |
3929 | vm_map_offset_t trace_vaddr; | |
3930 | vm_map_offset_t trace_real_vaddr; | |
3931 | vm_map_size_t fault_page_size; | |
3932 | vm_map_size_t fault_page_mask; | |
3933 | vm_map_offset_t fault_phys_offset; | |
3934 | vm_map_offset_t real_vaddr; | |
3935 | bool resilient_media_retry = FALSE; | |
3936 | vm_object_t resilient_media_object = VM_OBJECT_NULL; | |
3937 | vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1; | |
3938 | bool page_needs_data_sync = false; | |
3939 | /* | |
3940 | * Was the VM object contended when vm_map_lookup_locked locked it? | |
3941 | * If so, the zero fill path will drop the lock | |
3942 | * NB: Ideally we would always drop the lock rather than rely on | |
3943 | * this heuristic, but vm_object_unlock currently takes > 30 cycles. | |
3944 | */ | |
3945 | bool object_is_contended = false; | |
3946 | ||
3947 | real_vaddr = vaddr; | |
3948 | trace_real_vaddr = vaddr; | |
3949 | ||
3950 | if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) { | |
3951 | fault_phys_offset = (vm_map_offset_t)-1; | |
3952 | fault_page_size = VM_MAP_PAGE_SIZE(original_map); | |
3953 | fault_page_mask = VM_MAP_PAGE_MASK(original_map); | |
3954 | if (fault_page_size < PAGE_SIZE) { | |
3955 | DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot); | |
3956 | vaddr = vm_map_trunc_page(vaddr, fault_page_mask); | |
3957 | } | |
3958 | } else { | |
3959 | fault_phys_offset = 0; | |
3960 | fault_page_size = PAGE_SIZE; | |
3961 | fault_page_mask = PAGE_MASK; | |
3962 | vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); | |
3963 | } | |
3964 | ||
3965 | if (map == kernel_map) { | |
3966 | trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr); | |
3967 | trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr); | |
3968 | } else { | |
3969 | trace_vaddr = vaddr; | |
3970 | } | |
3971 | ||
3972 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, | |
3973 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, | |
3974 | ((uint64_t)trace_vaddr >> 32), | |
3975 | trace_vaddr, | |
3976 | (map == kernel_map), | |
3977 | 0, | |
3978 | 0); | |
3979 | ||
3980 | if (get_preemption_level() != 0) { | |
3981 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, | |
3982 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, | |
3983 | ((uint64_t)trace_vaddr >> 32), | |
3984 | trace_vaddr, | |
3985 | KERN_FAILURE, | |
3986 | 0, | |
3987 | 0); | |
3988 | ||
3989 | return KERN_FAILURE; | |
3990 | } | |
3991 | ||
3992 | thread_t cthread = current_thread(); | |
3993 | bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME); | |
3994 | uint64_t fstart = 0; | |
3995 | ||
3996 | if (rtfault) { | |
3997 | fstart = mach_continuous_time(); | |
3998 | } | |
3999 | ||
4000 | interruptible_state = thread_interrupt_level(interruptible); | |
4001 | ||
4002 | fault_type = (change_wiring ? VM_PROT_NONE : caller_prot); | |
4003 | ||
4004 | counter_inc(&vm_statistics_faults); | |
4005 | counter_inc(¤t_task()->faults); | |
4006 | original_fault_type = fault_type; | |
4007 | ||
4008 | need_copy = FALSE; | |
4009 | if (fault_type & VM_PROT_WRITE) { | |
4010 | need_copy = TRUE; | |
4011 | } | |
4012 | ||
4013 | if (need_copy || change_wiring) { | |
4014 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4015 | } else { | |
4016 | object_lock_type = OBJECT_LOCK_SHARED; | |
4017 | } | |
4018 | ||
4019 | cur_object_lock_type = OBJECT_LOCK_SHARED; | |
4020 | ||
4021 | if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) { | |
4022 | if (compressor_map) { | |
4023 | if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) { | |
4024 | panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map)); | |
4025 | } | |
4026 | } | |
4027 | } | |
4028 | RetryFault: | |
4029 | assert(written_on_object == VM_OBJECT_NULL); | |
4030 | ||
4031 | /* | |
4032 | * assume we will hit a page in the cache | |
4033 | * otherwise, explicitly override with | |
4034 | * the real fault type once we determine it | |
4035 | */ | |
4036 | type_of_fault = DBG_CACHE_HIT_FAULT; | |
4037 | ||
4038 | /* | |
4039 | * Find the backing store object and offset into | |
4040 | * it to begin the search. | |
4041 | */ | |
4042 | fault_type = original_fault_type; | |
4043 | map = original_map; | |
4044 | vm_map_lock_read(map); | |
4045 | ||
4046 | if (resilient_media_retry) { | |
4047 | /* | |
4048 | * If we have to insert a fake zero-filled page to hide | |
4049 | * a media failure to provide the real page, we need to | |
4050 | * resolve any pending copy-on-write on this mapping. | |
4051 | * VM_PROT_COPY tells vm_map_lookup_locked() to deal | |
4052 | * with that even if this is not a "write" fault. | |
4053 | */ | |
4054 | need_copy = TRUE; | |
4055 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4056 | } | |
4057 | ||
4058 | kr = vm_map_lookup_locked(&map, vaddr, | |
4059 | (fault_type | (need_copy ? VM_PROT_COPY : 0)), | |
4060 | object_lock_type, &version, | |
4061 | &object, &offset, &prot, &wired, | |
4062 | &fault_info, | |
4063 | &real_map, | |
4064 | &object_is_contended); | |
4065 | ||
4066 | if (kr != KERN_SUCCESS) { | |
4067 | vm_map_unlock_read(map); | |
4068 | goto done; | |
4069 | } | |
4070 | ||
4071 | ||
4072 | pmap = real_map->pmap; | |
4073 | fault_info.interruptible = interruptible; | |
4074 | fault_info.stealth = FALSE; | |
4075 | fault_info.io_sync = FALSE; | |
4076 | fault_info.mark_zf_absent = FALSE; | |
4077 | fault_info.batch_pmap_op = FALSE; | |
4078 | ||
4079 | if (resilient_media_retry) { | |
4080 | /* | |
4081 | * We're retrying this fault after having detected a media | |
4082 | * failure from a "resilient_media" mapping. | |
4083 | * Check that the mapping is still pointing at the object | |
4084 | * that just failed to provide a page. | |
4085 | */ | |
4086 | assert(resilient_media_object != VM_OBJECT_NULL); | |
4087 | assert(resilient_media_offset != (vm_object_offset_t)-1); | |
4088 | if (object != VM_OBJECT_NULL && | |
4089 | object == resilient_media_object && | |
4090 | offset == resilient_media_offset && | |
4091 | fault_info.resilient_media) { | |
4092 | /* | |
4093 | * This mapping still points at the same object | |
4094 | * and is still "resilient_media": proceed in | |
4095 | * "recovery-from-media-failure" mode, where we'll | |
4096 | * insert a zero-filled page in the top object. | |
4097 | */ | |
4098 | // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset); | |
4099 | } else { | |
4100 | /* not recovering: reset state */ | |
4101 | // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset); | |
4102 | resilient_media_retry = FALSE; | |
4103 | /* release our extra reference on failed object */ | |
4104 | // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); | |
4105 | vm_object_deallocate(resilient_media_object); | |
4106 | resilient_media_object = VM_OBJECT_NULL; | |
4107 | resilient_media_offset = (vm_object_offset_t)-1; | |
4108 | } | |
4109 | } else { | |
4110 | assert(resilient_media_object == VM_OBJECT_NULL); | |
4111 | resilient_media_offset = (vm_object_offset_t)-1; | |
4112 | } | |
4113 | ||
4114 | /* | |
4115 | * If the page is wired, we must fault for the current protection | |
4116 | * value, to avoid further faults. | |
4117 | */ | |
4118 | if (wired) { | |
4119 | fault_type = prot | VM_PROT_WRITE; | |
4120 | } | |
4121 | if (wired || need_copy) { | |
4122 | /* | |
4123 | * since we're treating this fault as a 'write' | |
4124 | * we must hold the top object lock exclusively | |
4125 | */ | |
4126 | if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4127 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4128 | ||
4129 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4130 | /* | |
4131 | * couldn't upgrade, so explicitly | |
4132 | * take the lock exclusively | |
4133 | */ | |
4134 | vm_object_lock(object); | |
4135 | } | |
4136 | } | |
4137 | } | |
4138 | ||
4139 | #if VM_FAULT_CLASSIFY | |
4140 | /* | |
4141 | * Temporary data gathering code | |
4142 | */ | |
4143 | vm_fault_classify(object, offset, fault_type); | |
4144 | #endif | |
4145 | /* | |
4146 | * Fast fault code. The basic idea is to do as much as | |
4147 | * possible while holding the map lock and object locks. | |
4148 | * Busy pages are not used until the object lock has to | |
4149 | * be dropped to do something (copy, zero fill, pmap enter). | |
4150 | * Similarly, paging references aren't acquired until that | |
4151 | * point, and object references aren't used. | |
4152 | * | |
4153 | * If we can figure out what to do | |
4154 | * (zero fill, copy on write, pmap enter) while holding | |
4155 | * the locks, then it gets done. Otherwise, we give up, | |
4156 | * and use the original fault path (which doesn't hold | |
4157 | * the map lock, and relies on busy pages). | |
4158 | * The give up cases include: | |
4159 | * - Have to talk to pager. | |
4160 | * - Page is busy, absent or in error. | |
4161 | * - Pager has locked out desired access. | |
4162 | * - Fault needs to be restarted. | |
4163 | * - Have to push page into copy object. | |
4164 | * | |
4165 | * The code is an infinite loop that moves one level down | |
4166 | * the shadow chain each time. cur_object and cur_offset | |
4167 | * refer to the current object being examined. object and offset | |
4168 | * are the original object from the map. The loop is at the | |
4169 | * top level if and only if object and cur_object are the same. | |
4170 | * | |
4171 | * Invariants: Map lock is held throughout. Lock is held on | |
4172 | * original object and cur_object (if different) when | |
4173 | * continuing or exiting loop. | |
4174 | * | |
4175 | */ | |
4176 | ||
4177 | #if defined(__arm64__) | |
4178 | /* | |
4179 | * Fail if reading an execute-only page in a | |
4180 | * pmap that enforces execute-only protection. | |
4181 | */ | |
4182 | if (fault_type == VM_PROT_READ && | |
4183 | (prot & VM_PROT_EXECUTE) && | |
4184 | !(prot & VM_PROT_READ) && | |
4185 | pmap_enforces_execute_only(pmap)) { | |
4186 | vm_object_unlock(object); | |
4187 | vm_map_unlock_read(map); | |
4188 | if (real_map != map) { | |
4189 | vm_map_unlock(real_map); | |
4190 | } | |
4191 | kr = KERN_PROTECTION_FAILURE; | |
4192 | goto done; | |
4193 | } | |
4194 | #endif | |
4195 | ||
4196 | fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK); | |
4197 | ||
4198 | /* | |
4199 | * If this page is to be inserted in a copy delay object | |
4200 | * for writing, and if the object has a copy, then the | |
4201 | * copy delay strategy is implemented in the slow fault path (vm_fault_page). | |
4202 | */ | |
4203 | if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && | |
4204 | object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) { | |
4205 | goto handle_copy_delay; | |
4206 | } | |
4207 | ||
4208 | cur_object = object; | |
4209 | cur_offset = offset; | |
4210 | ||
4211 | grab_options = 0; | |
4212 | #if CONFIG_SECLUDED_MEMORY | |
4213 | if (object->can_grab_secluded) { | |
4214 | grab_options |= VM_PAGE_GRAB_SECLUDED; | |
4215 | } | |
4216 | #endif /* CONFIG_SECLUDED_MEMORY */ | |
4217 | ||
4218 | while (TRUE) { | |
4219 | if (!cur_object->pager_created && | |
4220 | cur_object->phys_contiguous) { /* superpage */ | |
4221 | break; | |
4222 | } | |
4223 | ||
4224 | if (cur_object->blocked_access) { | |
4225 | /* | |
4226 | * Access to this VM object has been blocked. | |
4227 | * Let the slow path handle it. | |
4228 | */ | |
4229 | break; | |
4230 | } | |
4231 | ||
4232 | m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset)); | |
4233 | m_object = NULL; | |
4234 | ||
4235 | if (m != VM_PAGE_NULL) { | |
4236 | m_object = cur_object; | |
4237 | ||
4238 | if (m->vmp_busy) { | |
4239 | wait_result_t result; | |
4240 | ||
4241 | /* | |
4242 | * in order to do the PAGE_ASSERT_WAIT, we must | |
4243 | * have the object that 'm' belongs to locked exclusively | |
4244 | */ | |
4245 | if (object != cur_object) { | |
4246 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4247 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4248 | ||
4249 | if (vm_object_lock_upgrade(cur_object) == FALSE) { | |
4250 | /* | |
4251 | * couldn't upgrade so go do a full retry | |
4252 | * immediately since we can no longer be | |
4253 | * certain about cur_object (since we | |
4254 | * don't hold a reference on it)... | |
4255 | * first drop the top object lock | |
4256 | */ | |
4257 | vm_object_unlock(object); | |
4258 | ||
4259 | vm_map_unlock_read(map); | |
4260 | if (real_map != map) { | |
4261 | vm_map_unlock(real_map); | |
4262 | } | |
4263 | ||
4264 | goto RetryFault; | |
4265 | } | |
4266 | } | |
4267 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4268 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4269 | ||
4270 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4271 | /* | |
4272 | * couldn't upgrade, so explicitly take the lock | |
4273 | * exclusively and go relookup the page since we | |
4274 | * will have dropped the object lock and | |
4275 | * a different thread could have inserted | |
4276 | * a page at this offset | |
4277 | * no need for a full retry since we're | |
4278 | * at the top level of the object chain | |
4279 | */ | |
4280 | vm_object_lock(object); | |
4281 | ||
4282 | continue; | |
4283 | } | |
4284 | } | |
4285 | if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) { | |
4286 | /* | |
4287 | * m->vmp_busy == TRUE and the object is locked exclusively | |
4288 | * if m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q after we acquire the | |
4289 | * queues lock, we are guaranteed that it is stable on | |
4290 | * the pageout queue and therefore reclaimable | |
4291 | * | |
4292 | * NOTE: this is only true for the internal pageout queue | |
4293 | * in the compressor world | |
4294 | */ | |
4295 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); | |
4296 | ||
4297 | vm_page_lock_queues(); | |
4298 | ||
4299 | if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { | |
4300 | vm_pageout_throttle_up(m); | |
4301 | vm_page_unlock_queues(); | |
4302 | ||
4303 | PAGE_WAKEUP_DONE(m); | |
4304 | goto reclaimed_from_pageout; | |
4305 | } | |
4306 | vm_page_unlock_queues(); | |
4307 | } | |
4308 | if (object != cur_object) { | |
4309 | vm_object_unlock(object); | |
4310 | } | |
4311 | ||
4312 | vm_map_unlock_read(map); | |
4313 | if (real_map != map) { | |
4314 | vm_map_unlock(real_map); | |
4315 | } | |
4316 | ||
4317 | result = PAGE_ASSERT_WAIT(m, interruptible); | |
4318 | ||
4319 | vm_object_unlock(cur_object); | |
4320 | ||
4321 | if (result == THREAD_WAITING) { | |
4322 | result = thread_block(THREAD_CONTINUE_NULL); | |
4323 | } | |
4324 | if (result == THREAD_AWAKENED || result == THREAD_RESTART) { | |
4325 | goto RetryFault; | |
4326 | } | |
4327 | ||
4328 | kr = KERN_ABORTED; | |
4329 | goto done; | |
4330 | } | |
4331 | reclaimed_from_pageout: | |
4332 | if (m->vmp_laundry) { | |
4333 | if (object != cur_object) { | |
4334 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4335 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4336 | ||
4337 | vm_object_unlock(object); | |
4338 | vm_object_unlock(cur_object); | |
4339 | ||
4340 | vm_map_unlock_read(map); | |
4341 | if (real_map != map) { | |
4342 | vm_map_unlock(real_map); | |
4343 | } | |
4344 | ||
4345 | goto RetryFault; | |
4346 | } | |
4347 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4348 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4349 | ||
4350 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4351 | /* | |
4352 | * couldn't upgrade, so explicitly take the lock | |
4353 | * exclusively and go relookup the page since we | |
4354 | * will have dropped the object lock and | |
4355 | * a different thread could have inserted | |
4356 | * a page at this offset | |
4357 | * no need for a full retry since we're | |
4358 | * at the top level of the object chain | |
4359 | */ | |
4360 | vm_object_lock(object); | |
4361 | ||
4362 | continue; | |
4363 | } | |
4364 | } | |
4365 | vm_pageout_steal_laundry(m, FALSE); | |
4366 | } | |
4367 | ||
4368 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
4369 | /* | |
4370 | * Guard page: let the slow path deal with it | |
4371 | */ | |
4372 | break; | |
4373 | } | |
4374 | if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) { | |
4375 | /* | |
4376 | * Unusual case... let the slow path deal with it | |
4377 | */ | |
4378 | break; | |
4379 | } | |
4380 | if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) { | |
4381 | if (object != cur_object) { | |
4382 | vm_object_unlock(object); | |
4383 | } | |
4384 | vm_map_unlock_read(map); | |
4385 | if (real_map != map) { | |
4386 | vm_map_unlock(real_map); | |
4387 | } | |
4388 | vm_object_unlock(cur_object); | |
4389 | kr = KERN_MEMORY_ERROR; | |
4390 | goto done; | |
4391 | } | |
4392 | assert(m_object == VM_PAGE_OBJECT(m)); | |
4393 | ||
4394 | if (vm_fault_cs_need_validation(map->pmap, m, m_object, | |
4395 | PAGE_SIZE, 0) || | |
4396 | (physpage_p != NULL && (prot & VM_PROT_WRITE))) { | |
4397 | upgrade_lock_and_retry: | |
4398 | /* | |
4399 | * We might need to validate this page | |
4400 | * against its code signature, so we | |
4401 | * want to hold the VM object exclusively. | |
4402 | */ | |
4403 | if (object != cur_object) { | |
4404 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4405 | vm_object_unlock(object); | |
4406 | vm_object_unlock(cur_object); | |
4407 | ||
4408 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4409 | ||
4410 | vm_map_unlock_read(map); | |
4411 | if (real_map != map) { | |
4412 | vm_map_unlock(real_map); | |
4413 | } | |
4414 | ||
4415 | goto RetryFault; | |
4416 | } | |
4417 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4418 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4419 | ||
4420 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4421 | /* | |
4422 | * couldn't upgrade, so explicitly take the lock | |
4423 | * exclusively and go relookup the page since we | |
4424 | * will have dropped the object lock and | |
4425 | * a different thread could have inserted | |
4426 | * a page at this offset | |
4427 | * no need for a full retry since we're | |
4428 | * at the top level of the object chain | |
4429 | */ | |
4430 | vm_object_lock(object); | |
4431 | ||
4432 | continue; | |
4433 | } | |
4434 | } | |
4435 | } | |
4436 | /* | |
4437 | * Two cases of map-in faults: | |
4438 | * - At top level w/o copy object. | |
4439 | * - Read fault anywhere. | |
4440 | * --> must disallow write. | |
4441 | */ | |
4442 | ||
4443 | if (object == cur_object && object->copy == VM_OBJECT_NULL) { | |
4444 | goto FastPmapEnter; | |
4445 | } | |
4446 | ||
4447 | if (!need_copy && | |
4448 | !fault_info.no_copy_on_read && | |
4449 | cur_object != object && | |
4450 | !cur_object->internal && | |
4451 | !cur_object->pager_trusted && | |
4452 | vm_protect_privileged_from_untrusted && | |
4453 | !((prot & VM_PROT_EXECUTE) && | |
4454 | cur_object->code_signed && | |
4455 | pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) && | |
4456 | current_proc_is_privileged()) { | |
4457 | /* | |
4458 | * We're faulting on a page in "object" and | |
4459 | * went down the shadow chain to "cur_object" | |
4460 | * to find out that "cur_object"'s pager | |
4461 | * is not "trusted", i.e. we can not trust it | |
4462 | * to always return the same contents. | |
4463 | * Since the target is a "privileged" process, | |
4464 | * let's treat this as a copy-on-read fault, as | |
4465 | * if it was a copy-on-write fault. | |
4466 | * Once "object" gets a copy of this page, it | |
4467 | * won't have to rely on "cur_object" to | |
4468 | * provide the contents again. | |
4469 | * | |
4470 | * This is done by setting "need_copy" and | |
4471 | * retrying the fault from the top with the | |
4472 | * appropriate locking. | |
4473 | * | |
4474 | * Special case: if the mapping is executable | |
4475 | * and the untrusted object is code-signed and | |
4476 | * the process is "cs_enforced", we do not | |
4477 | * copy-on-read because that would break | |
4478 | * code-signing enforcement expectations (an | |
4479 | * executable page must belong to a code-signed | |
4480 | * object) and we can rely on code-signing | |
4481 | * to re-validate the page if it gets evicted | |
4482 | * and paged back in. | |
4483 | */ | |
4484 | // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset); | |
4485 | vm_copied_on_read++; | |
4486 | need_copy = TRUE; | |
4487 | ||
4488 | vm_object_unlock(object); | |
4489 | vm_object_unlock(cur_object); | |
4490 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4491 | vm_map_unlock_read(map); | |
4492 | if (real_map != map) { | |
4493 | vm_map_unlock(real_map); | |
4494 | } | |
4495 | goto RetryFault; | |
4496 | } | |
4497 | ||
4498 | if (!(fault_type & VM_PROT_WRITE) && !need_copy) { | |
4499 | if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) { | |
4500 | prot &= ~VM_PROT_WRITE; | |
4501 | } else { | |
4502 | /* | |
4503 | * For a protection that the pmap cares | |
4504 | * about, we must hand over the full | |
4505 | * set of protections (so that the pmap | |
4506 | * layer can apply any desired policy). | |
4507 | * This means that cs_bypass must be | |
4508 | * set, as this can force us to pass | |
4509 | * RWX. | |
4510 | */ | |
4511 | assert(fault_info.cs_bypass); | |
4512 | } | |
4513 | ||
4514 | if (object != cur_object) { | |
4515 | /* | |
4516 | * We still need to hold the top object | |
4517 | * lock here to prevent a race between | |
4518 | * a read fault (taking only "shared" | |
4519 | * locks) and a write fault (taking | |
4520 | * an "exclusive" lock on the top | |
4521 | * object). | |
4522 | * Otherwise, as soon as we release the | |
4523 | * top lock, the write fault could | |
4524 | * proceed and actually complete before | |
4525 | * the read fault, and the copied page's | |
4526 | * translation could then be overwritten | |
4527 | * by the read fault's translation for | |
4528 | * the original page. | |
4529 | * | |
4530 | * Let's just record what the top object | |
4531 | * is and we'll release it later. | |
4532 | */ | |
4533 | top_object = object; | |
4534 | ||
4535 | /* | |
4536 | * switch to the object that has the new page | |
4537 | */ | |
4538 | object = cur_object; | |
4539 | object_lock_type = cur_object_lock_type; | |
4540 | } | |
4541 | FastPmapEnter: | |
4542 | assert(m_object == VM_PAGE_OBJECT(m)); | |
4543 | ||
4544 | /* | |
4545 | * prepare for the pmap_enter... | |
4546 | * object and map are both locked | |
4547 | * m contains valid data | |
4548 | * object == m->vmp_object | |
4549 | * cur_object == NULL or it's been unlocked | |
4550 | * no paging references on either object or cur_object | |
4551 | */ | |
4552 | if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) { | |
4553 | need_retry_ptr = &need_retry; | |
4554 | } else { | |
4555 | need_retry_ptr = NULL; | |
4556 | } | |
4557 | ||
4558 | if (fault_page_size < PAGE_SIZE) { | |
4559 | DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot); | |
4560 | assertf((!(fault_phys_offset & FOURK_PAGE_MASK) && | |
4561 | fault_phys_offset < PAGE_SIZE), | |
4562 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
4563 | } else { | |
4564 | assertf(fault_phys_offset == 0, | |
4565 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
4566 | } | |
4567 | ||
4568 | if (caller_pmap) { | |
4569 | kr = vm_fault_enter(m, | |
4570 | caller_pmap, | |
4571 | caller_pmap_addr, | |
4572 | fault_page_size, | |
4573 | fault_phys_offset, | |
4574 | prot, | |
4575 | caller_prot, | |
4576 | wired, | |
4577 | change_wiring, | |
4578 | wire_tag, | |
4579 | &fault_info, | |
4580 | need_retry_ptr, | |
4581 | &type_of_fault); | |
4582 | } else { | |
4583 | kr = vm_fault_enter(m, | |
4584 | pmap, | |
4585 | vaddr, | |
4586 | fault_page_size, | |
4587 | fault_phys_offset, | |
4588 | prot, | |
4589 | caller_prot, | |
4590 | wired, | |
4591 | change_wiring, | |
4592 | wire_tag, | |
4593 | &fault_info, | |
4594 | need_retry_ptr, | |
4595 | &type_of_fault); | |
4596 | } | |
4597 | ||
4598 | vm_fault_complete( | |
4599 | map, | |
4600 | real_map, | |
4601 | object, | |
4602 | m_object, | |
4603 | m, | |
4604 | offset, | |
4605 | trace_real_vaddr, | |
4606 | &fault_info, | |
4607 | caller_prot, | |
4608 | real_vaddr, | |
4609 | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), | |
4610 | need_retry, | |
4611 | kr, | |
4612 | physpage_p, | |
4613 | prot, | |
4614 | top_object, | |
4615 | need_collapse, | |
4616 | cur_offset, | |
4617 | fault_type, | |
4618 | &written_on_object, | |
4619 | &written_on_pager, | |
4620 | &written_on_offset); | |
4621 | top_object = VM_OBJECT_NULL; | |
4622 | if (need_retry == TRUE) { | |
4623 | /* | |
4624 | * vm_fault_enter couldn't complete the PMAP_ENTER... | |
4625 | * at this point we don't hold any locks so it's safe | |
4626 | * to ask the pmap layer to expand the page table to | |
4627 | * accommodate this mapping... once expanded, we'll | |
4628 | * re-drive the fault which should result in vm_fault_enter | |
4629 | * being able to successfully enter the mapping this time around | |
4630 | */ | |
4631 | (void)pmap_enter_options( | |
4632 | pmap, vaddr, 0, 0, 0, 0, 0, | |
4633 | PMAP_OPTIONS_NOENTER, NULL); | |
4634 | ||
4635 | need_retry = FALSE; | |
4636 | goto RetryFault; | |
4637 | } | |
4638 | goto done; | |
4639 | } | |
4640 | /* | |
4641 | * COPY ON WRITE FAULT | |
4642 | */ | |
4643 | assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); | |
4644 | ||
4645 | /* | |
4646 | * If objects match, then | |
4647 | * object->copy must not be NULL (else control | |
4648 | * would be in previous code block), and we | |
4649 | * have a potential push into the copy object | |
4650 | * which we can't cope with here. | 
4651 | */ | |
4652 | if (cur_object == object) { | |
4653 | /* | |
4654 | * must take the slow path to | |
4655 | * deal with the copy push | |
4656 | */ | |
4657 | break; | |
4658 | } | |
4659 | ||
4660 | /* | |
4661 | * This is now a shadow based copy on write | |
4662 | * fault -- it requires a copy up the shadow | |
4663 | * chain. | |
4664 | */ | |
4665 | assert(m_object == VM_PAGE_OBJECT(m)); | |
4666 | ||
4667 | if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && | |
4668 | vm_fault_cs_need_validation(NULL, m, m_object, | |
4669 | PAGE_SIZE, 0)) { | |
4670 | goto upgrade_lock_and_retry; | |
4671 | } | |
4672 | ||
4673 | /* | |
4674 | * Allocate a page in the original top level | |
4675 | * object. Give up if allocate fails. Also | |
4676 | * need to remember current page, as it's the | |
4677 | * source of the copy. | |
4678 | * | |
4679 | * at this point we hold locks on both | |
4680 | * object and cur_object... no need to take | |
4681 | * paging refs or mark pages BUSY since | |
4682 | * we don't drop either object lock until | |
4683 | * the page has been copied and inserted | |
4684 | */ | |
4685 | cur_m = m; | |
4686 | m = vm_page_grab_options(grab_options); | |
4687 | m_object = NULL; | |
4688 | ||
4689 | if (m == VM_PAGE_NULL) { | |
4690 | /* | |
4691 | * no free page currently available... | |
4692 | * must take the slow path | |
4693 | */ | |
4694 | break; | |
4695 | } | |
4696 | /* | |
4697 | * Now do the copy. Mark the source page busy... | |
4698 | * | |
4699 | * NOTE: This code holds the map lock across | |
4700 | * the page copy. | |
4701 | */ | |
4702 | vm_page_copy(cur_m, m); | |
4703 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
4704 | if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) { | |
4705 | DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset); | |
4706 | } | |
4707 | m_object = object; | |
4708 | SET_PAGE_DIRTY(m, FALSE); | |
4709 | ||
4710 | /* | |
4711 | * Now cope with the source page and object | |
4712 | */ | |
4713 | if (object->ref_count > 1 && cur_m->vmp_pmapped) { | |
4714 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); | |
4715 | } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) { | |
4716 | /* | |
4717 | * We've copied the full 16K page but we're | |
4718 | * about to call vm_fault_enter() only for | |
4719 | * the 4K chunk we're faulting on. The other | |
4720 | * three 4K chunks in that page could still | |
4721 | * be pmapped in this pmap. | |
4722 | * Since the VM object layer thinks that the | |
4723 | * entire page has been dealt with and the | |
4724 | * original page might no longer be needed, | |
4725 | * it might collapse/bypass the original VM | |
4726 | * object and free its pages, which would be | |
4727 | * bad (and would trigger pmap_verify_free() | |
4728 | * assertions) if the other 4K chunks are still | |
4729 | * pmapped. | |
4730 | */ | |
4731 | /* | |
4732 | * XXX FBDP TODO4K: to be revisited | 
4733 | * Technically, we need to pmap_disconnect() | |
4734 | * only the target pmap's mappings for the 4K | |
4735 | * chunks of this 16K VM page. If other pmaps | |
4736 | * have PTEs on these chunks, that means that | |
4737 | * the associated VM map must have a reference | |
4738 | * on the VM object, so no need to worry about | |
4739 | * those. | |
4740 | * pmap_protect() for each 4K chunk would be | |
4741 | * better but we'd have to check which chunks | |
4742 | * are actually mapped before and after this | |
4743 | * one. | |
4744 | * A full-blown pmap_disconnect() is easier | |
4745 | * for now but not efficient. | |
4746 | */ | |
4747 | DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m)); | |
4748 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); | |
4749 | } | |
4750 | ||
4751 | if (cur_m->vmp_clustered) { | |
4752 | VM_PAGE_COUNT_AS_PAGEIN(cur_m); | |
4753 | VM_PAGE_CONSUME_CLUSTERED(cur_m); | |
4754 | vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior); | |
4755 | } | |
4756 | need_collapse = TRUE; | |
4757 | ||
4758 | if (!cur_object->internal && | |
4759 | cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) { | |
4760 | /* | |
4761 | * The object from which we've just | |
4762 | * copied a page is most probably backed | |
4763 | * by a vnode. We don't want to waste too | |
4764 | * much time trying to collapse the VM objects | |
4765 | * and create a bottleneck when several tasks | |
4766 | * map the same file. | |
4767 | */ | |
4768 | if (cur_object->copy == object) { | |
4769 | /* | |
4770 | * Shared mapping or no COW yet. | |
4771 | * We can never collapse a copy | |
4772 | * object into its backing object. | |
4773 | */ | |
4774 | need_collapse = FALSE; | |
4775 | } else if (cur_object->copy == object->shadow && | |
4776 | object->shadow->resident_page_count == 0) { | |
4777 | /* | |
4778 | * Shared mapping after a COW occurred. | |
4779 | */ | |
4780 | need_collapse = FALSE; | |
4781 | } | |
4782 | } | |
4783 | vm_object_unlock(cur_object); | |
4784 | ||
4785 | if (need_collapse == FALSE) { | |
4786 | vm_fault_collapse_skipped++; | |
4787 | } | |
4788 | vm_fault_collapse_total++; | |
4789 | ||
4790 | type_of_fault = DBG_COW_FAULT; | |
4791 | counter_inc(&vm_statistics_cow_faults); | |
4792 | DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); | |
4793 | current_task()->cow_faults++; | |
4794 | ||
4795 | goto FastPmapEnter; | |
4796 | } else { | |
4797 | /* | |
4798 | * No page at cur_object, cur_offset... m == NULL | |
4799 | */ | |
4800 | if (cur_object->pager_created) { | |
4801 | vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; | |
4802 | ||
4803 | if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) { | |
4804 | int my_fault_type; | |
4805 | uint8_t c_flags = C_DONT_BLOCK; | |
4806 | bool insert_cur_object = FALSE; | |
4807 | ||
4808 | /* | |
4809 | * May have to talk to a pager... | |
4810 | * if so, take the slow path by | |
4811 | * doing a 'break' from the while (TRUE) loop | |
4812 | * | |
4813 | * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS | 
4814 | * if the compressor is active and the page exists there | |
4815 | */ | |
4816 | if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) { | |
4817 | break; | |
4818 | } | |
4819 | ||
4820 | if (map == kernel_map || real_map == kernel_map) { | |
4821 | /* | |
4822 | * can't call into the compressor with the kernel_map | |
4823 | * lock held, since the compressor may try to operate | |
4824 | * on the kernel map in order to return an empty c_segment | |
4825 | */ | |
4826 | break; | |
4827 | } | |
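| /* | 
| * Decompressing for a copy-on-write fault (object != cur_object): | 
| * on a write we keep the compressed copy in the backing object | 
| * (C_KEEP) and push the decompressed page into the top object; | 
| * on a read we can insert the decompressed page directly into | 
| * the backing object instead. | 
| */ | 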
4828 | if (object != cur_object) { | |
4829 | if (fault_type & VM_PROT_WRITE) { | |
4830 | c_flags |= C_KEEP; | |
4831 | } else { | |
4832 | insert_cur_object = TRUE; | |
4833 | } | |
4834 | } | |
4835 | if (insert_cur_object == TRUE) { | |
4836 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
4837 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4838 | ||
4839 | if (vm_object_lock_upgrade(cur_object) == FALSE) { | |
4840 | /* | |
4841 | * couldn't upgrade so go do a full retry | |
4842 | * immediately since we can no longer be | |
4843 | * certain about cur_object (since we | |
4844 | * don't hold a reference on it)... | |
4845 | * first drop the top object lock | |
4846 | */ | |
4847 | vm_object_unlock(object); | |
4848 | ||
4849 | vm_map_unlock_read(map); | |
4850 | if (real_map != map) { | |
4851 | vm_map_unlock(real_map); | |
4852 | } | |
4853 | ||
4854 | goto RetryFault; | |
4855 | } | |
4856 | } | |
4857 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { | |
4858 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
4859 | ||
4860 | if (object != cur_object) { | |
4861 | /* | |
4862 | * we can't go for the upgrade on the top | |
4863 | * lock since the upgrade may block waiting | |
4864 | * for readers to drain... since we hold | |
4865 | * cur_object locked at this point, waiting | |
4866 | * for the readers to drain would represent | |
4867 | * a lock order inversion since the lock order | |
4868 | * for objects is the reference order in the | |
4869 | * shadow chain | 
4870 | */ | |
4871 | vm_object_unlock(object); | |
4872 | vm_object_unlock(cur_object); | |
4873 | ||
4874 | vm_map_unlock_read(map); | |
4875 | if (real_map != map) { | |
4876 | vm_map_unlock(real_map); | |
4877 | } | |
4878 | ||
4879 | goto RetryFault; | |
4880 | } | |
4881 | if (vm_object_lock_upgrade(object) == FALSE) { | |
4882 | /* | |
4883 | * couldn't upgrade, so explicitly take the lock | 
4884 | * exclusively and go relookup the page since we | |
4885 | * will have dropped the object lock and | |
4886 | * a different thread could have inserted | |
4887 | * a page at this offset | |
4888 | * no need for a full retry since we're | |
4889 | * at the top level of the object chain | |
4890 | */ | |
4891 | vm_object_lock(object); | |
4892 | ||
4893 | continue; | |
4894 | } | |
4895 | } | |
4896 | m = vm_page_grab_options(grab_options); | |
4897 | m_object = NULL; | |
4898 | ||
4899 | if (m == VM_PAGE_NULL) { | |
4900 | /* | |
4901 | * no free page currently available... | |
4902 | * must take the slow path | |
4903 | */ | |
4904 | break; | |
4905 | } | |
4906 | ||
4907 | /* | |
4908 | * The object is and remains locked | |
4909 | * so no need to take a | |
4910 | * "paging_in_progress" reference. | |
4911 | */ | |
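| /* | 
| * Record whether cur_object (the object whose pager we are | 
| * asking) is only held with a shared lock; vm_compressor_pager_count() | 
| * needs to know so it can update that object's compressed-page | 
| * accounting appropriately. | 
| */ | 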
4912 | bool shared_lock; | |
4913 | if ((object == cur_object && | |
4914 | object_lock_type == OBJECT_LOCK_EXCLUSIVE) || | |
4915 | (object != cur_object && | |
4916 | cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) { | |
4917 | shared_lock = FALSE; | |
4918 | } else { | |
4919 | shared_lock = TRUE; | |
4920 | } | |
4921 | ||
4922 | kr = vm_compressor_pager_get( | |
4923 | cur_object->pager, | |
4924 | (vm_object_trunc_page(cur_offset) | |
4925 | + cur_object->paging_offset), | |
4926 | VM_PAGE_GET_PHYS_PAGE(m), | |
4927 | &my_fault_type, | |
4928 | c_flags, | |
4929 | &compressed_count_delta); | |
4930 | ||
4931 | vm_compressor_pager_count( | |
4932 | cur_object->pager, | |
4933 | compressed_count_delta, | |
4934 | shared_lock, | |
4935 | cur_object); | |
4936 | ||
4937 | if (kr != KERN_SUCCESS) { | |
4938 | vm_page_release(m, FALSE); | |
4939 | m = VM_PAGE_NULL; | |
4940 | } | |
4941 | /* | |
4942 | * If vm_compressor_pager_get() returns | |
4943 | * KERN_MEMORY_FAILURE, then the | |
4944 | * compressed data is permanently lost, | |
4945 | * so return this error immediately. | |
4946 | */ | |
4947 | if (kr == KERN_MEMORY_FAILURE) { | |
4948 | if (object != cur_object) { | |
4949 | vm_object_unlock(cur_object); | |
4950 | } | |
4951 | vm_object_unlock(object); | |
4952 | vm_map_unlock_read(map); | |
4953 | if (real_map != map) { | |
4954 | vm_map_unlock(real_map); | |
4955 | } | |
4956 | goto done; | |
4957 | } else if (kr != KERN_SUCCESS) { | |
4958 | break; | |
4959 | } | |
4960 | m->vmp_dirty = TRUE; | |
4961 | ||
4962 | /* | |
4963 | * If the object is purgeable, its | |
4964 | * owner's purgeable ledgers will be | |
4965 | * updated in vm_page_insert() but the | |
4966 | * page was also accounted for in a | |
4967 | * "compressed purgeable" ledger, so | |
4968 | * update that now. | |
4969 | */ | |
4970 | if (object != cur_object && | |
4971 | !insert_cur_object) { | |
4972 | /* | |
4973 | * We're not going to insert | |
4974 | * the decompressed page into | |
4975 | * the object it came from. | |
4976 | * | |
4977 | * We're dealing with a | |
4978 | * copy-on-write fault on | |
4979 | * "object". | |
4980 | * We're going to decompress | |
4981 | * the page directly into the | |
4982 | * target "object" while | |
4983 | * keeping the compressed | 
4984 | * page for "cur_object", so | |
4985 | * no ledger update in that | |
4986 | * case. | |
4987 | */ | |
4988 | } else if (((cur_object->purgable == | |
4989 | VM_PURGABLE_DENY) && | |
4990 | (!cur_object->vo_ledger_tag)) || | |
4991 | (cur_object->vo_owner == | |
4992 | NULL)) { | |
4993 | /* | |
4994 | * "cur_object" is not purgeable | |
4995 | * and is not ledger-tagged, or | 
4996 | * there's no owner for it, | |
4997 | * so no owner's ledgers to | |
4998 | * update. | |
4999 | */ | |
5000 | } else { | |
5001 | /* | |
5002 | * One less compressed | |
5003 | * purgeable/tagged page for | |
5004 | * cur_object's owner. | |
5005 | */ | |
5006 | vm_object_owner_compressed_update( | |
5007 | cur_object, | |
5008 | -1); | |
5009 | } | |
5010 | ||
5011 | if (insert_cur_object) { | |
5012 | vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset)); | |
5013 | m_object = cur_object; | |
5014 | } else { | |
5015 | vm_page_insert(m, object, vm_object_trunc_page(offset)); | |
5016 | m_object = object; | |
5017 | } | |
5018 | ||
5019 | if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { | |
5020 | /* | |
5021 | * If the page is not cacheable, | |
5022 | * we can't let its contents | |
5023 | * linger in the data cache | |
5024 | * after the decompression. | |
5025 | */ | |
5026 | pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
5027 | } | |
5028 | ||
5029 | type_of_fault = my_fault_type; | |
5030 | ||
5031 | VM_STAT_DECOMPRESSIONS(); | |
5032 | ||
5033 | if (cur_object != object) { | |
5034 | if (insert_cur_object) { | |
5035 | top_object = object; | |
5036 | /* | |
5037 | * switch to the object that has the new page | |
5038 | */ | |
5039 | object = cur_object; | |
5040 | object_lock_type = cur_object_lock_type; | |
5041 | } else { | |
5042 | vm_object_unlock(cur_object); | |
5043 | cur_object = object; | |
5044 | } | |
5045 | } | |
5046 | goto FastPmapEnter; | |
5047 | } | |
5048 | /* | |
5049 | * existence map present and indicates | |
5050 | * that the pager doesn't have this page | |
5051 | */ | |
5052 | } | |
5053 | if (cur_object->shadow == VM_OBJECT_NULL || | |
5054 | resilient_media_retry) { | |
5055 | /* | |
5056 | * Zero fill fault. Page gets | |
5057 | * inserted into the original object. | |
5058 | */ | |
5059 | if (cur_object->shadow_severed || | |
5060 | VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) || | |
5061 | cur_object == compressor_object || | |
5062 | cur_object == kernel_object || | |
5063 | cur_object == vm_submap_object) { | |
5064 | if (object != cur_object) { | |
5065 | vm_object_unlock(cur_object); | |
5066 | } | |
5067 | vm_object_unlock(object); | |
5068 | ||
5069 | vm_map_unlock_read(map); | |
5070 | if (real_map != map) { | |
5071 | vm_map_unlock(real_map); | |
5072 | } | |
5073 | ||
5074 | kr = KERN_MEMORY_ERROR; | |
5075 | goto done; | |
5076 | } | |
5077 | if (cur_object != object) { | |
5078 | vm_object_unlock(cur_object); | |
5079 | ||
5080 | cur_object = object; | |
5081 | } | |
5082 | if (object_lock_type == OBJECT_LOCK_SHARED) { | |
5083 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
5084 | ||
5085 | if (vm_object_lock_upgrade(object) == FALSE) { | |
5086 | /* | |
5087 | * couldn't upgrade so do a full retry on the fault | |
5088 | * since we dropped the object lock which | |
5089 | * could allow another thread to insert | |
5090 | * a page at this offset | |
5091 | */ | |
5092 | vm_map_unlock_read(map); | |
5093 | if (real_map != map) { | |
5094 | vm_map_unlock(real_map); | |
5095 | } | |
5096 | ||
5097 | goto RetryFault; | |
5098 | } | |
5099 | } | |
5100 | if (!object->internal) { | |
5101 | panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object); | |
5102 | } | |
5103 | m = vm_page_alloc(object, vm_object_trunc_page(offset)); | |
5104 | m_object = NULL; | |
5105 | ||
5106 | if (m == VM_PAGE_NULL) { | |
5107 | /* | |
5108 | * no free page currently available... | |
5109 | * must take the slow path | |
5110 | */ | |
5111 | break; | |
5112 | } | |
5113 | m_object = object; | |
5114 | ||
5115 | /* | |
5116 | * Zeroing the page and entering it into the pmap | 
5117 | * represents a significant amount of the zero fill fault handler's work. | |
5118 | * | |
5119 | * To improve fault scalability, we'll drop the object lock, if it appears contended, | |
5120 | * now that we've inserted the page into the vm object. | |
5121 | * Before dropping the lock, we need to check protection bits and set the | |
5122 | * mapped bits on the page. Then we can mark the page busy, drop the lock, | |
5123 | * zero it, and do the pmap enter. We'll need to reacquire the lock | |
5124 | * to clear the busy bit and wake up any waiters. | |
5125 | */ | |
5126 | vm_fault_cs_clear(m); | |
5127 | m->vmp_pmapped = TRUE; | |
5128 | if (map->no_zero_fill) { | |
5129 | type_of_fault = DBG_NZF_PAGE_FAULT; | |
5130 | } else { | |
5131 | type_of_fault = DBG_ZERO_FILL_FAULT; | |
5132 | } | |
5133 | { | |
5134 | pmap_t destination_pmap; | |
5135 | vm_map_offset_t destination_pmap_vaddr; | |
5136 | vm_prot_t enter_fault_type; | |
5137 | if (caller_pmap) { | |
5138 | destination_pmap = caller_pmap; | |
5139 | destination_pmap_vaddr = caller_pmap_addr; | |
5140 | } else { | |
5141 | destination_pmap = pmap; | |
5142 | destination_pmap_vaddr = vaddr; | |
5143 | } | |
5144 | if (change_wiring) { | |
5145 | enter_fault_type = VM_PROT_NONE; | |
5146 | } else { | |
5147 | enter_fault_type = caller_prot; | |
5148 | } | |
5149 | kr = vm_fault_enter_prepare(m, | |
5150 | destination_pmap, | |
5151 | destination_pmap_vaddr, | |
5152 | &prot, | |
5153 | caller_prot, | |
5154 | fault_page_size, | |
5155 | fault_phys_offset, | |
5156 | change_wiring, | |
5157 | enter_fault_type, | |
5158 | &fault_info, | |
5159 | &type_of_fault, | |
5160 | &page_needs_data_sync); | |
5161 | if (kr != KERN_SUCCESS) { | |
5162 | goto zero_fill_cleanup; | |
5163 | } | |
5164 | ||
5165 | if (object_is_contended) { | |
5166 | /* | |
5167 | * At this point the page is in the vm object, but not on a paging queue. | |
5168 | * Since it's accessible to another thread but its contents are invalid | |
5169 | * (it hasn't been zeroed) mark it busy before dropping the object lock. | |
5170 | */ | |
5171 | m->vmp_busy = TRUE; | |
5172 | vm_object_unlock(object); | |
5173 | } | |
5174 | if (type_of_fault == DBG_ZERO_FILL_FAULT) { | |
5175 | /* | |
5176 | * Now zero fill page... | |
5177 | * the page is probably going to | |
5178 | * be written soon, so don't bother | |
5179 | * to clear the modified bit | |
5180 | * | |
5181 | * NOTE: This code holds the map | |
5182 | * lock across the zero fill. | |
5183 | */ | |
5184 | vm_page_zero_fill(m); | |
5185 | counter_inc(&vm_statistics_zero_fill_count); | |
5186 | DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); | |
5187 | } | |
5188 | if (page_needs_data_sync) { | |
5189 | pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m)); | |
5190 | } | |
5191 | ||
5192 | if (top_object != VM_OBJECT_NULL) { | |
5193 | need_retry_ptr = &need_retry; | |
5194 | } else { | |
5195 | need_retry_ptr = NULL; | |
5196 | } | |
5197 | if (object_is_contended) { | |
5198 | kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr, | |
5199 | fault_page_size, fault_phys_offset, | |
5200 | m, &prot, caller_prot, enter_fault_type, wired, | |
5201 | fault_info.pmap_options, need_retry_ptr); | |
5202 | vm_object_lock(object); | |
5203 | } else { | |
5204 | kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr, | |
5205 | fault_page_size, fault_phys_offset, | |
5206 | m, &prot, caller_prot, enter_fault_type, wired, | |
5207 | fault_info.pmap_options, need_retry_ptr); | |
5208 | } | |
5209 | } | |
5210 | zero_fill_cleanup: | |
5211 | if (!VM_DYNAMIC_PAGING_ENABLED() && | |
5212 | (object->purgable == VM_PURGABLE_DENY || | |
5213 | object->purgable == VM_PURGABLE_NONVOLATILE || | |
5214 | object->purgable == VM_PURGABLE_VOLATILE)) { | |
5215 | vm_page_lockspin_queues(); | |
5216 | if (!VM_DYNAMIC_PAGING_ENABLED()) { | |
5217 | vm_fault_enqueue_throttled_locked(m); | |
5218 | } | |
5219 | vm_page_unlock_queues(); | |
5220 | } | |
5221 | vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr); | |
5222 | ||
5223 | vm_fault_complete( | |
5224 | map, | |
5225 | real_map, | |
5226 | object, | |
5227 | m_object, | |
5228 | m, | |
5229 | offset, | |
5230 | trace_real_vaddr, | |
5231 | &fault_info, | |
5232 | caller_prot, | |
5233 | real_vaddr, | |
5234 | type_of_fault, | |
5235 | need_retry, | |
5236 | kr, | |
5237 | physpage_p, | |
5238 | prot, | |
5239 | top_object, | |
5240 | need_collapse, | |
5241 | cur_offset, | |
5242 | fault_type, | |
5243 | &written_on_object, | |
5244 | &written_on_pager, | |
5245 | &written_on_offset); | |
5246 | top_object = VM_OBJECT_NULL; | |
5247 | if (need_retry == TRUE) { | |
5248 | /* | |
5249 | * vm_fault_enter couldn't complete the PMAP_ENTER... | |
5250 | * at this point we don't hold any locks so it's safe | |
5251 | * to ask the pmap layer to expand the page table to | |
5252 | * accommodate this mapping... once expanded, we'll | |
5253 | * re-drive the fault which should result in vm_fault_enter | |
5254 | * being able to successfully enter the mapping this time around | |
5255 | */ | |
5256 | (void)pmap_enter_options( | |
5257 | pmap, vaddr, 0, 0, 0, 0, 0, | |
5258 | PMAP_OPTIONS_NOENTER, NULL); | |
5259 | ||
5260 | need_retry = FALSE; | |
5261 | goto RetryFault; | |
5262 | } | |
5263 | goto done; | |
5264 | } | |
5265 | /* | |
5266 | * On to the next level in the shadow chain | |
5267 | */ | |
5268 | cur_offset += cur_object->vo_shadow_offset; | |
5269 | new_object = cur_object->shadow; | |
5270 | fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset); | |
5271 | ||
5272 | /* | |
5273 | * take the new_object's lock with the indicated state | |
5274 | */ | |
5275 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { | |
5276 | vm_object_lock_shared(new_object); | |
5277 | } else { | |
5278 | vm_object_lock(new_object); | |
5279 | } | |
5280 | ||
5281 | if (cur_object != object) { | |
5282 | vm_object_unlock(cur_object); | |
5283 | } | |
5284 | ||
5285 | cur_object = new_object; | |
5286 | ||
5287 | continue; | |
5288 | } | |
5289 | } | |
5290 | /* | |
5291 | * Cleanup from fast fault failure. Drop any object | |
5292 | * lock other than original and drop map lock. | |
5293 | */ | |
5294 | if (object != cur_object) { | |
5295 | vm_object_unlock(cur_object); | |
5296 | } | |
5297 | ||
5298 | /* | |
5299 | * must own the object lock exclusively at this point | |
5300 | */ | |
5301 | if (object_lock_type == OBJECT_LOCK_SHARED) { | |
5302 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
5303 | ||
5304 | if (vm_object_lock_upgrade(object) == FALSE) { | |
5305 | /* | |
5306 | * couldn't upgrade, so explicitly | 
5307 | * take the lock exclusively | |
5308 | * no need to retry the fault at this | |
5309 | * point since "vm_fault_page" will | |
5310 | * completely re-evaluate the state | |
5311 | */ | |
5312 | vm_object_lock(object); | |
5313 | } | |
5314 | } | |
5315 | ||
5316 | handle_copy_delay: | |
5317 | vm_map_unlock_read(map); | |
5318 | if (real_map != map) { | |
5319 | vm_map_unlock(real_map); | |
5320 | } | |
5321 | ||
5322 | if (__improbable(object == compressor_object || | |
5323 | object == kernel_object || | |
5324 | object == vm_submap_object)) { | |
5325 | /* | |
5326 | * These objects are explicitly managed and populated by the | |
5327 | * kernel. The virtual ranges backed by these objects should | |
5328 | * either have wired pages or "holes" that are not supposed to | |
5329 | * be accessed at all until they get explicitly populated. | |
5330 | * We should never have to resolve a fault on a mapping backed | |
5331 | * by one of these VM objects and providing a zero-filled page | |
5332 | * would be wrong here, so let's fail the fault and let the | |
5333 | * caller crash or recover. | |
5334 | */ | |
5335 | vm_object_unlock(object); | |
5336 | kr = KERN_MEMORY_ERROR; | |
5337 | goto done; | |
5338 | } | |
5339 | ||
5340 | assert(object != compressor_object); | |
5341 | assert(object != kernel_object); | |
5342 | assert(object != vm_submap_object); | |
5343 | ||
5344 | if (resilient_media_retry) { | |
5345 | /* | |
5346 | * We could get here if we failed to get a free page | |
5347 | * to zero-fill and had to take the slow path again. | |
5348 | * Reset our "recovery-from-failed-media" state. | |
5349 | */ | |
5350 | assert(resilient_media_object != VM_OBJECT_NULL); | |
5351 | assert(resilient_media_offset != (vm_object_offset_t)-1); | |
5352 | /* release our extra reference on failed object */ | |
5353 | // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); | |
5354 | vm_object_deallocate(resilient_media_object); | |
5355 | resilient_media_object = VM_OBJECT_NULL; | |
5356 | resilient_media_offset = (vm_object_offset_t)-1; | |
5357 | resilient_media_retry = FALSE; | |
5358 | } | |
5359 | ||
5360 | /* | |
5361 | * Make a reference to this object to | |
5362 | * prevent its disposal while we are messing with | |
5363 | * it. Once we have the reference, the map is free | |
5364 | * to be diddled. Since objects reference their | |
5365 | * shadows (and copies), they will stay around as well. | |
5366 | */ | |
5367 | vm_object_reference_locked(object); | |
5368 | vm_object_paging_begin(object); | |
5369 | ||
5370 | set_thread_pagein_error(cthread, 0); | |
5371 | error_code = 0; | |
5372 | ||
5373 | result_page = VM_PAGE_NULL; | |
5374 | kr = vm_fault_page(object, offset, fault_type, | |
5375 | (change_wiring && !wired), | |
5376 | FALSE, /* page not looked up */ | |
5377 | &prot, &result_page, &top_page, | |
5378 | &type_of_fault, | |
5379 | &error_code, map->no_zero_fill, | |
5380 | FALSE, &fault_info); | |
5381 | ||
5382 | /* | |
5383 | * if kr != VM_FAULT_SUCCESS, then the paging reference | |
5384 | * has been dropped and the object unlocked... the ref_count | |
5385 | * is still held | |
5386 | * | |
5387 | * if kr == VM_FAULT_SUCCESS, then the paging reference | |
5388 | * is still held along with the ref_count on the original object | |
5389 | * | |
5390 | * the object is returned locked with a paging reference | |
5391 | * | |
5392 | * if top_page != NULL, then it's BUSY and the | |
5393 | * object it belongs to has a paging reference | |
5394 | * but is returned unlocked | |
5395 | */ | |
5396 | if (kr != VM_FAULT_SUCCESS && | |
5397 | kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { | |
5398 | if (kr == VM_FAULT_MEMORY_ERROR && | |
5399 | fault_info.resilient_media) { | |
5400 | assertf(object->internal, "object %p", object); | |
5401 | /* | |
5402 | * This fault failed but the mapping was | |
5403 | * "media resilient", so we'll retry the fault in | |
5404 | * recovery mode to get a zero-filled page in the | |
5405 | * top object. | |
5406 | * Keep the reference on the failing object so | |
5407 | * that we can check that the mapping is still | |
5408 | * pointing to it when we retry the fault. | |
5409 | */ | |
5410 | // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page); | |
5411 | assert(!resilient_media_retry); /* no double retry */ | |
5412 | assert(resilient_media_object == VM_OBJECT_NULL); | |
5413 | assert(resilient_media_offset == (vm_object_offset_t)-1); | |
5414 | resilient_media_retry = TRUE; | |
5415 | resilient_media_object = object; | |
5416 | resilient_media_offset = offset; | |
5417 | // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset); | |
5418 | goto RetryFault; | |
5419 | } else { | |
5420 | /* | |
5421 | * we didn't succeed, lose the object reference | |
5422 | * immediately. | |
5423 | */ | |
5424 | vm_object_deallocate(object); | |
5425 | object = VM_OBJECT_NULL; /* no longer valid */ | |
5426 | } | |
5427 | ||
5428 | /* | |
5429 | * See why we failed, and take corrective action. | |
5430 | */ | |
5431 | switch (kr) { | |
5432 | case VM_FAULT_MEMORY_SHORTAGE: | |
5433 | if (vm_page_wait((change_wiring) ? | |
5434 | THREAD_UNINT : | |
5435 | THREAD_ABORTSAFE)) { | |
5436 | goto RetryFault; | |
5437 | } | |
5438 | OS_FALLTHROUGH; | |
5439 | case VM_FAULT_INTERRUPTED: | |
5440 | kr = KERN_ABORTED; | |
5441 | goto done; | |
5442 | case VM_FAULT_RETRY: | |
5443 | goto RetryFault; | |
5444 | case VM_FAULT_MEMORY_ERROR: | |
5445 | if (error_code) { | |
5446 | kr = error_code; | |
5447 | } else { | |
5448 | kr = KERN_MEMORY_ERROR; | |
5449 | } | |
5450 | goto done; | |
5451 | default: | |
5452 | panic("vm_fault: unexpected error 0x%x from " | |
5453 | "vm_fault_page()\n", kr); | |
5454 | } | |
5455 | } | |
5456 | m = result_page; | |
5457 | m_object = NULL; | |
5458 | ||
5459 | if (m != VM_PAGE_NULL) { | |
5460 | m_object = VM_PAGE_OBJECT(m); | |
5461 | assert((change_wiring && !wired) ? | |
5462 | (top_page == VM_PAGE_NULL) : | |
5463 | ((top_page == VM_PAGE_NULL) == (m_object == object))); | |
5464 | } | |
5465 | ||
5466 | /* | |
5467 | * What to do with the resulting page from vm_fault_page | |
5468 | * if it doesn't get entered into the physical map: | |
5469 | */ | |
5470 | #define RELEASE_PAGE(m) \ | |
5471 | MACRO_BEGIN \ | |
5472 | PAGE_WAKEUP_DONE(m); \ | |
5473 | if ( !VM_PAGE_PAGEABLE(m)) { \ | |
5474 | vm_page_lockspin_queues(); \ | |
5475 | if ( !VM_PAGE_PAGEABLE(m)) \ | |
5476 | vm_page_activate(m); \ | |
5477 | vm_page_unlock_queues(); \ | |
5478 | } \ | |
5479 | MACRO_END | |
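| /* | 
| * Typical use below, when the map re-lookup fails or the fault | 
| * has to be retried after vm_fault_page() handed us a busy page: | 
| * | 
| *	RELEASE_PAGE(m); | 
| *	vm_fault_cleanup(m_object, top_page); | 
| */ | 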
5480 | ||
5481 | ||
5482 | object_locks_dropped = FALSE; | |
5483 | /* | |
5484 | * We must verify that the maps have not changed | |
5485 | * since our last lookup. vm_map_verify() needs the | |
5486 | * map lock (shared) but we are holding object locks. | |
5487 | * So we do a try_lock() first and, if that fails, we | |
5488 | * drop the object locks and go in for the map lock again. | |
5489 | */ | |
5490 | if (!vm_map_try_lock_read(original_map)) { | |
5491 | if (m != VM_PAGE_NULL) { | |
5492 | old_copy_object = m_object->copy; | |
5493 | vm_object_unlock(m_object); | |
5494 | } else { | |
5495 | old_copy_object = VM_OBJECT_NULL; | |
5496 | vm_object_unlock(object); | |
5497 | } | |
5498 | ||
5499 | object_locks_dropped = TRUE; | |
5500 | ||
5501 | vm_map_lock_read(original_map); | |
5502 | } | |
5503 | ||
5504 | if ((map != original_map) || !vm_map_verify(map, &version)) { | |
5505 | if (object_locks_dropped == FALSE) { | |
5506 | if (m != VM_PAGE_NULL) { | |
5507 | old_copy_object = m_object->copy; | |
5508 | vm_object_unlock(m_object); | |
5509 | } else { | |
5510 | old_copy_object = VM_OBJECT_NULL; | |
5511 | vm_object_unlock(object); | |
5512 | } | |
5513 | ||
5514 | object_locks_dropped = TRUE; | |
5515 | } | |
5516 | ||
5517 | /* | |
5518 | * no object locks are held at this point | |
5519 | */ | |
5520 | vm_object_t retry_object; | |
5521 | vm_object_offset_t retry_offset; | |
5522 | vm_prot_t retry_prot; | |
5523 | ||
5524 | /* | |
5525 | * To avoid trying to write_lock the map while another | |
5526 | * thread has it read_locked (in vm_map_pageable), we | |
5527 | * do not try for write permission. If the page is | |
5528 | * still writable, we will get write permission. If it | |
5529 | * is not, or has been marked needs_copy, we enter the | |
5530 | * mapping without write permission, and will merely | |
5531 | * take another fault. | |
5532 | */ | |
5533 | map = original_map; | |
5534 | ||
5535 | kr = vm_map_lookup_locked(&map, vaddr, | |
5536 | fault_type & ~VM_PROT_WRITE, | |
5537 | OBJECT_LOCK_EXCLUSIVE, &version, | |
5538 | &retry_object, &retry_offset, &retry_prot, | |
5539 | &wired, | |
5540 | &fault_info, | |
5541 | &real_map, | |
5542 | NULL); | |
5543 | pmap = real_map->pmap; | |
5544 | ||
5545 | if (kr != KERN_SUCCESS) { | |
5546 | vm_map_unlock_read(map); | |
5547 | ||
5548 | if (m != VM_PAGE_NULL) { | |
5549 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5550 | ||
5551 | /* | |
5552 | * retake the lock so that | |
5553 | * we can drop the paging reference | |
5554 | * in vm_fault_cleanup and do the | |
5555 | * PAGE_WAKEUP_DONE in RELEASE_PAGE | |
5556 | */ | |
5557 | vm_object_lock(m_object); | |
5558 | ||
5559 | RELEASE_PAGE(m); | |
5560 | ||
5561 | vm_fault_cleanup(m_object, top_page); | |
5562 | } else { | |
5563 | /* | |
5564 | * retake the lock so that | |
5565 | * we can drop the paging reference | |
5566 | * in vm_fault_cleanup | |
5567 | */ | |
5568 | vm_object_lock(object); | |
5569 | ||
5570 | vm_fault_cleanup(object, top_page); | |
5571 | } | |
5572 | vm_object_deallocate(object); | |
5573 | ||
5574 | goto done; | |
5575 | } | |
5576 | vm_object_unlock(retry_object); | |
5577 | ||
5578 | if ((retry_object != object) || (retry_offset != offset)) { | |
5579 | vm_map_unlock_read(map); | |
5580 | if (real_map != map) { | |
5581 | vm_map_unlock(real_map); | |
5582 | } | |
5583 | ||
5584 | if (m != VM_PAGE_NULL) { | |
5585 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5586 | ||
5587 | /* | |
5588 | * retake the lock so that | |
5589 | * we can drop the paging reference | |
5590 | * in vm_fault_cleanup and do the | |
5591 | * PAGE_WAKEUP_DONE in RELEASE_PAGE | |
5592 | */ | |
5593 | vm_object_lock(m_object); | |
5594 | ||
5595 | RELEASE_PAGE(m); | |
5596 | ||
5597 | vm_fault_cleanup(m_object, top_page); | |
5598 | } else { | |
5599 | /* | |
5600 | * retake the lock so that | |
5601 | * we can drop the paging reference | |
5602 | * in vm_fault_cleanup | |
5603 | */ | |
5604 | vm_object_lock(object); | |
5605 | ||
5606 | vm_fault_cleanup(object, top_page); | |
5607 | } | |
5608 | vm_object_deallocate(object); | |
5609 | ||
5610 | goto RetryFault; | |
5611 | } | |
5612 | /* | |
5613 | * Check whether the protection has changed or the object | |
5614 | * has been copied while we left the map unlocked. | |
5615 | */ | |
5616 | if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) { | |
5617 | /* If the pmap layer cares, pass the full set. */ | |
5618 | prot = retry_prot; | |
5619 | } else { | |
5620 | prot &= retry_prot; | |
5621 | } | |
5622 | } | |
5623 | ||
5624 | if (object_locks_dropped == TRUE) { | |
5625 | if (m != VM_PAGE_NULL) { | |
5626 | vm_object_lock(m_object); | |
5627 | ||
5628 | if (m_object->copy != old_copy_object) { | |
5629 | /* | |
5630 | * The copy object changed while the top-level object | |
5631 | * was unlocked, so take away write permission. | |
5632 | */ | |
5633 | assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)); | |
5634 | prot &= ~VM_PROT_WRITE; | |
5635 | } | |
5636 | } else { | |
5637 | vm_object_lock(object); | |
5638 | } | |
5639 | ||
5640 | object_locks_dropped = FALSE; | |
5641 | } | |
5642 | ||
5643 | if (!need_copy && | |
5644 | !fault_info.no_copy_on_read && | |
5645 | m != VM_PAGE_NULL && | |
5646 | VM_PAGE_OBJECT(m) != object && | |
5647 | !VM_PAGE_OBJECT(m)->pager_trusted && | |
5648 | vm_protect_privileged_from_untrusted && | |
5649 | !((prot & VM_PROT_EXECUTE) && | |
5650 | VM_PAGE_OBJECT(m)->code_signed && | |
5651 | pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) && | |
5652 | current_proc_is_privileged()) { | |
5653 | /* | |
5654 | * We found the page we want in an "untrusted" VM object | |
5655 | * down the shadow chain. Since the target is "privileged" | |
5656 | * we want to perform a copy-on-read of that page, so that the | |
5657 | * mapped object gets a stable copy and does not have to | |
5658 | * rely on the "untrusted" object to provide the same | |
5659 | * contents if the page gets reclaimed and has to be paged | |
5660 | * in again later on. | |
5661 | * | |
5662 | * Special case: if the mapping is executable and the untrusted | |
5663 | * object is code-signed and the process is "cs_enforced", we | |
5664 | * do not copy-on-read because that would break code-signing | |
5665 | * enforcement expectations (an executable page must belong | |
5666 | * to a code-signed object) and we can rely on code-signing | |
5667 | * to re-validate the page if it gets evicted and paged back in. | |
5668 | */ | |
5669 | // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset); | |
5670 | vm_copied_on_read++; | |
5671 | need_copy_on_read = TRUE; | |
5672 | need_copy = TRUE; | |
5673 | } else { | |
5674 | need_copy_on_read = FALSE; | |
5675 | } | |
5676 | ||
5677 | /* | |
5678 | * If we want to wire down this page, but no longer have | |
5679 | * adequate permissions, we must start all over. | |
5680 | * If we decided to copy-on-read, we must also start all over. | |
5681 | */ | |
5682 | if ((wired && (fault_type != (prot | VM_PROT_WRITE))) || | |
5683 | need_copy_on_read) { | |
5684 | vm_map_unlock_read(map); | |
5685 | if (real_map != map) { | |
5686 | vm_map_unlock(real_map); | |
5687 | } | |
5688 | ||
5689 | if (m != VM_PAGE_NULL) { | |
5690 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5691 | ||
5692 | RELEASE_PAGE(m); | |
5693 | ||
5694 | vm_fault_cleanup(m_object, top_page); | |
5695 | } else { | |
5696 | vm_fault_cleanup(object, top_page); | |
5697 | } | |
5698 | ||
5699 | vm_object_deallocate(object); | |
5700 | ||
5701 | goto RetryFault; | |
5702 | } | |
5703 | if (m != VM_PAGE_NULL) { | |
5704 | /* | |
5705 | * Put this page into the physical map. | |
5706 | * We had to do the unlock above because pmap_enter | |
5707 | * may cause other faults. The page may be on | |
5708 | * the pageout queues. If the pageout daemon comes | |
5709 | * across the page, it will remove it from the queues. | |
5710 | */ | |
5711 | if (fault_page_size < PAGE_SIZE) { | |
5712 | DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot); | |
5713 | assertf((!(fault_phys_offset & FOURK_PAGE_MASK) && | |
5714 | fault_phys_offset < PAGE_SIZE), | |
5715 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
5716 | } else { | |
5717 | assertf(fault_phys_offset == 0, | |
5718 | "0x%llx\n", (uint64_t)fault_phys_offset); | |
5719 | } | |
5720 | if (caller_pmap) { | |
5721 | kr = vm_fault_enter(m, | |
5722 | caller_pmap, | |
5723 | caller_pmap_addr, | |
5724 | fault_page_size, | |
5725 | fault_phys_offset, | |
5726 | prot, | |
5727 | caller_prot, | |
5728 | wired, | |
5729 | change_wiring, | |
5730 | wire_tag, | |
5731 | &fault_info, | |
5732 | NULL, | |
5733 | &type_of_fault); | |
5734 | } else { | |
5735 | kr = vm_fault_enter(m, | |
5736 | pmap, | |
5737 | vaddr, | |
5738 | fault_page_size, | |
5739 | fault_phys_offset, | |
5740 | prot, | |
5741 | caller_prot, | |
5742 | wired, | |
5743 | change_wiring, | |
5744 | wire_tag, | |
5745 | &fault_info, | |
5746 | NULL, | |
5747 | &type_of_fault); | |
5748 | } | |
5749 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5750 | ||
5751 | { | |
5752 | int event_code = 0; | |
5753 | ||
5754 | if (m_object->internal) { | |
5755 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); | |
5756 | } else if (m_object->object_is_shared_cache) { | |
5757 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); | |
5758 | } else { | |
5759 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); | |
5760 | } | |
5761 | ||
5762 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0); | |
5763 | KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0); | |
5764 | ||
5765 | DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); | |
5766 | } | |
5767 | if (kr != KERN_SUCCESS) { | |
5768 | /* abort this page fault */ | |
5769 | vm_map_unlock_read(map); | |
5770 | if (real_map != map) { | |
5771 | vm_map_unlock(real_map); | |
5772 | } | |
5773 | PAGE_WAKEUP_DONE(m); | |
5774 | vm_fault_cleanup(m_object, top_page); | |
5775 | vm_object_deallocate(object); | |
5776 | goto done; | |
5777 | } | |
5778 | if (physpage_p != NULL) { | |
5779 | /* for vm_map_wire_and_extract() */ | |
5780 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); | |
5781 | if (prot & VM_PROT_WRITE) { | |
5782 | vm_object_lock_assert_exclusive(m_object); | |
5783 | m->vmp_dirty = TRUE; | |
5784 | } | |
5785 | } | |
5786 | } else { | |
5787 | vm_map_entry_t entry; | |
5788 | vm_map_offset_t laddr; | |
5789 | vm_map_offset_t ldelta, hdelta; | |
5790 | ||
5791 | /* | |
5792 | * do a pmap block mapping from the physical address | |
5793 | * in the object | |
5794 | */ | |
5795 | ||
5796 | if (real_map != map) { | |
5797 | vm_map_unlock(real_map); | |
5798 | } | |
5799 | ||
5800 | if (original_map != map) { | |
5801 | vm_map_unlock_read(map); | |
5802 | vm_map_lock_read(original_map); | |
5803 | map = original_map; | |
5804 | } | |
5805 | real_map = map; | |
5806 | ||
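| /* | 
| * Determine how far the block mapping can extend below (ldelta) | 
| * and above (hdelta) the faulting address, starting from a very | 
| * large range and clipping it to the map entries we traverse. | 
| */ | 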
5807 | laddr = vaddr; | |
5808 | hdelta = 0xFFFFF000; | |
5809 | ldelta = 0xFFFFF000; | |
5810 | ||
5811 | while (vm_map_lookup_entry(map, laddr, &entry)) { | |
5812 | if (ldelta > (laddr - entry->vme_start)) { | |
5813 | ldelta = laddr - entry->vme_start; | |
5814 | } | |
5815 | if (hdelta > (entry->vme_end - laddr)) { | |
5816 | hdelta = entry->vme_end - laddr; | |
5817 | } | |
5818 | if (entry->is_sub_map) { | |
5819 | laddr = ((laddr - entry->vme_start) | |
5820 | + VME_OFFSET(entry)); | |
5821 | vm_map_lock_read(VME_SUBMAP(entry)); | |
5822 | ||
5823 | if (map != real_map) { | |
5824 | vm_map_unlock_read(map); | |
5825 | } | |
5826 | if (entry->use_pmap) { | |
5827 | vm_map_unlock_read(real_map); | |
5828 | real_map = VME_SUBMAP(entry); | |
5829 | } | |
5830 | map = VME_SUBMAP(entry); | |
5831 | } else { | |
5832 | break; | |
5833 | } | |
5834 | } | |
5835 | ||
5836 | if (vm_map_lookup_entry(map, laddr, &entry) && | |
5837 | (VME_OBJECT(entry) != NULL) && | |
5838 | (VME_OBJECT(entry) == object)) { | |
5839 | uint16_t superpage; | |
5840 | ||
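| /* | 
| * Use a superpage mapping only when the object has no pager, is | 
| * physically contiguous, and the whole object is mapped from | 
| * offset 0 at a start address aligned to the object's size. | 
| */ | 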
5841 | if (!object->pager_created && | |
5842 | object->phys_contiguous && | |
5843 | VME_OFFSET(entry) == 0 && | |
5844 | (entry->vme_end - entry->vme_start == object->vo_size) && | |
5845 | VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) { | |
5846 | superpage = VM_MEM_SUPERPAGE; | |
5847 | } else { | |
5848 | superpage = 0; | |
5849 | } | |
5850 | ||
5851 | if (superpage && physpage_p) { | |
5852 | /* for vm_map_wire_and_extract() */ | |
5853 | *physpage_p = (ppnum_t) | |
5854 | ((((vm_map_offset_t) | |
5855 | object->vo_shadow_offset) | |
5856 | + VME_OFFSET(entry) | |
5857 | + (laddr - entry->vme_start)) | |
5858 | >> PAGE_SHIFT); | |
5859 | } | |
5860 | ||
5861 | if (caller_pmap) { | |
5862 | /* | |
5863 | * Set up a block mapped area | |
5864 | */ | |
5865 | assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT)); | |
5866 | kr = pmap_map_block(caller_pmap, | |
5867 | (addr64_t)(caller_pmap_addr - ldelta), | |
5868 | (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) + | |
5869 | VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT), | |
5870 | (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot, | |
5871 | (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); | |
5872 | ||
5873 | if (kr != KERN_SUCCESS) { | |
5874 | goto cleanup; | |
5875 | } | |
5876 | } else { | |
5877 | /* | |
5878 | * Set up a block mapped area | |
5879 | */ | |
5880 | assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT)); | |
5881 | kr = pmap_map_block(real_map->pmap, | |
5882 | (addr64_t)(vaddr - ldelta), | |
5883 | (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) + | |
5884 | VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT), | |
5885 | (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot, | |
5886 | (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); | |
5887 | ||
5888 | if (kr != KERN_SUCCESS) { | |
5889 | goto cleanup; | |
5890 | } | |
5891 | } | |
5892 | } | |
5893 | } | |
5894 | ||
5895 | /* | |
5896 | * Success | |
5897 | */ | |
5898 | kr = KERN_SUCCESS; | |
5899 | ||
5900 | /* | |
5901 | * TODO: could most of the done cases just use cleanup? | |
5902 | */ | |
5903 | cleanup: | |
5904 | /* | |
5905 | * Unlock everything, and return | |
5906 | */ | |
5907 | vm_map_unlock_read(map); | |
5908 | if (real_map != map) { | |
5909 | vm_map_unlock(real_map); | |
5910 | } | |
5911 | ||
5912 | if (m != VM_PAGE_NULL) { | |
5913 | assert(VM_PAGE_OBJECT(m) == m_object); | |
5914 | ||
5915 | if (!m_object->internal && (fault_type & VM_PROT_WRITE)) { | |
5916 | vm_object_paging_begin(m_object); | |
5917 | ||
5918 | assert(written_on_object == VM_OBJECT_NULL); | |
5919 | written_on_object = m_object; | |
5920 | written_on_pager = m_object->pager; | |
5921 | written_on_offset = m_object->paging_offset + m->vmp_offset; | |
5922 | } | |
5923 | PAGE_WAKEUP_DONE(m); | |
5924 | ||
5925 | vm_fault_cleanup(m_object, top_page); | |
5926 | } else { | |
5927 | vm_fault_cleanup(object, top_page); | |
5928 | } | |
5929 | ||
5930 | vm_object_deallocate(object); | |
5931 | ||
5932 | #undef RELEASE_PAGE | |
5933 | ||
5934 | done: | |
5935 | thread_interrupt_level(interruptible_state); | |
5936 | ||
5937 | if (resilient_media_object != VM_OBJECT_NULL) { | |
5938 | assert(resilient_media_retry); | |
5939 | assert(resilient_media_offset != (vm_object_offset_t)-1); | |
5940 | /* release extra reference on failed object */ | |
5941 | // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object); | |
5942 | vm_object_deallocate(resilient_media_object); | |
5943 | resilient_media_object = VM_OBJECT_NULL; | |
5944 | resilient_media_offset = (vm_object_offset_t)-1; | |
5945 | resilient_media_retry = FALSE; | |
5946 | } | |
5947 | assert(!resilient_media_retry); | |
5948 | ||
5949 | /* | |
5950 | * Only I/O throttle on faults which cause a pagein/swapin. | |
5951 | */ | |
5952 | if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) { | |
5953 | throttle_lowpri_io(1); | |
5954 | } else { | |
5955 | if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) { | |
5956 | if ((throttle_delay = vm_page_throttled(TRUE))) { | |
5957 | if (vm_debug_events) { | |
5958 | if (type_of_fault == DBG_COMPRESSOR_FAULT) { | |
5959 | VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
5960 | } else if (type_of_fault == DBG_COW_FAULT) { | |
5961 | VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
5962 | } else { | |
5963 | VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); | |
5964 | } | |
5965 | } | |
5966 | delay(throttle_delay); | |
5967 | } | |
5968 | } | |
5969 | } | |
5970 | ||
5971 | if (written_on_object) { | |
5972 | vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64); | |
5973 | ||
5974 | vm_object_lock(written_on_object); | |
5975 | vm_object_paging_end(written_on_object); | |
5976 | vm_object_unlock(written_on_object); | |
5977 | ||
5978 | written_on_object = VM_OBJECT_NULL; | |
5979 | } | |
5980 | ||
5981 | if (rtfault) { | |
5982 | vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault); | |
5983 | } | |
5984 | ||
5985 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, | |
5986 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, | |
5987 | ((uint64_t)trace_vaddr >> 32), | |
5988 | trace_vaddr, | |
5989 | kr, | |
5990 | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), | |
5991 | 0); | |
5992 | ||
5993 | if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) { | |
5994 | DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr); | |
5995 | } | |
5996 | ||
5997 | return kr; | |
5998 | } | |
5999 | ||
6000 | /* | |
6001 | * vm_fault_wire: | |
6002 | * | |
6003 | * Wire down a range of virtual addresses in a map. | |
6004 | */ | |
6005 | kern_return_t | |
6006 | vm_fault_wire( | |
6007 | vm_map_t map, | |
6008 | vm_map_entry_t entry, | |
6009 | vm_prot_t prot, | |
6010 | vm_tag_t wire_tag, | |
6011 | pmap_t pmap, | |
6012 | vm_map_offset_t pmap_addr, | |
6013 | ppnum_t *physpage_p) | |
6014 | { | |
6015 | vm_map_offset_t va; | |
6016 | vm_map_offset_t end_addr = entry->vme_end; | |
6017 | kern_return_t rc; | |
6018 | vm_map_size_t effective_page_size; | |
6019 | ||
6020 | assert(entry->in_transition); | |
6021 | ||
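| /* | 
| * Physically contiguous objects are wired by default, so there | 
| * is nothing to fault in here (vm_fault_unwire() makes the | 
| * matching assumption). | 
| */ | 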
6022 | if ((VME_OBJECT(entry) != NULL) && | |
6023 | !entry->is_sub_map && | |
6024 | VME_OBJECT(entry)->phys_contiguous) { | |
6025 | return KERN_SUCCESS; | |
6026 | } | |
6027 | ||
6028 | /* | |
6029 | * Inform the physical mapping system that the | |
6030 | * range of addresses may not fault, so that | |
6031 | * page tables and such can be locked down as well. | |
6032 | */ | |
6033 | ||
6034 | pmap_pageable(pmap, pmap_addr, | |
6035 | pmap_addr + (end_addr - entry->vme_start), FALSE); | |
6036 | ||
6037 | /* | |
6038 | * We simulate a fault to get the page and enter it | |
6039 | * in the physical map. | |
6040 | */ | |
6041 | ||
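| /* | 
| * Step through the range at the smaller of the map's page size | 
| * and the kernel page size, so a map using sub-kernel-page-size | 
| * pages gets each of its pages wired individually. | 
| */ | 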
6042 | effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE); | |
6043 | for (va = entry->vme_start; | |
6044 | va < end_addr; | |
6045 | va += effective_page_size) { | |
6046 | rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap, | |
6047 | pmap_addr + (va - entry->vme_start), | |
6048 | physpage_p); | |
6049 | if (rc != KERN_SUCCESS) { | |
6050 | rc = vm_fault_internal(map, va, prot, TRUE, wire_tag, | |
6051 | ((pmap == kernel_pmap) | |
6052 | ? THREAD_UNINT | |
6053 | : THREAD_ABORTSAFE), | |
6054 | pmap, | |
6055 | (pmap_addr + | |
6056 | (va - entry->vme_start)), | |
6057 | physpage_p); | |
6058 | DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL); | |
6059 | } | |
6060 | ||
6061 | if (rc != KERN_SUCCESS) { | |
6062 | struct vm_map_entry tmp_entry = *entry; | |
6063 | ||
6064 | /* unwire wired pages */ | |
6065 | tmp_entry.vme_end = va; | |
6066 | vm_fault_unwire(map, | |
6067 | &tmp_entry, FALSE, pmap, pmap_addr); | |
6068 | ||
6069 | return rc; | |
6070 | } | |
6071 | } | |
6072 | return KERN_SUCCESS; | |
6073 | } | |
6074 | ||
6075 | /* | |
6076 | * vm_fault_unwire: | |
6077 | * | |
6078 | * Unwire a range of virtual addresses in a map. | |
6079 | */ | |
6080 | void | |
6081 | vm_fault_unwire( | |
6082 | vm_map_t map, | |
6083 | vm_map_entry_t entry, | |
6084 | boolean_t deallocate, | |
6085 | pmap_t pmap, | |
6086 | vm_map_offset_t pmap_addr) | |
6087 | { | |
6088 | vm_map_offset_t va; | |
6089 | vm_map_offset_t end_addr = entry->vme_end; | |
6090 | vm_object_t object; | |
6091 | struct vm_object_fault_info fault_info = {}; | |
6092 | unsigned int unwired_pages; | |
6093 | vm_map_size_t effective_page_size; | |
6094 | ||
6095 | object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry); | |
6096 | ||
6097 | /* | |
6098 | * If it's marked phys_contiguous, then vm_fault_wire() didn't actually | |
6099 | * do anything since such memory is wired by default. So we don't have | |
6100 | * anything to undo here. | |
6101 | */ | |
6102 | ||
6103 | if (object != VM_OBJECT_NULL && object->phys_contiguous) { | |
6104 | return; | |
6105 | } | |
6106 | ||
6107 | fault_info.interruptible = THREAD_UNINT; | |
6108 | fault_info.behavior = entry->behavior; | |
6109 | fault_info.user_tag = VME_ALIAS(entry); | |
6110 | if (entry->iokit_acct || | |
6111 | (!entry->is_sub_map && !entry->use_pmap)) { | |
6112 | fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; | |
6113 | } | |
6114 | fault_info.lo_offset = VME_OFFSET(entry); | |
6115 | fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry); | |
6116 | fault_info.no_cache = entry->no_cache; | |
6117 | fault_info.stealth = TRUE; | |
6118 | ||
6119 | unwired_pages = 0; | |
6120 | ||
6121 | /* | |
6122 | * Since the pages are wired down, we must be able to | |
6123 | * get their mappings from the physical map system. | |
6124 | */ | |
6125 | ||
6126 | effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE); | |
6127 | for (va = entry->vme_start; | |
6128 | va < end_addr; | |
6129 | va += effective_page_size) { | |
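| /* | 
| * A NULL object means the entry is a submap (see above): clear | 
| * the pmap wiring here and let a change_wiring fault finish the | 
| * unwire for this address. | 
| */ | 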
6130 | if (object == VM_OBJECT_NULL) { | |
6131 | if (pmap) { | |
6132 | pmap_change_wiring(pmap, | |
6133 | pmap_addr + (va - entry->vme_start), FALSE); | |
6134 | } | |
6135 | (void) vm_fault(map, va, VM_PROT_NONE, | |
6136 | TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr); | |
6137 | } else { | |
6138 | vm_prot_t prot; | |
6139 | vm_page_t result_page; | |
6140 | vm_page_t top_page; | |
6141 | vm_object_t result_object; | |
6142 | vm_fault_return_t result; | |
6143 | ||
6144 | /* cap cluster size at maximum UPL size */ | |
6145 | upl_size_t cluster_size; | |
6146 | if (os_sub_overflow(end_addr, va, &cluster_size)) { | |
6147 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; | |
6148 | } | |
6149 | fault_info.cluster_size = cluster_size; | |
6150 | ||
6151 | do { | |
6152 | prot = VM_PROT_NONE; | |
6153 | ||
6154 | vm_object_lock(object); | |
6155 | vm_object_paging_begin(object); | |
6156 | result_page = VM_PAGE_NULL; | |
6157 | result = vm_fault_page( | |
6158 | object, | |
6159 | (VME_OFFSET(entry) + | |
6160 | (va - entry->vme_start)), | |
6161 | VM_PROT_NONE, TRUE, | |
6162 | FALSE, /* page not looked up */ | |
6163 | &prot, &result_page, &top_page, | |
6164 | (int *)0, | |
6165 | NULL, map->no_zero_fill, | |
6166 | FALSE, &fault_info); | |
6167 | } while (result == VM_FAULT_RETRY); | |
6168 | ||
6169 | /* | |
6170 | * If this was a mapping to a file on a device that has been forcibly | |
6171 | * unmounted, then we won't get a page back from vm_fault_page(). Just | |
6172 | * move on to the next one in case the remaining pages are mapped from | |
6173 | * different objects. During a forced unmount, the object is terminated | |
6174 | * so the alive flag will be false if this happens. A forced unmount will | 
6175 | * occur when an external disk is unplugged before the user does an | 
6176 | * eject, so we don't want to panic in that situation. | |
6177 | */ | |
6178 | ||
6179 | if (result == VM_FAULT_MEMORY_ERROR && !object->alive) { | |
6180 | continue; | |
6181 | } | |
6182 | ||
6183 | if (result == VM_FAULT_MEMORY_ERROR && | |
6184 | object == kernel_object) { | |
6185 | /* | |
6186 | * This must have been allocated with | |
6187 | * KMA_KOBJECT and KMA_VAONLY and there's | |
6188 | * no physical page at this offset. | |
6189 | * We're done (no page to free). | |
6190 | */ | |
6191 | assert(deallocate); | |
6192 | continue; | |
6193 | } | |
6194 | ||
6195 | if (result != VM_FAULT_SUCCESS) { | |
6196 | panic("vm_fault_unwire: failure"); | |
6197 | } | |
6198 | ||
6199 | result_object = VM_PAGE_OBJECT(result_page); | |
6200 | ||
6201 | if (deallocate) { | |
6202 | assert(VM_PAGE_GET_PHYS_PAGE(result_page) != | |
6203 | vm_page_fictitious_addr); | |
6204 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page)); | |
6205 | if (VM_PAGE_WIRED(result_page)) { | |
6206 | unwired_pages++; | |
6207 | } | |
6208 | VM_PAGE_FREE(result_page); | |
6209 | } else { | |
6210 | if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) { | |
6211 | pmap_change_wiring(pmap, | |
6212 | pmap_addr + (va - entry->vme_start), FALSE); | |
6213 | } | |
6214 | ||
6215 | ||
6216 | if (VM_PAGE_WIRED(result_page)) { | |
6217 | vm_page_lockspin_queues(); | |
6218 | vm_page_unwire(result_page, TRUE); | |
6219 | vm_page_unlock_queues(); | |
6220 | unwired_pages++; | |
6221 | } | |
6222 | if (entry->zero_wired_pages) { | |
6223 | pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page)); | |
6224 | entry->zero_wired_pages = FALSE; | |
6225 | } | |
6226 | ||
6227 | PAGE_WAKEUP_DONE(result_page); | |
6228 | } | |
6229 | vm_fault_cleanup(result_object, top_page); | |
6230 | } | |
6231 | } | |
6232 | ||
6233 | /* | |
6234 | * Inform the physical mapping system that the range | |
6235 | * of addresses may fault, so that page tables and | |
6236 | * such may be unwired themselves. | |
6237 | */ | |
6238 | ||
6239 | pmap_pageable(pmap, pmap_addr, | |
6240 | pmap_addr + (end_addr - entry->vme_start), TRUE); | |
6241 | ||
6242 | if (kernel_object == object) { | |
6243 | /* | |
6244 | * Would like to make user_tag in vm_object_fault_info a | 
6245 | * vm_tag_t (unsigned short), but user_tag derives its value from | 
6246 | * VME_ALIAS(entry) in a few places, and VME_ALIAS, in turn, casts | 
6247 | * to an _unsigned int_ that is used by non-fault_info paths | 
6248 | * throughout the code. | 
6249 | * | |
6250 | * So, for now, an explicit truncation to unsigned short (vm_tag_t). | |
6251 | */ | |
6252 | assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag, | |
6253 | "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK)); | |
6254 | vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages)); | |
6255 | } | |
6256 | } | |
6257 | ||
6258 | /* | |
6259 | * vm_fault_wire_fast: | |
6260 | * | |
6261 | * Handle common case of a wire down page fault at the given address. | |
6262 | * If successful, the page is inserted into the associated physical map. | |
6263 | * The map entry is passed in to avoid the overhead of a map lookup. | |
6264 | * | |
6265 | * NOTE: the given address should be truncated to the | |
6266 | * proper page address. | |
6267 | * | |
6268 | * KERN_SUCCESS is returned if the page fault is handled; otherwise, | |
6269 | * a standard error specifying why the fault is fatal is returned. | |
6270 | * | |
6271 | * The map in question must be referenced, and remains so. | |
6272 | * Caller has a read lock on the map. | |
6273 | * | |
6274 | * This is a stripped version of vm_fault() for wiring pages. Anything | |
6275 | * other than the common case will return KERN_FAILURE, and the caller | |
6276 | * is expected to call vm_fault(). | |
6277 | */ | |
6278 | static kern_return_t | |
6279 | vm_fault_wire_fast( | |
6280 | __unused vm_map_t map, | |
6281 | vm_map_offset_t va, | |
6282 | __unused vm_prot_t caller_prot, | |
6283 | vm_tag_t wire_tag, | |
6284 | vm_map_entry_t entry, | |
6285 | pmap_t pmap, | |
6286 | vm_map_offset_t pmap_addr, | |
6287 | ppnum_t *physpage_p) | |
6288 | { | |
6289 | vm_object_t object; | |
6290 | vm_object_offset_t offset; | |
6291 | vm_page_t m; | |
6292 | vm_prot_t prot; | |
6293 | thread_t thread = current_thread(); | |
6294 | int type_of_fault; | |
6295 | kern_return_t kr; | |
6296 | vm_map_size_t fault_page_size; | |
6297 | vm_map_offset_t fault_phys_offset; | |
6298 | struct vm_object_fault_info fault_info = {}; | |
6299 | ||
6300 | counter_inc(&vm_statistics_faults); | |
6301 | ||
6302 | if (thread != THREAD_NULL && thread->task != TASK_NULL) { | |
6303 | counter_inc(&thread->task->faults); | |
6304 | } | |
6305 | ||
6306 | /* | |
6307 | * Recovery actions | |
6308 | */ | |
6309 | ||
6310 | #undef RELEASE_PAGE | |
6311 | #define RELEASE_PAGE(m) { \ | |
6312 | PAGE_WAKEUP_DONE(m); \ | |
6313 | vm_page_lockspin_queues(); \ | |
6314 | vm_page_unwire(m, TRUE); \ | |
6315 | vm_page_unlock_queues(); \ | |
6316 | } | |
6317 | ||
6318 | ||
6319 | #undef UNLOCK_THINGS | |
6320 | #define UNLOCK_THINGS { \ | |
6321 | vm_object_paging_end(object); \ | |
6322 | vm_object_unlock(object); \ | |
6323 | } | |
6324 | ||
6325 | #undef UNLOCK_AND_DEALLOCATE | |
6326 | #define UNLOCK_AND_DEALLOCATE { \ | |
6327 | UNLOCK_THINGS; \ | |
6328 | vm_object_deallocate(object); \ | |
6329 | } | |
6330 | /* | |
6331 | * Give up and have caller do things the hard way. | |
6332 | */ | |
6333 | ||
6334 | #define GIVE_UP { \ | |
6335 | UNLOCK_AND_DEALLOCATE; \ | |
6336 | return(KERN_FAILURE); \ | |
6337 | } | |
6338 | ||
6339 | ||
6340 | /* | |
6341 | * If this entry is not directly to a vm_object, bail out. | |
6342 | */ | |
6343 | if (entry->is_sub_map) { | |
6344 | assert(physpage_p == NULL); | |
6345 | return KERN_FAILURE; | |
6346 | } | |
6347 | ||
6348 | /* | |
6349 | * Find the backing store object and offset into it. | |
6350 | */ | |
6351 | ||
6352 | object = VME_OBJECT(entry); | |
6353 | offset = (va - entry->vme_start) + VME_OFFSET(entry); | |
6354 | prot = entry->protection; | |
6355 | ||
6356 | /* | |
6357 | * Make a reference to this object to prevent its | |
6358 | * disposal while we are messing with it. | |
6359 | */ | |
6360 | ||
6361 | vm_object_lock(object); | |
6362 | vm_object_reference_locked(object); | |
6363 | vm_object_paging_begin(object); | |
6364 | ||
6365 | /* | |
6366 | * INVARIANTS (through entire routine): | |
6367 | * | |
6368 | * 1) At all times, we must either have the object | |
6369 | * lock or a busy page in some object to prevent | |
6370 | * some other thread from trying to bring in | |
6371 | * the same page. | |
6372 | * | |
6373 | * 2) Once we have a busy page, we must remove it from | |
6374 | * the pageout queues, so that the pageout daemon | |
6375 | * will not grab it away. | |
6376 | * | |
6377 | */ | |
6378 | ||
6379 | /* | |
6380 | * Look for page in top-level object. If it's not there or | |
6381 | * there's something going on, give up. | |
6382 | */ | |
6383 | m = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
6384 | if ((m == VM_PAGE_NULL) || (m->vmp_busy) || | |
6385 | (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) { | |
6386 | GIVE_UP; | |
6387 | } | |
6388 | if (m->vmp_fictitious && | |
6389 | VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { | |
6390 | /* | |
6391 | * Guard pages are fictitious pages and are never | |
6392 | * entered into a pmap, so let's say it's been wired... | |
6393 | */ | |
6394 | kr = KERN_SUCCESS; | |
6395 | goto done; | |
6396 | } | |
6397 | ||
6398 | /* | |
6399 | * Wire the page down now. All bail outs beyond this | |
6400 | * point must unwire the page. | |
6401 | */ | |
6402 | ||
6403 | vm_page_lockspin_queues(); | |
6404 | vm_page_wire(m, wire_tag, TRUE); | |
6405 | vm_page_unlock_queues(); | |
6406 | ||
6407 | /* | |
6408 | * Mark page busy for other threads. | |
6409 | */ | |
6410 | assert(!m->vmp_busy); | |
6411 | m->vmp_busy = TRUE; | |
6412 | assert(!m->vmp_absent); | |
6413 | ||
6414 | /* | |
6415 | * Give up if the page is being written and there's a copy object | |
6416 | */ | |
6417 | if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { | |
6418 | RELEASE_PAGE(m); | |
6419 | GIVE_UP; | |
6420 | } | |
6421 | ||
6422 | fault_info.user_tag = VME_ALIAS(entry); | |
6423 | fault_info.pmap_options = 0; | |
6424 | if (entry->iokit_acct || | |
6425 | (!entry->is_sub_map && !entry->use_pmap)) { | |
6426 | fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; | |
6427 | } | |
6428 | ||
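 | /* Fault on the smaller of the map's and the kernel's page size, and remember the offset of that sub-page within the kernel-sized page. */ | 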
6429 | fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE); | |
6430 | fault_phys_offset = offset - vm_object_trunc_page(offset); | |
6431 | ||
6432 | /* | |
6433 | * Put this page into the physical map. | |
6434 | */ | |
6435 | type_of_fault = DBG_CACHE_HIT_FAULT; | |
6436 | kr = vm_fault_enter(m, | |
6437 | pmap, | |
6438 | pmap_addr, | |
6439 | fault_page_size, | |
6440 | fault_phys_offset, | |
6441 | prot, | |
6442 | prot, | |
6443 | TRUE, /* wired */ | |
6444 | FALSE, /* change_wiring */ | |
6445 | wire_tag, | |
6446 | &fault_info, | |
6447 | NULL, | |
6448 | &type_of_fault); | |
6449 | if (kr != KERN_SUCCESS) { | |
6450 | RELEASE_PAGE(m); | |
6451 | GIVE_UP; | |
6452 | } | |
6453 | ||
6454 | done: | |
6455 | /* | |
6456 | * Unlock everything, and return | |
6457 | */ | |
6458 | ||
6459 | if (physpage_p) { | |
6460 | /* for vm_map_wire_and_extract() */ | |
6461 | if (kr == KERN_SUCCESS) { | |
6462 | assert(object == VM_PAGE_OBJECT(m)); | |
6463 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); | |
6464 | if (prot & VM_PROT_WRITE) { | |
6465 | vm_object_lock_assert_exclusive(object); | |
6466 | m->vmp_dirty = TRUE; | |
6467 | } | |
6468 | } else { | |
6469 | *physpage_p = 0; | |
6470 | } | |
6471 | } | |
6472 | ||
6473 | PAGE_WAKEUP_DONE(m); | |
6474 | UNLOCK_AND_DEALLOCATE; | |
6475 | ||
6476 | return kr; | |
6477 | } | |
6478 | ||
6479 | /* | |
6480 | * Routine: vm_fault_copy_cleanup | |
6481 | * Purpose: | |
6482 | * Release a page used by vm_fault_copy. | |
6483 | */ | |
6484 | ||
6485 | static void | |
6486 | vm_fault_copy_cleanup( | |
6487 | vm_page_t page, | |
6488 | vm_page_t top_page) | |
6489 | { | |
6490 | vm_object_t object = VM_PAGE_OBJECT(page); | |
6491 | ||
6492 | vm_object_lock(object); | |
6493 | PAGE_WAKEUP_DONE(page); | |
6494 | if (!VM_PAGE_PAGEABLE(page)) { | |
6495 | vm_page_lockspin_queues(); | |
6496 | if (!VM_PAGE_PAGEABLE(page)) { | |
6497 | vm_page_activate(page); | |
6498 | } | |
6499 | vm_page_unlock_queues(); | |
6500 | } | |
6501 | vm_fault_cleanup(object, top_page); | |
6502 | } | |
6503 | ||
6504 | static void | |
6505 | vm_fault_copy_dst_cleanup( | |
6506 | vm_page_t page) | |
6507 | { | |
6508 | vm_object_t object; | |
6509 | ||
6510 | if (page != VM_PAGE_NULL) { | |
6511 | object = VM_PAGE_OBJECT(page); | |
6512 | vm_object_lock(object); | |
6513 | vm_page_lockspin_queues(); | |
6514 | vm_page_unwire(page, TRUE); | |
6515 | vm_page_unlock_queues(); | |
6516 | vm_object_paging_end(object); | |
6517 | vm_object_unlock(object); | |
6518 | } | |
6519 | } | |
6520 | ||
6521 | /* | |
6522 | * Routine: vm_fault_copy | |
6523 | * | |
6524 | * Purpose: | |
6525 | * Copy pages from one virtual memory object to another -- | |
6526 | * neither the source nor destination pages need be resident. | |
6527 | * | |
6528 | * Before actually copying a page, the version associated with | |
6529 | * the destination address map will be verified. | 
6530 | * | |
6531 | * In/out conditions: | |
6532 | * The caller must hold a reference, but not a lock, to | |
6533 | * each of the source and destination objects and to the | |
6534 | * destination map. | |
6535 | * | |
6536 | * Results: | |
6537 | * Returns KERN_SUCCESS if no errors were encountered in | |
6538 | * reading or writing the data. Returns KERN_INTERRUPTED if | |
6539 | * the operation was interrupted (only possible if the | |
6540 | * "interruptible" argument is asserted). Other return values | |
6541 | * indicate a permanent error in copying the data. | |
6542 | * | |
6543 | * The actual amount of data copied will be returned in the | |
6544 | * "copy_size" argument. In the event that the destination map | |
6545 | * verification failed, this amount may be less than the amount | |
6546 | * requested. | |
6547 | */ | |
6548 | kern_return_t | |
6549 | vm_fault_copy( | |
6550 | vm_object_t src_object, | |
6551 | vm_object_offset_t src_offset, | |
6552 | vm_map_size_t *copy_size, /* INOUT */ | |
6553 | vm_object_t dst_object, | |
6554 | vm_object_offset_t dst_offset, | |
6555 | vm_map_t dst_map, | |
6556 | vm_map_version_t *dst_version, | |
6557 | int interruptible) | |
6558 | { | |
6559 | vm_page_t result_page; | |
6560 | ||
6561 | vm_page_t src_page; | |
6562 | vm_page_t src_top_page; | |
6563 | vm_prot_t src_prot; | |
6564 | ||
6565 | vm_page_t dst_page; | |
6566 | vm_page_t dst_top_page; | |
6567 | vm_prot_t dst_prot; | |
6568 | ||
6569 | vm_map_size_t amount_left; | |
6570 | vm_object_t old_copy_object; | |
6571 | vm_object_t result_page_object = NULL; | |
6572 | kern_return_t error = 0; | |
6573 | vm_fault_return_t result; | |
6574 | ||
6575 | vm_map_size_t part_size; | |
6576 | struct vm_object_fault_info fault_info_src = {}; | |
6577 | struct vm_object_fault_info fault_info_dst = {}; | |
6578 | ||
6579 | /* | |
6580 | * In order not to confuse the clustered pageins, align | |
6581 | * the different offsets on a page boundary. | |
6582 | */ | |
6583 | ||
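 | /* Every return path reports, via *copy_size, how much was actually copied. */ | 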
6584 | #define RETURN(x) \ | |
6585 | MACRO_BEGIN \ | |
6586 | *copy_size -= amount_left; \ | |
6587 | MACRO_RETURN(x); \ | |
6588 | MACRO_END | |
6589 | ||
6590 | amount_left = *copy_size; | |
6591 | ||
6592 | fault_info_src.interruptible = interruptible; | |
6593 | fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; | |
6594 | fault_info_src.lo_offset = vm_object_trunc_page(src_offset); | |
6595 | fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; | |
6596 | fault_info_src.stealth = TRUE; | |
6597 | ||
6598 | fault_info_dst.interruptible = interruptible; | |
6599 | fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; | |
6600 | fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); | |
6601 | fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; | |
6602 | fault_info_dst.stealth = TRUE; | |
6603 | ||
6604 | do { /* while (amount_left > 0) */ | |
6605 | /* | |
6606 | * There may be a deadlock if both source and destination | |
6607 | * pages are the same. To avoid this deadlock, the copy must | |
6608 | * start by getting the destination page in order to apply | |
6609 | * COW semantics if any. | |
6610 | */ | |
6611 | ||
6612 | RetryDestinationFault:; | |
6613 | ||
6614 | dst_prot = VM_PROT_WRITE | VM_PROT_READ; | |
6615 | ||
6616 | vm_object_lock(dst_object); | |
6617 | vm_object_paging_begin(dst_object); | |
6618 | ||
6619 | /* cap cluster size at maximum UPL size */ | |
6620 | upl_size_t cluster_size; | |
6621 | if (os_convert_overflow(amount_left, &cluster_size)) { | |
6622 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; | |
6623 | } | |
6624 | fault_info_dst.cluster_size = cluster_size; | |
6625 | ||
6626 | dst_page = VM_PAGE_NULL; | |
6627 | result = vm_fault_page(dst_object, | |
6628 | vm_object_trunc_page(dst_offset), | |
6629 | VM_PROT_WRITE | VM_PROT_READ, | |
6630 | FALSE, | |
6631 | FALSE, /* page not looked up */ | |
6632 | &dst_prot, &dst_page, &dst_top_page, | |
6633 | (int *)0, | |
6634 | &error, | |
6635 | dst_map->no_zero_fill, | |
6636 | FALSE, &fault_info_dst); | |
6637 | switch (result) { | |
6638 | case VM_FAULT_SUCCESS: | |
6639 | break; | |
6640 | case VM_FAULT_RETRY: | |
6641 | goto RetryDestinationFault; | |
6642 | case VM_FAULT_MEMORY_SHORTAGE: | |
6643 | if (vm_page_wait(interruptible)) { | |
6644 | goto RetryDestinationFault; | |
6645 | } | |
6646 | OS_FALLTHROUGH; | |
6647 | case VM_FAULT_INTERRUPTED: | |
6648 | RETURN(MACH_SEND_INTERRUPTED); | |
6649 | case VM_FAULT_SUCCESS_NO_VM_PAGE: | |
6650 | /* success but no VM page: fail the copy */ | |
6651 | vm_object_paging_end(dst_object); | |
6652 | vm_object_unlock(dst_object); | |
6653 | OS_FALLTHROUGH; | |
6654 | case VM_FAULT_MEMORY_ERROR: | |
6655 | if (error) { | |
6656 | return error; | |
6657 | } else { | |
6658 | return KERN_MEMORY_ERROR; | |
6659 | } | |
6660 | default: | |
6661 | panic("vm_fault_copy: unexpected error 0x%x from " | |
6662 | "vm_fault_page()\n", result); | |
6663 | } | |
6664 | assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); | |
6665 | ||
6666 | assert(dst_object == VM_PAGE_OBJECT(dst_page)); | |
6667 | old_copy_object = dst_object->copy; | |
6668 | ||
6669 | /* | |
6670 | * There exists the possibility that the source and | 
6671 | * destination page are the same. But we can't easily | 
6672 | * determine that now. If they are the same, the upcoming | 
6673 | * call to vm_fault_page() for the source page would | 
6674 | * deadlock on the busy destination page. To prevent this we | 
6675 | * wire the page so we can drop busy without having | |
6676 | * the page daemon steal the page. We clean up the | |
6677 | * top page but keep the paging reference on the object | |
6678 | * holding the dest page so it doesn't go away. | |
6679 | */ | |
6680 | ||
6681 | vm_page_lockspin_queues(); | |
6682 | vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE); | |
6683 | vm_page_unlock_queues(); | |
6684 | PAGE_WAKEUP_DONE(dst_page); | |
6685 | vm_object_unlock(dst_object); | |
6686 | ||
6687 | if (dst_top_page != VM_PAGE_NULL) { | |
6688 | vm_object_lock(dst_object); | |
6689 | VM_PAGE_FREE(dst_top_page); | |
6690 | vm_object_paging_end(dst_object); | |
6691 | vm_object_unlock(dst_object); | |
6692 | } | |
6693 | ||
6694 | RetrySourceFault:; | |
6695 | ||
6696 | if (src_object == VM_OBJECT_NULL) { | |
6697 | /* | |
6698 | * No source object. We will just | |
6699 | * zero-fill the page in dst_object. | |
6700 | */ | |
6701 | src_page = VM_PAGE_NULL; | |
6702 | result_page = VM_PAGE_NULL; | |
6703 | } else { | |
6704 | vm_object_lock(src_object); | |
6705 | src_page = vm_page_lookup(src_object, | |
6706 | vm_object_trunc_page(src_offset)); | |
6707 | if (src_page == dst_page) { | |
6708 | src_prot = dst_prot; | |
6709 | result_page = VM_PAGE_NULL; | |
6710 | } else { | |
6711 | src_prot = VM_PROT_READ; | |
6712 | vm_object_paging_begin(src_object); | |
6713 | ||
6714 | /* cap cluster size at maximum UPL size */ | |
6715 | if (os_convert_overflow(amount_left, &cluster_size)) { | |
6716 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; | |
6717 | } | |
6718 | fault_info_src.cluster_size = cluster_size; | |
6719 | ||
6720 | result_page = VM_PAGE_NULL; | |
6721 | result = vm_fault_page( | |
6722 | src_object, | |
6723 | vm_object_trunc_page(src_offset), | |
6724 | VM_PROT_READ, FALSE, | |
6725 | FALSE, /* page not looked up */ | |
6726 | &src_prot, | |
6727 | &result_page, &src_top_page, | |
6728 | (int *)0, &error, FALSE, | |
6729 | FALSE, &fault_info_src); | |
6730 | ||
6731 | switch (result) { | |
6732 | case VM_FAULT_SUCCESS: | |
6733 | break; | |
6734 | case VM_FAULT_RETRY: | |
6735 | goto RetrySourceFault; | |
6736 | case VM_FAULT_MEMORY_SHORTAGE: | |
6737 | if (vm_page_wait(interruptible)) { | |
6738 | goto RetrySourceFault; | |
6739 | } | |
6740 | OS_FALLTHROUGH; | |
6741 | case VM_FAULT_INTERRUPTED: | |
6742 | vm_fault_copy_dst_cleanup(dst_page); | |
6743 | RETURN(MACH_SEND_INTERRUPTED); | |
6744 | case VM_FAULT_SUCCESS_NO_VM_PAGE: | |
6745 | /* success but no VM page: fail */ | |
6746 | vm_object_paging_end(src_object); | |
6747 | vm_object_unlock(src_object); | |
6748 | OS_FALLTHROUGH; | |
6749 | case VM_FAULT_MEMORY_ERROR: | |
6750 | vm_fault_copy_dst_cleanup(dst_page); | |
6751 | if (error) { | |
6752 | return error; | |
6753 | } else { | |
6754 | return KERN_MEMORY_ERROR; | |
6755 | } | |
6756 | default: | |
6757 | panic("vm_fault_copy(2): unexpected " | |
6758 | "error 0x%x from " | |
6759 | "vm_fault_page()\n", result); | |
6760 | } | |
6761 | ||
6762 | result_page_object = VM_PAGE_OBJECT(result_page); | |
6763 | assert((src_top_page == VM_PAGE_NULL) == | |
6764 | (result_page_object == src_object)); | |
6765 | } | |
6766 | assert((src_prot & VM_PROT_READ) != VM_PROT_NONE); | |
6767 | vm_object_unlock(result_page_object); | |
6768 | } | |
6769 | ||
6770 | vm_map_lock_read(dst_map); | |
6771 | ||
6772 | if (!vm_map_verify(dst_map, dst_version)) { | |
6773 | vm_map_unlock_read(dst_map); | |
6774 | if (result_page != VM_PAGE_NULL && src_page != dst_page) { | |
6775 | vm_fault_copy_cleanup(result_page, src_top_page); | |
6776 | } | |
6777 | vm_fault_copy_dst_cleanup(dst_page); | |
6778 | break; | |
6779 | } | |
6780 | assert(dst_object == VM_PAGE_OBJECT(dst_page)); | |
6781 | ||
6782 | vm_object_lock(dst_object); | |
6783 | ||
6784 | if (dst_object->copy != old_copy_object) { | |
6785 | vm_object_unlock(dst_object); | |
6786 | vm_map_unlock_read(dst_map); | |
6787 | if (result_page != VM_PAGE_NULL && src_page != dst_page) { | |
6788 | vm_fault_copy_cleanup(result_page, src_top_page); | |
6789 | } | |
6790 | vm_fault_copy_dst_cleanup(dst_page); | |
6791 | break; | |
6792 | } | |
6793 | vm_object_unlock(dst_object); | |
6794 | ||
6795 | /* | |
6796 | * Copy the page, and note that it is dirty | |
6797 | * immediately. | |
6798 | */ | |
6799 | ||
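 | /* Unaligned copy: only transfer as many bytes as fit in both the source and destination pages without crossing a page boundary, capped at the amount left to copy. */ | 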
6800 | if (!page_aligned(src_offset) || | |
6801 | !page_aligned(dst_offset) || | |
6802 | !page_aligned(amount_left)) { | |
6803 | vm_object_offset_t src_po, | |
6804 | dst_po; | |
6805 | ||
6806 | src_po = src_offset - vm_object_trunc_page(src_offset); | |
6807 | dst_po = dst_offset - vm_object_trunc_page(dst_offset); | |
6808 | ||
6809 | if (dst_po > src_po) { | |
6810 | part_size = PAGE_SIZE - dst_po; | |
6811 | } else { | |
6812 | part_size = PAGE_SIZE - src_po; | |
6813 | } | |
6814 | if (part_size > (amount_left)) { | |
6815 | part_size = amount_left; | |
6816 | } | |
6817 | ||
6818 | if (result_page == VM_PAGE_NULL) { | |
6819 | assert((vm_offset_t) dst_po == dst_po); | |
6820 | assert((vm_size_t) part_size == part_size); | |
6821 | vm_page_part_zero_fill(dst_page, | |
6822 | (vm_offset_t) dst_po, | |
6823 | (vm_size_t) part_size); | |
6824 | } else { | |
6825 | assert((vm_offset_t) src_po == src_po); | |
6826 | assert((vm_offset_t) dst_po == dst_po); | |
6827 | assert((vm_size_t) part_size == part_size); | |
6828 | vm_page_part_copy(result_page, | |
6829 | (vm_offset_t) src_po, | |
6830 | dst_page, | |
6831 | (vm_offset_t) dst_po, | |
6832 | (vm_size_t)part_size); | |
6833 | if (!dst_page->vmp_dirty) { | |
6834 | vm_object_lock(dst_object); | |
6835 | SET_PAGE_DIRTY(dst_page, TRUE); | |
6836 | vm_object_unlock(dst_object); | |
6837 | } | |
6838 | } | |
6839 | } else { | |
6840 | part_size = PAGE_SIZE; | |
6841 | ||
6842 | if (result_page == VM_PAGE_NULL) { | |
6843 | vm_page_zero_fill(dst_page); | |
6844 | } else { | |
6845 | vm_object_lock(result_page_object); | |
6846 | vm_page_copy(result_page, dst_page); | |
6847 | vm_object_unlock(result_page_object); | |
6848 | ||
6849 | if (!dst_page->vmp_dirty) { | |
6850 | vm_object_lock(dst_object); | |
6851 | SET_PAGE_DIRTY(dst_page, TRUE); | |
6852 | vm_object_unlock(dst_object); | |
6853 | } | |
6854 | } | |
6855 | } | |
6856 | ||
6857 | /* | |
6858 | * Unlock everything, and return | |
6859 | */ | |
6860 | ||
6861 | vm_map_unlock_read(dst_map); | |
6862 | ||
6863 | if (result_page != VM_PAGE_NULL && src_page != dst_page) { | |
6864 | vm_fault_copy_cleanup(result_page, src_top_page); | |
6865 | } | |
6866 | vm_fault_copy_dst_cleanup(dst_page); | |
6867 | ||
6868 | amount_left -= part_size; | |
6869 | src_offset += part_size; | |
6870 | dst_offset += part_size; | |
6871 | } while (amount_left > 0); | |
6872 | ||
6873 | RETURN(KERN_SUCCESS); | |
6874 | #undef RETURN | |
6875 | ||
6876 | /*NOTREACHED*/ | |
6877 | } | |
6878 | ||
6879 | #if VM_FAULT_CLASSIFY | |
6880 | /* | |
6881 | * Temporary statistics gathering support. | |
6882 | */ | |
6883 | ||
6884 | /* | |
6885 | * Statistics arrays: | |
6886 | */ | |
6887 | #define VM_FAULT_TYPES_MAX 5 | |
6888 | #define VM_FAULT_LEVEL_MAX 8 | |
6889 | ||
6890 | int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX]; | |
6891 | ||
6892 | #define VM_FAULT_TYPE_ZERO_FILL 0 | |
6893 | #define VM_FAULT_TYPE_MAP_IN 1 | |
6894 | #define VM_FAULT_TYPE_PAGER 2 | |
6895 | #define VM_FAULT_TYPE_COPY 3 | |
6896 | #define VM_FAULT_TYPE_OTHER 4 | |
6897 | ||
6898 | ||
6899 | void | |
6900 | vm_fault_classify(vm_object_t object, | |
6901 | vm_object_offset_t offset, | |
6902 | vm_prot_t fault_type) | |
6903 | { | |
6904 | int type, level = 0; | |
6905 | vm_page_t m; | |
6906 | ||
6907 | while (TRUE) { | |
6908 | m = vm_page_lookup(object, offset); | |
6909 | if (m != VM_PAGE_NULL) { | |
6910 | if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) { | |
6911 | type = VM_FAULT_TYPE_OTHER; | |
6912 | break; | |
6913 | } | |
6914 | if (((fault_type & VM_PROT_WRITE) == 0) || | |
6915 | ((level == 0) && object->copy == VM_OBJECT_NULL)) { | |
6916 | type = VM_FAULT_TYPE_MAP_IN; | |
6917 | break; | |
6918 | } | |
6919 | type = VM_FAULT_TYPE_COPY; | |
6920 | break; | |
6921 | } else { | |
6922 | if (object->pager_created) { | |
6923 | type = VM_FAULT_TYPE_PAGER; | |
6924 | break; | |
6925 | } | |
6926 | if (object->shadow == VM_OBJECT_NULL) { | |
6927 | type = VM_FAULT_TYPE_ZERO_FILL; | |
6928 | break; | |
6929 | } | |
6930 | ||
6931 | offset += object->vo_shadow_offset; | |
6932 | object = object->shadow; | |
6933 | level++; | |
6934 | continue; | |
6935 | } | |
6936 | } | |
6937 | ||
6938 | if (level >= VM_FAULT_LEVEL_MAX) { | 
6939 | level = VM_FAULT_LEVEL_MAX - 1; | 
6940 | } | |
6941 | ||
6942 | vm_fault_stats[type][level] += 1; | |
6943 | ||
6944 | return; | |
6945 | } | |
6946 | ||
6947 | /* cleanup routine to call from debugger */ | |
6948 | ||
6949 | void | |
6950 | vm_fault_classify_init(void) | |
6951 | { | |
6952 | int type, level; | |
6953 | ||
6954 | for (type = 0; type < VM_FAULT_TYPES_MAX; type++) { | |
6955 | for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) { | |
6956 | vm_fault_stats[type][level] = 0; | |
6957 | } | |
6958 | } | |
6959 | ||
6960 | return; | |
6961 | } | |
6962 | #endif /* VM_FAULT_CLASSIFY */ | |
6963 | ||
6964 | vm_offset_t | |
6965 | kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr) | |
6966 | { | |
6967 | vm_map_entry_t entry; | |
6968 | vm_object_t object; | |
6969 | vm_offset_t object_offset; | |
6970 | vm_page_t m; | |
6971 | int compressor_external_state, compressed_count_delta; | |
6972 | int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP); | |
6973 | int my_fault_type = VM_PROT_READ; | |
6974 | kern_return_t kr; | |
6975 | int effective_page_mask, effective_page_size; | |
6976 | ||
6977 | if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { | |
6978 | effective_page_mask = VM_MAP_PAGE_MASK(map); | |
6979 | effective_page_size = VM_MAP_PAGE_SIZE(map); | |
6980 | } else { | |
6981 | effective_page_mask = PAGE_MASK; | |
6982 | effective_page_size = PAGE_SIZE; | |
6983 | } | |
6984 | ||
6985 | if (not_in_kdp) { | |
6986 | panic("kdp_lightweight_fault called from outside of debugger context"); | |
6987 | } | |
6988 | ||
6989 | assert(map != VM_MAP_NULL); | |
6990 | ||
6991 | assert((cur_target_addr & effective_page_mask) == 0); | |
6992 | if ((cur_target_addr & effective_page_mask) != 0) { | |
6993 | return 0; | |
6994 | } | |
6995 | ||
6996 | if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) { | |
6997 | return 0; | |
6998 | } | |
6999 | ||
7000 | if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) { | |
7001 | return 0; | |
7002 | } | |
7003 | ||
7004 | if (entry->is_sub_map) { | |
7005 | return 0; | |
7006 | } | |
7007 | ||
7008 | object = VME_OBJECT(entry); | |
7009 | if (object == VM_OBJECT_NULL) { | |
7010 | return 0; | |
7011 | } | |
7012 | ||
7013 | object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry); | |
7014 | ||
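 | /* Walk the shadow chain looking for the page. We're in debugger context, so never block: bail out (return 0) on any exclusively held lock, busy page, or in-flight paging activity. */ | 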
7015 | while (TRUE) { | |
7016 | if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) { | |
7017 | return 0; | |
7018 | } | |
7019 | ||
7020 | if (object->pager_created && (object->paging_in_progress || | |
7021 | object->activity_in_progress)) { | |
7022 | return 0; | |
7023 | } | |
7024 | ||
7025 | m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset)); | |
7026 | ||
7027 | if (m != VM_PAGE_NULL) { | |
7028 | if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) { | |
7029 | return 0; | |
7030 | } | |
7031 | ||
7032 | if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning || | |
7033 | m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) { | |
7034 | return 0; | |
7035 | } | |
7036 | ||
7037 | assert(!m->vmp_private); | |
7038 | if (m->vmp_private) { | |
7039 | return 0; | |
7040 | } | |
7041 | ||
7042 | assert(!m->vmp_fictitious); | |
7043 | if (m->vmp_fictitious) { | |
7044 | return 0; | |
7045 | } | |
7046 | ||
7047 | assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); | |
7048 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { | |
7049 | return 0; | |
7050 | } | |
7051 | ||
7052 | return ptoa(VM_PAGE_GET_PHYS_PAGE(m)); | |
7053 | } | |
7054 | ||
7055 | compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; | |
7056 | ||
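 | /* Not resident: if the compressor pager holds this offset, decompress it into the dedicated KDP scratch page and return that page's physical address. */ | 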
7057 | if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) { | |
7058 | if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) { | |
7059 | kr = vm_compressor_pager_get(object->pager, | |
7060 | vm_object_trunc_page(object_offset + object->paging_offset), | |
7061 | kdp_compressor_decompressed_page_ppnum, &my_fault_type, | |
7062 | compressor_flags, &compressed_count_delta); | |
7063 | if (kr == KERN_SUCCESS) { | |
7064 | return kdp_compressor_decompressed_page_paddr; | |
7065 | } else { | |
7066 | return 0; | |
7067 | } | |
7068 | } | |
7069 | } | |
7070 | ||
7071 | if (object->shadow == VM_OBJECT_NULL) { | |
7072 | return 0; | |
7073 | } | |
7074 | ||
7075 | object_offset += object->vo_shadow_offset; | |
7076 | object = object->shadow; | |
7077 | } | |
7078 | } | |
7079 | ||
7080 | /* | |
7081 | * vm_page_validate_cs_fast(): | |
7082 | * Performs a few quick checks to determine if the page's code signature | |
7083 | * really needs to be fully validated. It could: | |
7084 | * 1. have been modified (i.e. automatically tainted), | |
7085 | * 2. have already been validated, | |
7086 | * 3. have already been found to be tainted, | |
7087 | * 4. no longer have a backing store. | |
7088 | * Returns FALSE if the page needs to be fully validated. | |
7089 | */ | |
7090 | static boolean_t | |
7091 | vm_page_validate_cs_fast( | |
7092 | vm_page_t page, | |
7093 | vm_map_size_t fault_page_size, | |
7094 | vm_map_offset_t fault_phys_offset) | |
7095 | { | |
7096 | vm_object_t object; | |
7097 | ||
7098 | object = VM_PAGE_OBJECT(page); | |
7099 | vm_object_lock_assert_held(object); | |
7100 | ||
7101 | if (page->vmp_wpmapped && | |
7102 | !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) { | |
7103 | /* | |
7104 | * This page was mapped for "write" access sometime in the | |
7105 | * past and could still be modifiable in the future. | |
7106 | * Consider it tainted. | |
7107 | * [ If the page was already found to be "tainted", no | |
7108 | * need to re-validate. ] | |
7109 | */ | |
7110 | vm_object_lock_assert_exclusive(object); | |
7111 | VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE); | |
7112 | VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE); | |
7113 | if (cs_debug) { | |
7114 | printf("CODESIGNING: %s: " | |
7115 | "page %p obj %p off 0x%llx " | |
7116 | "was modified\n", | |
7117 | __FUNCTION__, | |
7118 | page, object, page->vmp_offset); | |
7119 | } | |
7120 | vm_cs_validated_dirtied++; | |
7121 | } | |
7122 | ||
7123 | if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) || | |
7124 | VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) { | |
7125 | return TRUE; | |
7126 | } | |
7127 | vm_object_lock_assert_exclusive(object); | |
7128 | ||
7129 | #if CHECK_CS_VALIDATION_BITMAP | |
7130 | kern_return_t kr; | |
7131 | ||
7132 | kr = vnode_pager_cs_check_validation_bitmap( | |
7133 | object->pager, | |
7134 | page->vmp_offset + object->paging_offset, | |
7135 | CS_BITMAP_CHECK); | |
7136 | if (kr == KERN_SUCCESS) { | |
7137 | page->vmp_cs_validated = VMP_CS_ALL_TRUE; | |
7138 | page->vmp_cs_tainted = VMP_CS_ALL_FALSE; | |
7139 | vm_cs_bitmap_validated++; | |
7140 | return TRUE; | |
7141 | } | |
7142 | #endif /* CHECK_CS_VALIDATION_BITMAP */ | |
7143 | ||
7144 | if (!object->alive || object->terminating || object->pager == NULL) { | |
7145 | /* | |
7146 | * The object is terminating and we don't have its pager | |
7147 | * so we can't validate the data... | |
7148 | */ | |
7149 | return TRUE; | |
7150 | } | |
7151 | ||
7152 | /* we need to really validate this page */ | |
7153 | vm_object_lock_assert_exclusive(object); | |
7154 | return FALSE; | |
7155 | } | |
7156 | ||
7157 | void | |
7158 | vm_page_validate_cs_mapped_slow( | |
7159 | vm_page_t page, | |
7160 | const void *kaddr) | |
7161 | { | |
7162 | vm_object_t object; | |
7163 | memory_object_offset_t mo_offset; | |
7164 | memory_object_t pager; | |
7165 | struct vnode *vnode; | |
7166 | int validated, tainted, nx; | |
7167 | ||
7168 | assert(page->vmp_busy); | |
7169 | object = VM_PAGE_OBJECT(page); | |
7170 | vm_object_lock_assert_exclusive(object); | |
7171 | ||
7172 | vm_cs_validates++; | |
7173 | ||
7174 | /* | |
7175 | * Since we get here to validate a page that was brought in by | |
7176 | * the pager, we know that this pager is all setup and ready | |
7177 | * by now. | |
7178 | */ | |
7179 | assert(object->code_signed); | |
7180 | assert(!object->internal); | |
7181 | assert(object->pager != NULL); | |
7182 | assert(object->pager_ready); | |
7183 | ||
7184 | pager = object->pager; | |
7185 | assert(object->paging_in_progress); | |
7186 | vnode = vnode_pager_lookup_vnode(pager); | |
7187 | mo_offset = page->vmp_offset + object->paging_offset; | |
7188 | ||
7189 | /* verify the SHA1 hash for this page */ | |
7190 | validated = 0; | |
7191 | tainted = 0; | |
7192 | nx = 0; | |
7193 | cs_validate_page(vnode, | |
7194 | pager, | |
7195 | mo_offset, | |
7196 | (const void *)((const char *)kaddr), | |
7197 | &validated, | |
7198 | &tainted, | |
7199 | &nx); | |
7200 | ||
7201 | page->vmp_cs_validated |= validated; | |
7202 | page->vmp_cs_tainted |= tainted; | |
7203 | page->vmp_cs_nx |= nx; | |
7204 | ||
7205 | #if CHECK_CS_VALIDATION_BITMAP | |
7206 | if (page->vmp_cs_validated == VMP_CS_ALL_TRUE && | |
7207 | page->vmp_cs_tainted == VMP_CS_ALL_FALSE) { | |
7208 | vnode_pager_cs_check_validation_bitmap(object->pager, | |
7209 | mo_offset, | |
7210 | CS_BITMAP_SET); | |
7211 | } | |
7212 | #endif /* CHECK_CS_VALIDATION_BITMAP */ | |
7213 | } | |
7214 | ||
7215 | void | |
7216 | vm_page_validate_cs_mapped( | |
7217 | vm_page_t page, | |
7218 | vm_map_size_t fault_page_size, | |
7219 | vm_map_offset_t fault_phys_offset, | |
7220 | const void *kaddr) | |
7221 | { | |
7222 | if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) { | |
7223 | vm_page_validate_cs_mapped_slow(page, kaddr); | |
7224 | } | |
7225 | } | |
7226 | ||
7227 | static void | |
7228 | vm_page_map_and_validate_cs( | |
7229 | vm_object_t object, | |
7230 | vm_page_t page) | |
7231 | { | |
7232 | vm_object_offset_t offset; | |
7233 | vm_map_offset_t koffset; | |
7234 | vm_map_size_t ksize; | |
7235 | vm_offset_t kaddr; | |
7236 | kern_return_t kr; | |
7237 | boolean_t busy_page; | |
7238 | boolean_t need_unmap; | |
7239 | ||
7240 | vm_object_lock_assert_exclusive(object); | |
7241 | ||
7242 | assert(object->code_signed); | |
7243 | offset = page->vmp_offset; | |
7244 | ||
7245 | busy_page = page->vmp_busy; | |
7246 | if (!busy_page) { | |
7247 | /* keep page busy while we map (and unlock) the VM object */ | |
7248 | page->vmp_busy = TRUE; | |
7249 | } | |
7250 | ||
7251 | /* | |
7252 | * Take a paging reference on the VM object | |
7253 | * to protect it from collapse or bypass, | |
7254 | * and keep it from disappearing too. | |
7255 | */ | |
7256 | vm_object_paging_begin(object); | |
7257 | ||
7258 | /* map the page in the kernel address space */ | |
7259 | ksize = PAGE_SIZE_64; | |
7260 | koffset = 0; | |
7261 | need_unmap = FALSE; | |
7262 | kr = vm_paging_map_object(page, | |
7263 | object, | |
7264 | offset, | |
7265 | VM_PROT_READ, | |
7266 | FALSE, /* can't unlock object ! */ | |
7267 | &ksize, | |
7268 | &koffset, | |
7269 | &need_unmap); | |
7270 | if (kr != KERN_SUCCESS) { | |
7271 | panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr); | |
7272 | } | |
7273 | kaddr = CAST_DOWN(vm_offset_t, koffset); | |
7274 | ||
7275 | /* validate the mapped page */ | |
7276 | vm_page_validate_cs_mapped_slow(page, (const void *) kaddr); | |
7277 | ||
7278 | assert(page->vmp_busy); | |
7279 | assert(object == VM_PAGE_OBJECT(page)); | |
7280 | vm_object_lock_assert_exclusive(object); | |
7281 | ||
7282 | if (!busy_page) { | |
7283 | PAGE_WAKEUP_DONE(page); | |
7284 | } | |
7285 | if (need_unmap) { | |
7286 | /* unmap the map from the kernel address space */ | |
7287 | vm_paging_unmap_object(object, koffset, koffset + ksize); | |
7288 | koffset = 0; | |
7289 | ksize = 0; | |
7290 | kaddr = 0; | |
7291 | } | |
7292 | vm_object_paging_end(object); | |
7293 | } | |
7294 | ||
7295 | void | |
7296 | vm_page_validate_cs( | |
7297 | vm_page_t page, | |
7298 | vm_map_size_t fault_page_size, | |
7299 | vm_map_offset_t fault_phys_offset) | |
7300 | { | |
7301 | vm_object_t object; | |
7302 | ||
7303 | object = VM_PAGE_OBJECT(page); | |
7304 | vm_object_lock_assert_held(object); | |
7305 | ||
7306 | if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) { | |
7307 | return; | |
7308 | } | |
7309 | vm_page_map_and_validate_cs(object, page); | |
7310 | } | |
7311 | ||
7312 | void | |
7313 | vm_page_validate_cs_mapped_chunk( | |
7314 | vm_page_t page, | |
7315 | const void *kaddr, | |
7316 | vm_offset_t chunk_offset, | |
7317 | vm_size_t chunk_size, | |
7318 | boolean_t *validated_p, | |
7319 | unsigned *tainted_p) | |
7320 | { | |
7321 | vm_object_t object; | |
7322 | vm_object_offset_t offset, offset_in_page; | |
7323 | memory_object_t pager; | |
7324 | struct vnode *vnode; | |
7325 | boolean_t validated; | |
7326 | unsigned tainted; | |
7327 | ||
7328 | *validated_p = FALSE; | |
7329 | *tainted_p = 0; | |
7330 | ||
7331 | assert(page->vmp_busy); | |
7332 | object = VM_PAGE_OBJECT(page); | |
7333 | vm_object_lock_assert_exclusive(object); | |
7334 | ||
7335 | assert(object->code_signed); | |
7336 | offset = page->vmp_offset; | |
7337 | ||
7338 | if (!object->alive || object->terminating || object->pager == NULL) { | |
7339 | /* | |
7340 | * The object is terminating and we don't have its pager | |
7341 | * so we can't validate the data... | |
7342 | */ | |
7343 | return; | |
7344 | } | |
7345 | /* | |
7346 | * Since we get here to validate a page that was brought in by | |
7347 | * the pager, we know that this pager is all setup and ready | |
7348 | * by now. | |
7349 | */ | |
7350 | assert(!object->internal); | |
7351 | assert(object->pager != NULL); | |
7352 | assert(object->pager_ready); | |
7353 | ||
7354 | pager = object->pager; | |
7355 | assert(object->paging_in_progress); | |
7356 | vnode = vnode_pager_lookup_vnode(pager); | |
7357 | ||
7358 | /* verify the signature for this chunk */ | |
7359 | offset_in_page = chunk_offset; | |
7360 | assert(offset_in_page < PAGE_SIZE); | |
7361 | ||
7362 | tainted = 0; | |
7363 | validated = cs_validate_range(vnode, | |
7364 | pager, | |
7365 | (object->paging_offset + | |
7366 | offset + | |
7367 | offset_in_page), | |
7368 | (const void *)((const char *)kaddr | |
7369 | + offset_in_page), | |
7370 | chunk_size, | |
7371 | &tainted); | |
7372 | if (validated) { | |
7373 | *validated_p = TRUE; | |
7374 | } | |
7375 | if (tainted) { | |
7376 | *tainted_p = tainted; | |
7377 | } | |
7378 | } | |
7379 | ||
7380 | static void | |
7381 | vm_rtfrecord_lock(void) | |
7382 | { | |
7383 | lck_spin_lock(&vm_rtfr_slock); | |
7384 | } | |
7385 | ||
7386 | static void | |
7387 | vm_rtfrecord_unlock(void) | |
7388 | { | |
7389 | lck_spin_unlock(&vm_rtfr_slock); | |
7390 | } | |
7391 | ||
7392 | unsigned int | |
7393 | vmrtfaultinfo_bufsz(void) | |
7394 | { | |
7395 | return vmrtf_num_records * sizeof(vm_rtfault_record_t); | |
7396 | } | |
7397 | ||
7398 | #include <kern/backtrace.h> | |
7399 | ||
7400 | __attribute__((noinline)) | |
7401 | static void | |
7402 | vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault) | |
7403 | { | |
7404 | uint64_t fend = mach_continuous_time(); | |
7405 | ||
7406 | uint64_t cfpc = 0; | |
7407 | uint64_t ctid = cthread->thread_id; | |
7408 | uint64_t cupid = get_current_unique_pid(); | |
7409 | ||
7410 | uintptr_t bpc = 0; | |
7411 | int btr = 0; | |
7412 | bool u64 = false; | |
7413 | ||
7414 | /* Capture a single-frame backtrace; this extracts just the program | |
7415 | * counter at the point of the fault into "bpc", and should perform no | |
7416 | * further user stack traversals, thus avoiding copyin()s and further | |
7417 | * faults. | |
7418 | */ | |
7419 | unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL, false); | |
7420 | ||
7421 | if ((btr == 0) && (bfrs > 0)) { | |
7422 | cfpc = bpc; | |
7423 | } | |
7424 | ||
7425 | assert((fstart != 0) && fend >= fstart); | |
7426 | vm_rtfrecord_lock(); | |
7427 | assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi); | |
7428 | ||
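 | /* The fault records form a ring buffer: take the next slot, wrapping vmrtfr_curi back to 0 once it passes the last valid index. */ | 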
7429 | vmrtfrs.vmrtf_total++; | |
7430 | vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++]; | |
7431 | ||
7432 | cvmr->rtfabstime = fstart; | |
7433 | cvmr->rtfduration = fend - fstart; | |
7434 | cvmr->rtfaddr = fault_vaddr; | |
7435 | cvmr->rtfpc = cfpc; | |
7436 | cvmr->rtftype = type_of_fault; | |
7437 | cvmr->rtfupid = cupid; | |
7438 | cvmr->rtftid = ctid; | |
7439 | ||
7440 | if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) { | |
7441 | vmrtfrs.vmrtfr_curi = 0; | |
7442 | } | |
7443 | ||
7444 | vm_rtfrecord_unlock(); | |
7445 | } | |
7446 | ||
7447 | int | |
7448 | vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv) | |
7449 | { | |
7450 | vm_rtfault_record_t *cvmrd = vrecords; | |
7451 | size_t residue = vrecordsz; | |
7452 | size_t numextracted = 0; | |
7453 | boolean_t early_exit = FALSE; | |
7454 | ||
7455 | vm_rtfrecord_lock(); | |
7456 | ||
7457 | for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) { | |
7458 | if (residue < sizeof(vm_rtfault_record_t)) { | |
7459 | early_exit = TRUE; | |
7460 | break; | |
7461 | } | |
7462 | ||
7463 | if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) { | |
7464 | #if DEVELOPMENT || DEBUG | |
7465 | if (isroot == FALSE) { | |
7466 | continue; | |
7467 | } | |
7468 | #else | |
7469 | continue; | |
7470 | #endif /* DEVDEBUG */ | |
7471 | } | |
7472 | ||
7473 | *cvmrd = vmrtfrs.vm_rtf_records[vmfi]; | |
7474 | cvmrd++; | |
7475 | residue -= sizeof(vm_rtfault_record_t); | |
7476 | numextracted++; | |
7477 | } | |
7478 | ||
7479 | vm_rtfrecord_unlock(); | |
7480 | ||
7481 | *vmrtfrv = numextracted; | |
7482 | return early_exit; | |
7483 | } | |
7484 | ||
7485 | /* | |
7486 | * Only allow one diagnosis to be in flight at a time, to avoid | |
7487 | * creating too much additional memory usage. | |
7488 | */ | |
7489 | static volatile uint_t vmtc_diagnosing; | |
7490 | unsigned int vmtc_total; | |
7491 | unsigned int vmtc_undiagnosed; | |
7492 | unsigned int vmtc_not_eligible; | |
7493 | unsigned int vmtc_copyin_fail; | |
7494 | unsigned int vmtc_not_found; | |
7495 | unsigned int vmtc_one_bit_flip; | |
7496 | unsigned int vmtc_byte_counts[MAX_TRACK_POWER2 + 1]; | |
7497 | ||
7498 | #if DEVELOPMENT || DEBUG | |
7499 | /* | |
7500 | * Keep around the last diagnosed corruption buffers to aid in debugging. | |
7501 | */ | |
7502 | static size_t vmtc_last_buffer_size; | |
7503 | static uint64_t *vmtc_last_before_buffer = NULL; | |
7504 | static uint64_t *vmtc_last_after_buffer = NULL; | |
7505 | #endif /* DEVELOPMENT || DEBUG */ | |
7506 | ||
7507 | /* | |
7508 | * Set things up so we can diagnose a potential text page corruption. | |
7509 | */ | |
7510 | static uint64_t * | |
7511 | vmtc_text_page_diagnose_setup( | |
7512 | vm_map_offset_t code_addr) | |
7513 | { | |
7514 | uint64_t *buffer; | |
7515 | size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE); | |
7516 | ||
7517 | (void)OSAddAtomic(1, &vmtc_total); | |
7518 | ||
7519 | /* | |
7520 | * If another is being diagnosed, skip this one. | |
7521 | */ | |
7522 | if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) { | |
7523 | (void)OSAddAtomic(1, &vmtc_undiagnosed); | |
7524 | return NULL; | |
7525 | } | |
7526 | ||
7527 | /* | |
7528 | * Get the contents of the corrupt page. | |
7529 | */ | |
7530 | buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK); | |
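 | /* size - 1 is the page mask: truncate to the start of the user's page before copying its contents in */ | 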
7531 | if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), buffer, size) != 0) { | |
7532 | /* copyin error, so undo things */ | |
7533 | kheap_free(KHEAP_DEFAULT, buffer, size); | |
7534 | (void)OSAddAtomic(1, &vmtc_undiagnosed); | |
7535 | ++vmtc_copyin_fail; | |
7536 | if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) { | |
7537 | panic("Bad compare and swap in setup!"); | |
7538 | } | |
7539 | return NULL; | |
7540 | } | |
7541 | return buffer; | |
7542 | } | |
7543 | ||
7544 | /* | |
7545 | * Diagnose the text page by comparing its contents with | |
7546 | * the one we've previously saved. | |
7547 | */ | |
7548 | static void | |
7549 | vmtc_text_page_diagnose( | |
7550 | vm_map_offset_t code_addr, | |
7551 | uint64_t *old_code_buffer) | |
7552 | { | |
7553 | uint64_t *new_code_buffer; | |
7554 | size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE); | |
7555 | uint_t count = (uint_t)size / sizeof(uint64_t); | |
7556 | uint_t diff_count = 0; | |
7557 | bool bit_flip = false; | |
7558 | uint_t b; | |
7559 | uint64_t *new; | |
7560 | uint64_t *old; | |
7561 | ||
7562 | new_code_buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK); | |
7563 | if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) { | |
7564 | /* copyin error, so undo things */ | |
7565 | (void)OSAddAtomic(1, &vmtc_undiagnosed); | |
7566 | ++vmtc_copyin_fail; | |
7567 | goto done; | |
7568 | } | |
7569 | ||
7570 | new = new_code_buffer; | |
7571 | old = old_code_buffer; | |
7572 | for (; count-- > 0; ++new, ++old) { | |
7573 | if (*new == *old) { | |
7574 | continue; | |
7575 | } | |
7576 | ||
7577 | /* | |
7578 | * On first diff, check for a single bit flip | |
7579 | */ | |
7580 | if (diff_count == 0) { | |
7581 | uint64_t x = (*new ^ *old); | |
7582 | assert(x != 0); | |
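 | /* x has exactly one bit set, i.e. a single bit flipped, iff (x & (x - 1)) == 0 */ | 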
7583 | if ((x & (x - 1)) == 0) { | |
7584 | bit_flip = true; | |
7585 | ++diff_count; | |
7586 | continue; | |
7587 | } | |
7588 | } | |
7589 | ||
7590 | /* | |
7591 | * count up the number of different bytes. | |
7592 | */ | |
7593 | for (b = 0; b < sizeof(uint64_t); ++b) { | |
7594 | char *n = (char *)new; | |
7595 | char *o = (char *)old; | |
7596 | if (n[b] != o[b]) { | |
7597 | ++diff_count; | |
7598 | } | |
7599 | } | |
7600 | ||
7601 | /* quit counting when too many */ | |
7602 | if (diff_count > (1 << MAX_TRACK_POWER2)) { | |
7603 | break; | |
7604 | } | |
7605 | } | |
7606 | ||
7607 | if (diff_count > 1) { | |
7608 | bit_flip = false; | |
7609 | } | |
7610 | ||
7611 | if (diff_count == 0) { | |
7612 | ++vmtc_not_found; | |
7613 | } else if (bit_flip) { | |
7614 | ++vmtc_one_bit_flip; | |
7615 | ++vmtc_byte_counts[0]; | |
7616 | } else { | |
7617 | for (b = 0; b <= MAX_TRACK_POWER2; ++b) { | |
7618 | if (diff_count <= (1 << b)) { | |
7619 | ++vmtc_byte_counts[b]; | |
7620 | break; | |
7621 | } | |
7622 | } | |
7623 | if (diff_count > (1 << MAX_TRACK_POWER2)) { | |
7624 | ++vmtc_byte_counts[MAX_TRACK_POWER2]; | |
7625 | } | |
7626 | } | |
7627 | ||
7628 | done: | |
7629 | /* | |
7630 | * Free up the code copy buffers, but save the last | |
7631 | * set on development / debug kernels in case they | |
7632 | * can provide evidence for debugging memory stomps. | |
7633 | */ | |
7634 | #if DEVELOPMENT || DEBUG | |
7635 | if (vmtc_last_before_buffer != NULL) { | |
7636 | kheap_free(KHEAP_DEFAULT, vmtc_last_before_buffer, vmtc_last_buffer_size); | |
7637 | } | |
7638 | if (vmtc_last_after_buffer != NULL) { | |
7639 | kheap_free(KHEAP_DEFAULT, vmtc_last_after_buffer, vmtc_last_buffer_size); | |
7640 | } | |
7641 | vmtc_last_before_buffer = old_code_buffer; | |
7642 | vmtc_last_after_buffer = new_code_buffer; | |
7643 | vmtc_last_buffer_size = size; | |
7644 | #else /* DEVELOPMENT || DEBUG */ | |
7645 | kheap_free(KHEAP_DEFAULT, new_code_buffer, size); | |
7646 | kheap_free(KHEAP_DEFAULT, old_code_buffer, size); | |
7647 | #endif /* DEVELOPMENT || DEBUG */ | |
7648 | ||
7649 | /* | |
7650 | * We're finished, so clear the diagnosing flag. | |
7651 | */ | |
7652 | if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) { | |
7653 | panic("Bad compare and swap in diagnose!"); | |
7654 | } | |
7655 | } | |
7656 | ||
7657 | /* | |
7658 | * For the given map, virt address, find the object, offset, and page. | |
7659 | * This has to lookup the map entry, verify protections, walk any shadow chains. | |
7660 | * If found, returns with the object locked. | |
7661 | */ | |
7662 | static kern_return_t | |
7663 | vmtc_revalidate_lookup( | |
7664 | vm_map_t map, | |
7665 | vm_map_offset_t vaddr, | |
7666 | vm_object_t *ret_object, | |
7667 | vm_object_offset_t *ret_offset, | |
7668 | vm_page_t *ret_page) | |
7669 | { | |
7670 | vm_object_t object; | |
7671 | vm_object_offset_t offset; | |
7672 | vm_page_t page; | |
7673 | kern_return_t kr = KERN_SUCCESS; | |
7674 | uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE; | |
7675 | vm_map_version_t version; | |
7676 | boolean_t wired; | |
7677 | struct vm_object_fault_info fault_info = {}; | |
7678 | vm_map_t real_map = NULL; | |
7679 | vm_prot_t prot; | |
7680 | vm_object_t shadow; | |
7681 | ||
7682 | /* | |
7683 | * Find the object/offset for the given location/map. | |
7684 | * Note this returns with the object locked. | |
7685 | */ | |
7686 | restart: | |
7687 | vm_map_lock_read(map); | |
7688 | object = VM_OBJECT_NULL; /* in case we come around the restart path */ | |
7689 | kr = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ, | |
7690 | object_lock_type, &version, &object, &offset, &prot, &wired, | |
7691 | &fault_info, &real_map, NULL); | |
7692 | vm_map_unlock_read(map); | |
7693 | if (real_map != NULL && real_map != map) { | |
7694 | vm_map_unlock(real_map); | |
7695 | } | |
7696 | ||
7697 | /* | |
7698 | * If there's no mapping here, or if we fail because the page | |
7699 | * wasn't mapped executable, we can ignore this. | |
7700 | */ | |
7701 | if (kr != KERN_SUCCESS || | |
7702 | object == NULL || | |
7703 | !(prot & VM_PROT_EXECUTE)) { | |
7704 | kr = KERN_FAILURE; | |
7705 | goto done; | |
7706 | } | |
7707 | ||
7708 | /* | |
7709 | * Chase down any shadow chains to find the actual page. | |
7710 | */ | |
7711 | for (;;) { | |
7712 | /* | |
7713 | * See if the page is on the current object. | |
7714 | */ | |
7715 | page = vm_page_lookup(object, vm_object_trunc_page(offset)); | |
7716 | if (page != NULL) { | |
7717 | /* restart the lookup */ | |
7718 | if (page->vmp_restart) { | |
7719 | vm_object_unlock(object); | |
7720 | goto restart; | |
7721 | } | |
7722 | ||
7723 | /* | |
7724 | * If this page is busy, we need to wait for it. | |
7725 | */ | |
7726 | if (page->vmp_busy) { | |
7727 | PAGE_SLEEP(object, page, TRUE); | |
7728 | vm_object_unlock(object); | |
7729 | goto restart; | |
7730 | } | |
7731 | break; | |
7732 | } | |
7733 | ||
7734 | /* | |
7735 | * If the object doesn't have the page and | |
7736 | * has no shadow, then we can quit. | |
7737 | */ | |
7738 | shadow = object->shadow; | |
7739 | if (shadow == NULL) { | |
7740 | kr = KERN_FAILURE; | |
7741 | goto done; | |
7742 | } | |
7743 | ||
7744 | /* | |
7745 | * Move to the next object | |
7746 | */ | |
7747 | offset += object->vo_shadow_offset; | |
7748 | vm_object_lock(shadow); | |
7749 | vm_object_unlock(object); | |
7750 | object = shadow; | |
7751 | shadow = VM_OBJECT_NULL; | |
7752 | } | |
7753 | *ret_object = object; | |
7754 | *ret_offset = vm_object_trunc_page(offset); | |
7755 | *ret_page = page; | |
7756 | ||
7757 | done: | |
7758 | if (kr != KERN_SUCCESS && object != NULL) { | |
7759 | vm_object_unlock(object); | |
7760 | } | |
7761 | return kr; | |
7762 | } | |
7763 | ||
7764 | /* | |
7765 | * Check if a page is wired, needs extra locking. | |
7766 | */ | |
7767 | static bool | |
7768 | is_page_wired(vm_page_t page) | |
7769 | { | |
7770 | bool result; | |
7771 | vm_page_lock_queues(); | |
7772 | result = VM_PAGE_WIRED(page); | |
7773 | vm_page_unlock_queues(); | |
7774 | return result; | |
7775 | } | |
7776 | ||
7777 | /* | |
7778 | * A fatal process error has occurred in the given task. | |
7779 | * Recheck the code signing of the text page at the given | |
7780 | * address to check for a text page corruption. | |
7781 | * | |
7782 | * Returns KERN_FAILURE if a page was found to be corrupt | |
7783 | * by failing to match its code signature. KERN_SUCCESS | |
7784 | * means the page is either valid or we don't have the | |
7785 | * information to say it's corrupt. | |
7786 | */ | |
7787 | kern_return_t | |
7788 | revalidate_text_page(task_t task, vm_map_offset_t code_addr) | |
7789 | { | |
7790 | kern_return_t kr; | |
7791 | vm_map_t map; | |
7792 | vm_object_t object = NULL; | |
7793 | vm_object_offset_t offset; | |
7794 | vm_page_t page = NULL; | |
7795 | struct vnode *vnode; | |
7796 | bool do_invalidate = false; | |
7797 | uint64_t *diagnose_buffer = NULL; | |
7798 | ||
7799 | map = task->map; | |
7800 | if (task->map == NULL) { | |
7801 | return KERN_SUCCESS; | |
7802 | } | |
7803 | ||
7804 | kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page); | |
7805 | if (kr != KERN_SUCCESS) { | |
7806 | goto done; | |
7807 | } | |
7808 | ||
7809 | /* | |
7810 | * The object needs to have a pager. | |
7811 | */ | |
7812 | if (object->pager == NULL) { | |
7813 | goto done; | |
7814 | } | |
7815 | ||
7816 | /* | |
7817 | * Needs to be a vnode backed page to have a signature. | |
7818 | */ | |
7819 | vnode = vnode_pager_lookup_vnode(object->pager); | |
7820 | if (vnode == NULL) { | |
7821 | goto done; | |
7822 | } | |
7823 | ||
7824 | /* | |
7825 | * Object checks to see if we should proceed. | |
7826 | */ | |
7827 | if (!object->code_signed || /* no code signature to check */ | |
7828 | object->internal || /* internal objects aren't signed */ | |
7829 | object->terminating || /* the object and its pages are already going away */ | |
7830 | !object->pager_ready) { /* this shouldn't happen, but the check doesn't hurt */ | 
7831 | goto done; | |
7832 | } | |
7833 | ||
7834 | /* | |
7835 | * Check the code signature of the page in question. | |
7836 | */ | |
7837 | vm_page_map_and_validate_cs(object, page); | |
7838 | ||
7839 | /* | |
7840 | * At this point: | |
7841 | * vmp_cs_validated |= validated (set if a code signature exists) | |
7842 | * vmp_cs_tainted |= tainted (set if code signature violation) | |
7843 | * vmp_cs_nx |= nx; ?? | |
7844 | * | |
7845 | * if vmp_pmapped then have to pmap_disconnect.. | |
7846 | * other flags to check on object or page? | |
7847 | */ | |
7848 | if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) { | |
7849 | #if DEBUG || DEVELOPMENT | |
7850 | /* | |
7851 | * On development builds, a boot-arg can be used to cause | |
7852 | * a panic, instead of a quiet repair. | |
7853 | */ | |
7854 | if (vmtc_panic_instead) { | |
7855 | panic("Text page corruption detected: vm_page_t 0x%llx\n", (long long)(uintptr_t)page); | |
7856 | } | |
7857 | #endif /* DEBUG || DEVELOPMENT */ | |
7858 | ||
7859 | /* | |
7860 | * We're going to invalidate this page. Mark it as busy so we can | |
7861 | * drop the object lock and use copyin() to save its contents. | |
7862 | */ | |
7863 | do_invalidate = true; | |
7864 | assert(!page->vmp_busy); | |
7865 | page->vmp_busy = TRUE; | |
7866 | vm_object_unlock(object); | |
7867 | diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr); | |
7868 | } | |
7869 | ||
7870 | done: | |
7871 | if (do_invalidate) { | |
7872 | vm_object_lock(object); | |
7873 | assert(page->vmp_busy); | |
7874 | assert(VM_PAGE_OBJECT(page) == object); /* Since the page was busy, this shouldn't change */ | |
7875 | assert(page->vmp_offset == offset); | |
7876 | PAGE_WAKEUP_DONE(page); /* make no longer busy */ | |
7877 | ||
7878 | /* | |
7879 | * Invalidate, i.e. toss, the corrupted page. | |
7880 | */ | |
7881 | if (!page->vmp_cleaning && | |
7882 | !page->vmp_laundry && | |
7883 | !page->vmp_fictitious && | |
7884 | !page->vmp_precious && | |
7885 | !page->vmp_absent && | |
7886 | !page->vmp_error && | |
7887 | !page->vmp_dirty && | |
7888 | !is_page_wired(page)) { | |
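 | /* | |
 |  * If the page still has pmap mappings, remove them all first; | |
 |  * pmap_disconnect() returns the referenced/modified state accumulated | |
 |  * in the pmap, which is folded back into the vm_page_t below before | |
 |  * deciding whether the page is still clean enough to free. | |
 |  */ | |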
7889 | if (page->vmp_pmapped) { | |
7890 | int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page)); | |
7891 | if (refmod & VM_MEM_MODIFIED) { | |
7892 | SET_PAGE_DIRTY(page, FALSE); | |
7893 | } | |
7894 | if (refmod & VM_MEM_REFERENCED) { | |
7895 | page->vmp_reference = TRUE; | |
7896 | } | |
7897 | } | |
7898 | /* If the page seems intentionally modified, don't trash it. */ | |
7899 | if (!page->vmp_dirty) { | |
7900 | VM_PAGE_FREE(page); | |
7901 | } else { | |
7902 | (void)OSAddAtomic(1, &vmtc_not_eligible); | |
7903 | } | |
7904 | } else { | |
7905 | (void)OSAddAtomic(1, &vmtc_not_eligible); | |
7906 | } | |
7907 | vm_object_unlock(object); | |
7908 | ||
7909 | /* | |
7910 | * Now try to diagnose the type of failure by faulting | |
7911 | * in a new copy and diff'ing it with what we saved. | |
7912 | */ | |
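 | /* | |
 |  * diagnose_buffer holds a copy of the corrupted contents, saved via | |
 |  * copyin() before the page was tossed; the diagnose pass compares it | |
 |  * against the freshly paged-in contents to characterize the corruption. | |
 |  */ | |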
7913 | if (diagnose_buffer) { | |
7914 | vmtc_text_page_diagnose(code_addr, diagnose_buffer); | |
7915 | } | |
7916 | return KERN_FAILURE; | |
7917 | } | |
7918 | ||
7919 | if (object != NULL) { | |
7920 | vm_object_unlock(object); | |
7921 | } | |
7922 | return KERN_SUCCESS; | |
7923 | } | |
7924 | ||
7925 | #if DEBUG || DEVELOPMENT | |
7926 | /* | |
7927 | * For implementing unit tests: ask the pmap to corrupt a text page. | |
7928 | * We have to find the page to get its physical address, then invoke | |
7929 | * the pmap. | |
7930 | */ | |
7931 | extern kern_return_t vm_corrupt_text_addr(uintptr_t); | |
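 | /* | |
 |  * Sketch of how a development-build test might drive this (illustrative | |
 |  * names only; the actual sysctl plumbing lives elsewhere and is not | |
 |  * shown here): pick an address inside the test's own __TEXT segment, | |
 |  * e.g. | |
 |  * | |
 |  *	uintptr_t addr = (uintptr_t)&main; | |
 |  * | |
 |  * hand it to the sysctl that ends up calling vm_corrupt_text_addr(addr), | |
 |  * then re-execute the corrupted code and verify that the corruption is | |
 |  * detected and repaired. | |
 |  */ | |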
7932 | ||
7933 | kern_return_t | |
7934 | vm_corrupt_text_addr(uintptr_t va) | |
7935 | { | |
7936 | task_t task = current_task(); | |
7937 | vm_map_t map; | |
7938 | kern_return_t kr = KERN_SUCCESS; | |
7939 | vm_object_t object = VM_OBJECT_NULL; | |
7940 | vm_object_offset_t offset; | |
7941 | vm_page_t page = NULL; | |
7942 | pmap_paddr_t pa; | |
7943 | ||
7944 | map = task->map; | |
7945 | if (map == NULL) { | |
7946 | printf("corrupt_text_addr: no map\n"); | |
7947 | return KERN_FAILURE; | |
7948 | } | |
7949 | ||
7950 | kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page); | |
7951 | if (kr != KERN_SUCCESS) { | |
7952 | printf("corrupt_text_addr: page lookup failed\n"); | |
7953 | return kr; | |
7954 | } | |
7955 | /* get the physical address to use */ | |
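 | /* | |
 |  * ptoa() of the physical page number gives the page's physical base | |
 |  * address; adding (va - vm_object_trunc_page(va)) keeps the byte offset | |
 |  * within the page. Illustrative numbers only: with 16K object pages, | |
 |  * va 0x100004010 truncates to 0x100004000, so pa is the page's physical | |
 |  * base plus 0x10. | |
 |  */ | |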
7956 | pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va)); | |
7957 | ||
7958 | /* | |
7959 | * Check we have something we can work with. | |
7960 | * Due to racing with pageout as we enter the sysctl, | |
7961 | * it's theoretically possible for the page to disappear just | |
7962 | * before the lookup. | |
7963 | * | |
7964 | * That's not expected to happen often. Radar 72857482 tracks | |
7965 | * bubbling that error up to the sysctl result so the test does | |
7966 | * not FAIL in that case. | |
7967 | */ | |
7968 | if (page->vmp_busy) { | |
7969 | printf("corrupt_text_addr: vmp_busy\n"); | |
7970 | kr = KERN_FAILURE; | |
7971 | } | |
7972 | if (page->vmp_cleaning) { | |
7973 | printf("corrupt_text_addr: vmp_cleaning\n"); | |
7974 | kr = KERN_FAILURE; | |
7975 | } | |
7976 | if (page->vmp_laundry) { | |
7977 | printf("corrupt_text_addr: vmp_cleaning\n"); | |
7978 | kr = KERN_FAILURE; | |
7979 | } | |
7980 | if (page->vmp_fictitious) { | |
7981 | printf("corrupt_text_addr: vmp_fictitious\n"); | |
7982 | kr = KERN_FAILURE; | |
7983 | } | |
7984 | if (page->vmp_precious) { | |
7985 | printf("corrupt_text_addr: vmp_precious\n"); | |
7986 | kr = KERN_FAILURE; | |
7987 | } | |
7988 | if (page->vmp_absent) { | |
7989 | printf("corrupt_text_addr: vmp_absent\n"); | |
7990 | kr = KERN_FAILURE; | |
7991 | } | |
7992 | if (page->vmp_error) { | |
7993 | printf("corrupt_text_addr: vmp_error\n"); | |
7994 | kr = KERN_FAILURE; | |
7995 | } | |
7996 | if (page->vmp_dirty) { | |
7997 | printf("corrupt_text_addr: vmp_dirty\n"); | |
7998 | kr = KERN_FAILURE; | |
7999 | } | |
8000 | if (is_page_wired(page)) { | |
8001 | printf("corrupt_text_addr: wired\n"); | |
8002 | kr = KERN_FAILURE; | |
8003 | } | |
8004 | if (!page->vmp_pmapped) { | |
8005 | printf("corrupt_text_addr: !vmp_pmapped\n"); | |
8006 | kr = KERN_FAILURE; | |
8007 | } | |
8008 | ||
8009 | if (kr == KERN_SUCCESS) { | |
8010 | printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa); | |
8011 | kr = pmap_test_text_corruption(pa); | |
8012 | if (kr != KERN_SUCCESS) { | |
8013 | printf("corrupt_text_addr: pmap error %d\n", kr); | |
8014 | } | |
8015 | } else { | |
8016 | printf("corrupt_text_addr: object %p\n", object); | |
8017 | printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset); | |
8018 | printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va); | |
8019 | printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va)); | |
8020 | printf("corrupt_text_addr: vm_page_t %p\n", page); | |
8021 | printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page))); | |
8022 | printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa); | |
8023 | } | |
8024 | ||
8025 | if (object != VM_OBJECT_NULL) { | |
8026 | vm_object_unlock(object); | |
8027 | } | |
8028 | return kr; | |
8029 | } | |
8030 | #endif /* DEBUG || DEVELOPMENT */ |