git.saurik.com Git - apple/xnu.git/blame

Commit	Line	Data
1c79356b	1	/*
f427ee49	2	* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
1c79356b	3	*
2d21ac55	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5ba3f43e	5	*
2d21ac55 A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
5ba3f43e	14	*
2d21ac55 A	15	* Please obtain a copy of the License at
2d21ac55 A	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
5ba3f43e	17	*
2d21ac55 A	18	* The Original Code and all software distributed under the License are
2d21ac55 A	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5 A	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
8f6c56a5 A	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55 A	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
5ba3f43e	25	*
2d21ac55	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b A	27	*/
	28	/*
	29	* @OSF_COPYRIGHT@
	30	*/
5ba3f43e	31	/*
1c79356b A	32	* Mach Operating System
	33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
	34	* All Rights Reserved.
5ba3f43e	35	*
1c79356b A	36	* Permission to use, copy, modify and distribute this software and its
	37	* documentation is hereby granted, provided that both the copyright
	38	* notice and this permission notice appear in all copies of the
	39	* software, derivative works or modified versions, and any portions
	40	* thereof, and that both notices appear in supporting documentation.
5ba3f43e	41	*
1c79356b A	42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
5ba3f43e	45	*
1c79356b	46	* Carnegie Mellon requests users of this software to return to
5ba3f43e	47	*
1c79356b A	48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	49	* School of Computer Science
	50	* Carnegie Mellon University
	51	* Pittsburgh PA 15213-3890
5ba3f43e	52	*
1c79356b A	53	* any improvements or extensions that they make and grant Carnegie Mellon
	54	* the rights to redistribute these changes.
	55	*/
	56	/*
	57	*/
	58	/*
	59	* File: vm_fault.c
	60	* Author: Avadis Tevanian, Jr., Michael Wayne Young
	61	*
	62	* Page fault handling module.
	63	*/
1c79356b A	64
	65	#include <mach_cluster_stats.h>
	66	#include <mach_pagemap.h>
2d21ac55	67	#include <libkern/OSAtomic.h>
1c79356b	68
91447636	69	#include <mach/mach_types.h>
1c79356b	70	#include <mach/kern_return.h>
0a7de745	71	#include <mach/message.h> /* for error codes */
91447636 A	72	#include <mach/vm_param.h>
	73	#include <mach/vm_behavior.h>
	74	#include <mach/memory_object.h>
0a7de745	75	/* For memory_object_data_{request,unlock} */
2d21ac55	76	#include <mach/sdt.h>
91447636 A	77
91447636 A	78	#include <kern/kern_types.h>
1c79356b	79	#include <kern/host_statistics.h>
c3c9b80d	80	#include <kern/counter.h>
1c79356b A	81	#include <kern/task.h>
	82	#include <kern/thread.h>
	83	#include <kern/sched_prim.h>
	84	#include <kern/host.h>
91447636 A	85	#include <kern/mach_param.h>
	86	#include <kern/macro_help.h>
	87	#include <kern/zalloc.h>
	88	#include <kern/misc_protos.h>
39037602	89	#include <kern/policy_internal.h>
91447636	90
39236c6e A	91	#include <vm/vm_compressor.h>
39236c6e A	92	#include <vm/vm_compressor_pager.h>
91447636	93	#include <vm/vm_fault.h>
1c79356b A	94	#include <vm/vm_map.h>
	95	#include <vm/vm_object.h>
	96	#include <vm/vm_page.h>
55e303ae	97	#include <vm/vm_kern.h>
1c79356b A	98	#include <vm/pmap.h>
1c79356b A	99	#include <vm/vm_pageout.h>
91447636	100	#include <vm/vm_protos.h>
2d21ac55 A	101	#include <vm/vm_external.h>
2d21ac55 A	102	#include <vm/memory_object.h>
0a7de745	103	#include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
6d2010ae	104	#include <vm/vm_shared_region.h>
1c79356b	105
39236c6e	106	#include <sys/codesign.h>
39037602 A	107	#include <sys/reason.h>
39037602 A	108	#include <sys/signalvar.h>
39236c6e	109
5ba3f43e	110	#include <san/kasan.h>
15129b1c	111
0a7de745	112	#define VM_FAULT_CLASSIFY 0
1c79356b	113
2d21ac55	114	#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
1c79356b	115
cb323159 A	116	int vm_protect_privileged_from_untrusted = 1;
cb323159 A	117
0a7de745	118	unsigned int vm_object_pagein_throttle = 16;
1c79356b	119
b0d623f7	120	/*
5ba3f43e	121	* We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
b0d623f7 A	122	* kicks in when swap space runs out. 64-bit programs have massive address spaces and can leak enormous amounts
	123	* of memory if they're buggy and can run the system completely out of swap space. If this happens, we
	124	* impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
5ba3f43e	125	* keep the UI active so that the user has a chance to kill the offending task before the system
b0d623f7 A	126	* completely hangs.
	127	*
	128	* The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
	129	* to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
	130	* will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
	131	* delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
	132	*/
	133
99c3a104	134	extern void throttle_lowpri_io(int);
b0d623f7	135
39037602 A	136	extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
39037602 A	137
b0d623f7 A	138	uint64_t vm_hard_throttle_threshold;
b0d623f7 A	139
#if DEBUG || DEVELOPMENT
/* Overridden by the "text_corruption_panic" boot-arg in vm_fault_init(). */
static bool vmtc_panic_instead = false;
#endif /* DEBUG || DEVELOPMENT */
b0d623f7	143
bca245ac A	144	OS_ALWAYS_INLINE
	145	boolean_t
	146	NEED_TO_HARD_THROTTLE_THIS_TASK(void)
	147	{
	148	return vm_wants_task_throttled(current_task()) \|\|
	149	((vm_page_free_count < vm_page_throttle_limit \|\|
	150	HARD_THROTTLE_LIMIT_REACHED()) &&
	151	proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
	152	}
b0d623f7	153
0a7de745 A	154	#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */
0a7de745 A	155	#define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */
04b8595b	156
0a7de745 A	157	#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6
0a7de745 A	158	#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000
04b8595b	159
1c79356b	160
/*
 * Account for one decompression: bump the vm_statistics_decompressions
 * counter and the calling thread's own "decompressions" tally.
 */
#define VM_STAT_DECOMPRESSIONS()                        \
MACRO_BEGIN                                             \
	counter_inc(&vm_statistics_decompressions);     \
	current_thread()->decompressions += 1;          \
MACRO_END
	166
b0d623f7	167	boolean_t current_thread_aborted(void);
91447636	168
1c79356b	169	/* Forward declarations of internal routines. */
3e170ce0	170	static kern_return_t vm_fault_wire_fast(
0a7de745 A	171	vm_map_t map,
	172	vm_map_offset_t va,
	173	vm_prot_t prot,
	174	vm_tag_t wire_tag,
	175	vm_map_entry_t entry,
	176	pmap_t pmap,
	177	vm_map_offset_t pmap_addr,
	178	ppnum_t *physpage_p);
1c79356b	179
3e170ce0	180	static kern_return_t vm_fault_internal(
0a7de745 A	181	vm_map_t map,
	182	vm_map_offset_t vaddr,
	183	vm_prot_t caller_prot,
	184	boolean_t change_wiring,
	185	vm_tag_t wire_tag,
	186	int interruptible,
	187	pmap_t pmap,
	188	vm_map_offset_t pmap_addr,
	189	ppnum_t *physpage_p);
3e170ce0 A	190
3e170ce0 A	191	static void vm_fault_copy_cleanup(
0a7de745 A	192	vm_page_t page,
0a7de745 A	193	vm_page_t top_page);
1c79356b	194
3e170ce0	195	static void vm_fault_copy_dst_cleanup(
0a7de745	196	vm_page_t page);
1c79356b	197
0a7de745 A	198	#if VM_FAULT_CLASSIFY
	199	extern void vm_fault_classify(vm_object_t object,
	200	vm_object_offset_t offset,
	201	vm_prot_t fault_type);
1c79356b A	202
	203	extern void vm_fault_classify_init(void);
	204	#endif
	205
d1ecb069	206	unsigned long vm_pmap_enter_blocked = 0;
316670eb	207	unsigned long vm_pmap_enter_retried = 0;
4a3eedf9 A	208
	209	unsigned long vm_cs_validates = 0;
	210	unsigned long vm_cs_revalidates = 0;
	211	unsigned long vm_cs_query_modified = 0;
	212	unsigned long vm_cs_validated_dirtied = 0;
6d2010ae	213	unsigned long vm_cs_bitmap_validated = 0;
593a1d5f	214
cb323159	215	void vm_pre_fault(vm_map_offset_t, vm_prot_t);
fe8ab488	216
3e170ce0	217	extern char *kdp_compressor_decompressed_page;
0a7de745 A	218	extern addr64_t kdp_compressor_decompressed_page_paddr;
0a7de745 A	219	extern ppnum_t kdp_compressor_decompressed_page_ppnum;
3e170ce0	220
d9a64523 A	221	struct vmrtfr {
	222	int vmrtfr_maxi;
	223	int vmrtfr_curi;
	224	int64_t vmrtf_total;
	225	vm_rtfault_record_t *vm_rtf_records;
	226	} vmrtfrs;
	227	#define VMRTF_DEFAULT_BUFSIZE (4096)
	228	#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
f427ee49	229	TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
d9a64523 A	230
	231	static void vm_rtfrecord_lock(void);
	232	static void vm_rtfrecord_unlock(void);
	233	static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
	234
d9a64523 A	235	extern lck_grp_t vm_page_lck_grp_bucket;
d9a64523 A	236	extern lck_attr_t vm_page_lck_attr;
f427ee49	237	LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
d9a64523	238
1c79356b A	239	/*
	240	* Routine: vm_fault_init
	241	* Purpose:
	242	* Initialize our private data structures.
	243	*/
f427ee49	244	__startup_func
1c79356b A	245	void
	246	vm_fault_init(void)
	247	{
39236c6e A	248	int i, vm_compressor_temp;
39236c6e A	249	boolean_t need_default_val = TRUE;
b0d623f7 A	250	/*
	251	* Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
	252	* computed as a percentage of available memory, and the percentage used is scaled inversely with
39236c6e	253	* the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
b0d623f7 A	254	* and reduce the value down to 10% for very large memory configurations. This helps give us a
	255	* definition of a memory hog that makes more sense relative to the amount of ram in the machine.
	256	* The formula here simply uses the number of gigabytes of ram to adjust the percentage.
	257	*/
	258
0a7de745	259	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
39236c6e A	260
	261	/*
	262	* Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
	263	*/
	264
0a7de745 A	265	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
0a7de745 A	266	for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
f427ee49	267	if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
39236c6e A	268	need_default_val = FALSE;
	269	vm_compressor_mode = vm_compressor_temp;
	270	break;
	271	}
	272	}
0a7de745	273	if (need_default_val) {
39236c6e	274	printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
0a7de745	275	}
5ba3f43e	276	}
39236c6e A	277	if (need_default_val) {
	278	/* If no boot arg or incorrect boot arg, try device tree. */
	279	PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
	280	}
39236c6e	281	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
cb323159	282
f427ee49 A	283	PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
	284	&vm_protect_privileged_from_untrusted,
	285	sizeof(vm_protect_privileged_from_untrusted));
c3c9b80d A	286
	287	#if DEBUG \|\| DEVELOPMENT
	288	(void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
	289	#endif /* DEBUG \|\| DEVELOPMENT */
1c79356b A	290	}
1c79356b A	291
f427ee49 A	292	__startup_func
f427ee49 A	293	static void
0a7de745 A	294	vm_rtfault_record_init(void)
0a7de745 A	295	{
f427ee49	296	size_t size;
d9a64523	297
d9a64523	298	vmrtf_num_records = MAX(vmrtf_num_records, 1);
f427ee49 A	299	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
	300	vmrtfrs.vm_rtf_records = zalloc_permanent(size,
	301	ZALIGN(vm_rtfault_record_t));
d9a64523	302	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
d9a64523	303	}
f427ee49 A	304	STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
f427ee49 A	305
1c79356b A	306	/*
	307	* Routine: vm_fault_cleanup
	308	* Purpose:
	309	* Clean up the result of vm_fault_page.
	310	* Results:
	311	* The paging reference for "object" is released.
	312	* "object" is unlocked.
	313	* If "top_page" is not null, "top_page" is
	314	* freed and the paging reference for the object
	315	* containing it is released.
	316	*
	317	* In/out conditions:
	318	* "object" must be locked.
	319	*/
	320	void
	321	vm_fault_cleanup(
0a7de745 A	322	vm_object_t object,
0a7de745 A	323	vm_page_t top_page)
1c79356b A	324	{
1c79356b A	325	vm_object_paging_end(object);
0a7de745	326	vm_object_unlock(object);
1c79356b A	327
1c79356b A	328	if (top_page != VM_PAGE_NULL) {
0a7de745	329	object = VM_PAGE_OBJECT(top_page);
2d21ac55 A	330
	331	vm_object_lock(object);
	332	VM_PAGE_FREE(top_page);
	333	vm_object_paging_end(object);
	334	vm_object_unlock(object);
1c79356b A	335	}
	336	}
	337
55e303ae A	338	#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
	339
	340
0a7de745	341	boolean_t vm_page_deactivate_behind = TRUE;
5ba3f43e A	342	/*
5ba3f43e A	343	* default sizes given VM_BEHAVIOR_DEFAULT reference behavior
1c79356b	344	*/
0a7de745 A	345	#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
0a7de745 A	346	#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
b0d623f7 A	347	/* we use it to size an array on the stack */
	348
	349	int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
55e303ae	350
0a7de745	351	#define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
2d21ac55 A	352
	353	/*
	354	* vm_page_is_sequential
	355	*
	356	* Determine if sequential access is in progress
	357	* in accordance with the behavior specified.
	358	* Update state to indicate current access pattern.
	359	*
	360	* object must have at least the shared lock held
	361	*/
	362	static
	363	void
	364	vm_fault_is_sequential(
0a7de745 A	365	vm_object_t object,
	366	vm_object_offset_t offset,
	367	vm_behavior_t behavior)
2d21ac55	368	{
0a7de745 A	369	vm_object_offset_t last_alloc;
	370	int sequential;
	371	int orig_sequential;
2d21ac55	372
0a7de745	373	last_alloc = object->last_alloc;
2d21ac55 A	374	sequential = object->sequential;
	375	orig_sequential = sequential;
	376
f427ee49 A	377	offset = vm_object_trunc_page(offset);
	378	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
	379	/* re-faulting in the same page: no change in behavior */
	380	return;
	381	}
	382
2d21ac55 A	383	switch (behavior) {
2d21ac55 A	384	case VM_BEHAVIOR_RANDOM:
0a7de745	385	/*
2d21ac55 A	386	* reset indicator of sequential behavior
2d21ac55 A	387	*/
0a7de745 A	388	sequential = 0;
0a7de745 A	389	break;
2d21ac55 A	390
2d21ac55 A	391	case VM_BEHAVIOR_SEQUENTIAL:
0a7de745 A	392	if (offset && last_alloc == offset - PAGE_SIZE_64) {
0a7de745 A	393	/*
2d21ac55 A	394	* advance indicator of sequential behavior
2d21ac55 A	395	*/
0a7de745 A	396	if (sequential < MAX_SEQUENTIAL_RUN) {
	397	sequential += PAGE_SIZE;
	398	}
2d21ac55	399	} else {
0a7de745	400	/*
2d21ac55 A	401	* reset indicator of sequential behavior
2d21ac55 A	402	*/
0a7de745	403	sequential = 0;
2d21ac55	404	}
0a7de745	405	break;
2d21ac55 A	406
2d21ac55 A	407	case VM_BEHAVIOR_RSEQNTL:
0a7de745 A	408	if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
0a7de745 A	409	/*
2d21ac55 A	410	* advance indicator of sequential behavior
2d21ac55 A	411	*/
0a7de745 A	412	if (sequential > -MAX_SEQUENTIAL_RUN) {
	413	sequential -= PAGE_SIZE;
	414	}
2d21ac55	415	} else {
0a7de745	416	/*
2d21ac55 A	417	* reset indicator of sequential behavior
2d21ac55 A	418	*/
0a7de745	419	sequential = 0;
2d21ac55	420	}
0a7de745	421	break;
2d21ac55 A	422
	423	case VM_BEHAVIOR_DEFAULT:
	424	default:
0a7de745 A	425	if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
0a7de745 A	426	/*
2d21ac55 A	427	* advance indicator of sequential behavior
2d21ac55 A	428	*/
0a7de745 A	429	if (sequential < 0) {
	430	sequential = 0;
	431	}
	432	if (sequential < MAX_SEQUENTIAL_RUN) {
	433	sequential += PAGE_SIZE;
	434	}
2d21ac55	435	} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
0a7de745	436	/*
2d21ac55 A	437	* advance indicator of sequential behavior
2d21ac55 A	438	*/
0a7de745 A	439	if (sequential > 0) {
	440	sequential = 0;
	441	}
	442	if (sequential > -MAX_SEQUENTIAL_RUN) {
	443	sequential -= PAGE_SIZE;
	444	}
2d21ac55	445	} else {
0a7de745	446	/*
2d21ac55 A	447	* reset indicator of sequential behavior
2d21ac55 A	448	*/
0a7de745	449	sequential = 0;
2d21ac55	450	}
0a7de745	451	break;
2d21ac55 A	452	}
2d21ac55 A	453	if (sequential != orig_sequential) {
0a7de745 A	454	if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
0a7de745 A	455	/*
2d21ac55 A	456	* if someone else has already updated object->sequential
	457	* don't bother trying to update it or object->last_alloc
	458	*/
0a7de745	459	return;
2d21ac55 A	460	}
	461	}
	462	/*
	463	* I'd like to do this with a OSCompareAndSwap64, but that
	464	* doesn't exist for PPC... however, it shouldn't matter
	465	* that much... last_alloc is maintained so that we can determine
	466	* if a sequential access pattern is taking place... if only
	467	* one thread is banging on this object, no problem with the unprotected
	468	* update... if 2 or more threads are banging away, we run the risk of
	469	* someone seeing a mangled update... however, in the face of multiple
	470	* accesses, no sequential access pattern can develop anyway, so we
	471	* haven't lost any real info.
	472	*/
	473	object->last_alloc = offset;
	474	}
	475
	476
b0d623f7 A	477	int vm_page_deactivate_behind_count = 0;
b0d623f7 A	478
55e303ae	479	/*
2d21ac55 A	480	* vm_page_deactivate_behind
	481	*
	482	* Determine if sequential access is in progress
	483	* in accordance with the behavior specified. If
	484	* so, compute a potential page to deactivate and
	485	* deactivate it.
55e303ae	486	*
2d21ac55	487	* object must be locked.
55e303ae	488	*
2d21ac55	489	* return TRUE if we actually deactivate a page
55e303ae A	490	*/
	491	static
	492	boolean_t
	493	vm_fault_deactivate_behind(
0a7de745 A	494	vm_object_t object,
	495	vm_object_offset_t offset,
	496	vm_behavior_t behavior)
55e303ae	497	{
0a7de745 A	498	int n;
	499	int pages_in_run = 0;
	500	int max_pages_in_run = 0;
	501	int sequential_run;
	502	int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
	503	vm_object_offset_t run_offset = 0;
	504	vm_object_offset_t pg_offset = 0;
	505	vm_page_t m;
	506	vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
55e303ae	507
b0d623f7	508	pages_in_run = 0;
55e303ae	509	#if TRACEFAULTPAGE
0a7de745	510	dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
55e303ae	511	#endif
f427ee49	512	if (object == kernel_object \|\| vm_page_deactivate_behind == FALSE \|\| (vm_object_trunc_page(offset) != offset)) {
91447636 A	513	/*
	514	* Do not deactivate pages from the kernel object: they
	515	* are not intended to become pageable.
2d21ac55	516	* or we've disabled the deactivate behind mechanism
f427ee49 A	517	* or we are dealing with an offset that is not aligned to
	518	* the system's PAGE_SIZE because in that case we will
	519	* handle the deactivation on the aligned offset and, thus,
	520	* the full PAGE_SIZE page once. This helps us avoid the redundant
	521	* deactivates and the extra faults.
91447636 A	522	*/
	523	return FALSE;
	524	}
2d21ac55	525	if ((sequential_run = object->sequential)) {
0a7de745 A	526	if (sequential_run < 0) {
	527	sequential_behavior = VM_BEHAVIOR_RSEQNTL;
	528	sequential_run = 0 - sequential_run;
	529	} else {
	530	sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
	531	}
2d21ac55	532	}
55e303ae A	533	switch (behavior) {
55e303ae A	534	case VM_BEHAVIOR_RANDOM:
55e303ae A	535	break;
55e303ae A	536	case VM_BEHAVIOR_SEQUENTIAL:
0a7de745	537	if (sequential_run >= (int)PAGE_SIZE) {
b0d623f7 A	538	run_offset = 0 - PAGE_SIZE_64;
	539	max_pages_in_run = 1;
	540	}
55e303ae A	541	break;
55e303ae A	542	case VM_BEHAVIOR_RSEQNTL:
0a7de745	543	if (sequential_run >= (int)PAGE_SIZE) {
b0d623f7 A	544	run_offset = PAGE_SIZE_64;
	545	max_pages_in_run = 1;
	546	}
55e303ae A	547	break;
	548	case VM_BEHAVIOR_DEFAULT:
	549	default:
0a7de745	550	{ vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
2d21ac55	551
0a7de745	552	/*
2d21ac55 A	553	* determine if the run of sequential accesss has been
	554	* long enough on an object with default access behavior
	555	* to consider it for deactivation
	556	*/
b0d623f7 A	557	if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
	558	/*
	559	* the comparisons between offset and behind are done
	560	* in this kind of odd fashion in order to prevent wrap around
	561	* at the end points
	562	*/
0a7de745 A	563	if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
0a7de745 A	564	if (offset >= behind) {
b0d623f7 A	565	run_offset = 0 - behind;
	566	pg_offset = PAGE_SIZE_64;
	567	max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
	568	}
2d21ac55	569	} else {
0a7de745	570	if (offset < -behind) {
b0d623f7 A	571	run_offset = behind;
	572	pg_offset = 0 - PAGE_SIZE_64;
	573	max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
	574	}
2d21ac55	575	}
55e303ae	576	}
0a7de745	577	break;}
2d21ac55	578	}
0a7de745	579	for (n = 0; n < max_pages_in_run; n++) {
b0d623f7 A	580	m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
b0d623f7 A	581
d9a64523	582	if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
b0d623f7	583	page_run[pages_in_run++] = m;
39236c6e A	584
	585	/*
	586	* by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
	587	*
	588	* a TLB flush isn't really needed here since at worst we'll miss the reference bit being
	589	* updated in the PTE if a remote processor still has this mapping cached in its TLB when the
	590	* new reference happens. If no futher references happen on the page after that remote TLB flushes
	591	* we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
	592	* by pageout_scan, which is just fine since the last reference would have happened quite far
	593	* in the past (TLB caches don't hang around for very long), and of course could just as easily
	594	* have happened before we did the deactivate_behind.
	595	*/
39037602	596	pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
b0d623f7 A	597	}
	598	}
	599	if (pages_in_run) {
	600	vm_page_lockspin_queues();
	601
	602	for (n = 0; n < pages_in_run; n++) {
b0d623f7 A	603	m = page_run[n];
	604
	605	vm_page_deactivate_internal(m, FALSE);
	606
	607	vm_page_deactivate_behind_count++;
55e303ae	608	#if TRACEFAULTPAGE
0a7de745	609	dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
55e303ae A	610	#endif
55e303ae A	611	}
b0d623f7 A	612	vm_page_unlock_queues();
	613
	614	return TRUE;
55e303ae A	615	}
	616	return FALSE;
	617	}
1c79356b	618
1c79356b	619
#if (DEVELOPMENT || DEBUG)
/* Statistics maintained by vm_page_throttled(); only built on DEVELOPMENT/DEBUG kernels. */
uint32_t vm_page_creation_throttled_hard = 0;   /* times a hard throttle delay was handed out */
uint32_t vm_page_creation_throttled_soft = 0;   /* times a soft throttle delay was handed out */
uint64_t vm_page_creation_throttle_avoided = 0; /* times throttling was skipped because nobody was waiting for free pages */
#endif /* DEVELOPMENT || DEBUG */
04b8595b A	625
6d2010ae	626	static int
04b8595b	627	vm_page_throttled(boolean_t page_kept)
b0d623f7	628	{
0a7de745 A	629	clock_sec_t elapsed_sec;
	630	clock_sec_t tv_sec;
	631	clock_usec_t tv_usec;
5ba3f43e	632
b0d623f7	633	thread_t thread = current_thread();
5ba3f43e	634
0a7de745 A	635	if (thread->options & TH_OPT_VMPRIV) {
	636	return 0;
	637	}
b0d623f7	638
04b8595b A	639	if (thread->t_page_creation_throttled) {
04b8595b A	640	thread->t_page_creation_throttled = 0;
5ba3f43e	641
0a7de745	642	if (page_kept == FALSE) {
04b8595b	643	goto no_throttle;
0a7de745	644	}
04b8595b A	645	}
	646	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
	647	#if (DEVELOPMENT \|\| DEBUG)
	648	thread->t_page_creation_throttled_hard++;
	649	OSAddAtomic(1, &vm_page_creation_throttled_hard);
	650	#endif /* DEVELOPMENT \|\| DEBUG */
0a7de745	651	return HARD_THROTTLE_DELAY;
04b8595b	652	}
b0d623f7	653
39037602	654	if ((vm_page_free_count < vm_page_throttle_limit \|\| (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
04b8595b	655	thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
3e170ce0 A	656	if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
	657	#if (DEVELOPMENT \|\| DEBUG)
	658	OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
	659	#endif
	660	goto no_throttle;
	661	}
b0d623f7 A	662	clock_get_system_microtime(&tv_sec, &tv_usec);
	663
	664	elapsed_sec = tv_sec - thread->t_page_creation_time;
	665
04b8595b A	666	if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS \|\|
04b8595b A	667	(thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
04b8595b	668	if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
b0d623f7 A	669	/*
	670	* we'll reset our stats to give a well behaved app
	671	* that was unlucky enough to accumulate a bunch of pages
	672	* over a long period of time a chance to get out of
	673	* the throttled state... we reset the counter and timestamp
	674	* so that if it stays under the rate limit for the next second
5ba3f43e	675	* it will be back in our good graces... if it exceeds it, it
b0d623f7 A	676	* will remain in the throttled state
	677	*/
	678	thread->t_page_creation_time = tv_sec;
04b8595b	679	thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
b0d623f7	680	}
d9a64523	681	VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
b0d623f7	682
04b8595b A	683	thread->t_page_creation_throttled = 1;
04b8595b A	684
39037602	685	if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
04b8595b A	686	#if (DEVELOPMENT \|\| DEBUG)
	687	thread->t_page_creation_throttled_hard++;
	688	OSAddAtomic(1, &vm_page_creation_throttled_hard);
	689	#endif /* DEVELOPMENT \|\| DEBUG */
0a7de745	690	return HARD_THROTTLE_DELAY;
04b8595b A	691	} else {
	692	#if (DEVELOPMENT \|\| DEBUG)
	693	thread->t_page_creation_throttled_soft++;
	694	OSAddAtomic(1, &vm_page_creation_throttled_soft);
	695	#endif /* DEVELOPMENT \|\| DEBUG */
0a7de745	696	return SOFT_THROTTLE_DELAY;
04b8595b	697	}
b0d623f7 A	698	}
	699	thread->t_page_creation_time = tv_sec;
	700	thread->t_page_creation_count = 0;
	701	}
04b8595b A	702	no_throttle:
	703	thread->t_page_creation_count++;
	704
0a7de745	705	return 0;
b0d623f7 A	706	}
b0d623f7 A	707
3e170ce0	708
2d21ac55 A	709	/*
	710	* check for various conditions that would
	711	* prevent us from creating a ZF page...
	712	* cleanup is based on being called from vm_fault_page
	713	*
	714	* object must be locked
d9a64523	715	* object == m->vmp_object
2d21ac55 A	716	*/
2d21ac55 A	717	static vm_fault_return_t
d9a64523	718	vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
2d21ac55	719	{
6d2010ae A	720	int throttle_delay;
6d2010ae A	721
0a7de745	722	if (object->shadow_severed \|\|
b0d623f7	723	VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
0a7de745	724	/*
b0d623f7 A	725	* Either:
	726	* 1. the shadow chain was severed,
	727	* 2. the purgeable object is volatile or empty and is marked
	728	* to fault on access while volatile.
	729	* Just have to return an error at this point
2d21ac55	730	*/
0a7de745 A	731	if (m != VM_PAGE_NULL) {
	732	VM_PAGE_FREE(m);
	733	}
2d21ac55 A	734	vm_fault_cleanup(object, first_m);
	735
	736	thread_interrupt_level(interruptible_state);
	737
0a7de745	738	return VM_FAULT_MEMORY_ERROR;
2d21ac55	739	}
3e170ce0 A	740	if (page_throttle == TRUE) {
	741	if ((throttle_delay = vm_page_throttled(FALSE))) {
	742	/*
	743	* we're throttling zero-fills...
	744	* treat this as if we couldn't grab a page
	745	*/
0a7de745	746	if (m != VM_PAGE_NULL) {
3e170ce0	747	VM_PAGE_FREE(m);
0a7de745	748	}
3e170ce0	749	vm_fault_cleanup(object, first_m);
2d21ac55	750
3e170ce0	751	VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
b0d623f7	752
3e170ce0	753	delay(throttle_delay);
b0d623f7	754
3e170ce0 A	755	if (current_thread_aborted()) {
	756	thread_interrupt_level(interruptible_state);
	757	return VM_FAULT_INTERRUPTED;
	758	}
6d2010ae	759	thread_interrupt_level(interruptible_state);
2d21ac55	760
0a7de745	761	return VM_FAULT_MEMORY_SHORTAGE;
3e170ce0	762	}
2d21ac55	763	}
0a7de745	764	return VM_FAULT_SUCCESS;
2d21ac55 A	765	}
2d21ac55 A	766
f427ee49 A	767	/*
	768	* Clear the code signing bits on the given page_t
	769	*/
	770	static void
	771	vm_fault_cs_clear(vm_page_t m)
	772	{
	773	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
	774	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
	775	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
	776	}
	777
	778	/*
	779	* Enqueues the given page on the throttled queue.
	780	* The caller must hold the vm_page_queue_lock and it will be held on return.
	781	*/
	782	static void
	783	vm_fault_enqueue_throttled_locked(vm_page_t m)
	784	{
	785	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	786	assert(!VM_PAGE_WIRED(m));
	787
	788	/*
	789	* can't be on the pageout queue since we don't
	790	* have a pager to try and clean to
	791	*/
	792	vm_page_queues_remove(m, TRUE);
	793	vm_page_check_pageable_safe(m);
	794	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
	795	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
	796	vm_page_throttled_count++;
	797	}
2d21ac55 A	798
	799	/*
	800	* do the work to zero fill a page and
	801	* inject it into the correct paging queue
	802	*
d9a64523	803	* m->vmp_object must be locked
2d21ac55 A	804	* page queue lock must NOT be held
	805	*/
	806	static int
	807	vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
	808	{
0a7de745 A	809	int my_fault = DBG_ZERO_FILL_FAULT;
0a7de745 A	810	vm_object_t object;
39037602 A	811
39037602 A	812	object = VM_PAGE_OBJECT(m);
2d21ac55 A	813
	814	/*
	815	* This is is a zero-fill page fault...
	816	*
	817	* Checking the page lock is a waste of
	818	* time; this page was absent, so
	819	* it can't be page locked by a pager.
	820	*
	821	* we also consider it undefined
	822	* with respect to instruction
	823	* execution. i.e. it is the responsibility
	824	* of higher layers to call for an instruction
	825	* sync after changing the contents and before
5ba3f43e	826	* sending a program into this area. We
2d21ac55 A	827	* choose this approach for performance
2d21ac55 A	828	*/
f427ee49	829	vm_fault_cs_clear(m);
d9a64523	830	m->vmp_pmapped = TRUE;
2d21ac55	831
6d2010ae A	832	if (no_zero_fill == TRUE) {
6d2010ae A	833	my_fault = DBG_NZF_PAGE_FAULT;
fe8ab488	834
0a7de745 A	835	if (m->vmp_absent && m->vmp_busy) {
	836	return my_fault;
	837	}
6d2010ae	838	} else {
2d21ac55 A	839	vm_page_zero_fill(m);
2d21ac55 A	840
c3c9b80d	841	counter_inc(&vm_statistics_zero_fill_count);
2d21ac55 A	842	DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
2d21ac55 A	843	}
d9a64523	844	assert(!m->vmp_laundry);
39037602	845	assert(object != kernel_object);
d9a64523	846	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
39037602	847	if (!VM_DYNAMIC_PAGING_ENABLED() &&
0a7de745 A	848	(object->purgable == VM_PURGABLE_DENY \|\|
	849	object->purgable == VM_PURGABLE_NONVOLATILE \|\|
	850	object->purgable == VM_PURGABLE_VOLATILE)) {
b0d623f7	851	vm_page_lockspin_queues();
39037602	852	if (!VM_DYNAMIC_PAGING_ENABLED()) {
f427ee49	853	vm_fault_enqueue_throttled_locked(m);
39236c6e	854	}
2d21ac55	855	vm_page_unlock_queues();
2d21ac55	856	}
0a7de745	857	return my_fault;
2d21ac55 A	858	}
	859
	860
1c79356b A	861	/*
	862	* Routine: vm_fault_page
	863	* Purpose:
	864	* Find the resident page for the virtual memory
	865	* specified by the given virtual memory object
	866	* and offset.
	867	* Additional arguments:
	868	* The required permissions for the page is given
	869	* in "fault_type". Desired permissions are included
2d21ac55	870	* in "protection".
5ba3f43e	871	* fault_info is passed along to determine pagein cluster
2d21ac55 A	872	* limits... it contains the expected reference pattern,
2d21ac55 A	873	* cluster size if available, etc...
1c79356b A	874	*
	875	* If the desired page is known to be resident (for
	876	* example, because it was previously wired down), asserting
	877	* the "unwiring" parameter will speed the search.
	878	*
	879	* If the operation can be interrupted (by thread_abort
	880	* or thread_terminate), then the "interruptible"
	881	* parameter should be asserted.
	882	*
	883	* Results:
	884	* The page containing the proper data is returned
	885	* in "result_page".
	886	*
	887	* In/out conditions:
	888	* The source object must be locked and referenced,
	889	* and must donate one paging reference. The reference
	890	* is not affected. The paging reference and lock are
	891	* consumed.
	892	*
	893	* If the call succeeds, the object in which "result_page"
	894	* resides is left locked and holding a paging reference.
	895	* If this is not the original object, a busy page in the
	896	* original object is returned in "top_page", to prevent other
	897	* callers from pursuing this same data, along with a paging
	898	* reference for the original object. The "top_page" should
	899	* be destroyed when this guarantee is no longer required.
	900	* The "result_page" is also left busy. It is not removed
	901	* from the pageout queues.
b0d623f7	902	* Special Case:
5ba3f43e	903	* A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
b0d623f7	904	* fault succeeded but there's no VM page (i.e. the VM object
0a7de745	905	* does not actually hold VM pages, but device memory or
b0d623f7 A	906	* large pages). The object is still locked and we still hold a
b0d623f7 A	907	* paging_in_progress reference.
1c79356b	908	*/
/*
 * Incremented by vm_fault_page() each time it has finished sleeping
 * for an object's "blocked_access" condition to clear.
 */
b0d623f7	909	unsigned int vm_fault_page_blocked_access = 0;
/*
 * Incremented by vm_fault_page() each time it frees its placeholder page
 * in the first object (object->copy == first_object) and sets
 * force_fault_retry before issuing the pager data request.
 */
316670eb	910	unsigned int vm_fault_page_forced_retry = 0;
1c79356b A	911
	912	vm_fault_return_t
	913	vm_fault_page(
	914	/* Arguments: */
0a7de745 A	915	vm_object_t first_object, /* Object to begin search */
	916	vm_object_offset_t first_offset, /* Offset into object */
	917	vm_prot_t fault_type, /* What access is requested */
	918	boolean_t must_be_resident,/* Must page be resident? */
	919	boolean_t caller_lookup, /* caller looked up page */
1c79356b	920	/* Modifies in place: */
0a7de745 A	921	vm_prot_t protection, / Protection for mapping */
0a7de745 A	922	vm_page_t result_page, / Page found, if successful */
39236c6e	923	/* Returns: */
0a7de745 A	924	vm_page_t top_page, / Page in top object, if
0a7de745 A	925	* not result_page. */
1c79356b	926	int type_of_fault, / if non-null, fill in with type of fault
0a7de745	927	* COW, zero-fill, etc... returned in trace point */
1c79356b	928	/* More arguments: */
0a7de745 A	929	kern_return_t error_code, / code if page is in error */
	930	boolean_t no_zero_fill, /* don't zero fill absent pages */
	931	boolean_t data_supply, /* treat as data_supply if
	932	* it is a write fault and a full
	933	* page is provided */
2d21ac55	934	vm_object_fault_info_t fault_info)
1c79356b	935	{
0a7de745 A	936	vm_page_t m;
	937	vm_object_t object;
	938	vm_object_offset_t offset;
	939	vm_page_t first_m;
	940	vm_object_t next_object;
	941	vm_object_t copy_object;
	942	boolean_t look_for_page;
	943	boolean_t force_fault_retry = FALSE;
	944	vm_prot_t access_required = fault_type;
	945	vm_prot_t wants_copy_flag;
	946	kern_return_t wait_result;
	947	wait_interrupt_t interruptible_state;
	948	boolean_t data_already_requested = FALSE;
	949	vm_behavior_t orig_behavior;
	950	vm_size_t orig_cluster_size;
	951	vm_fault_return_t error;
	952	int my_fault;
	953	uint32_t try_failed_count;
	954	int interruptible; /* how may fault be interrupted? */
	955	int external_state = VM_EXTERNAL_STATE_UNKNOWN;
	956	memory_object_t pager;
	957	vm_fault_return_t retval;
	958	int grab_options;
1c79356b	959
1c79356b	960	/*
5ba3f43e	961	* MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
39037602	962	* marked as paged out in the compressor pager or the pager doesn't exist.
5ba3f43e A	963	* Note also that if the pager for an internal object
5ba3f43e A	964	* has not been created, the pager is not invoked regardless of the value
39037602	965	* of MUST_ASK_PAGER().
1c79356b A	966	*
1c79356b A	967	* PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
39037602	968	* is marked as paged out in the compressor pager.
1c79356b A	969	* PAGED_OUT() is used to determine if a page has already been pushed
	970	* into a copy object in order to avoid a redundant page out operation.
	971	*/
0a7de745	972	#define MUST_ASK_PAGER(o, f, s) \
39236c6e	973	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
39037602	974
39236c6e A	975	#define PAGED_OUT(o, f) \
39236c6e A	976	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
1c79356b A	977
	978	/*
	979	* Recovery actions
	980	*/
0a7de745 A	981	#define RELEASE_PAGE(m) \
	982	MACRO_BEGIN \
	983	PAGE_WAKEUP_DONE(m); \
	984	if ( !VM_PAGE_PAGEABLE(m)) { \
	985	vm_page_lockspin_queues(); \
	986	if ( !VM_PAGE_PAGEABLE(m)) { \
	987	if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
	988	vm_page_deactivate(m); \
	989	else \
	990	vm_page_activate(m); \
	991	} \
	992	vm_page_unlock_queues(); \
	993	} \
1c79356b A	994	MACRO_END
	995
	996	#if TRACEFAULTPAGE
0a7de745	997	dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
1c79356b A	998	#endif
1c79356b A	999
2d21ac55	1000	interruptible = fault_info->interruptible;
9bccf70c	1001	interruptible_state = thread_interrupt_level(interruptible);
5ba3f43e	1002
1c79356b A	1003	/*
	1004	* INVARIANTS (through entire routine):
	1005	*
	1006	* 1) At all times, we must either have the object
	1007	* lock or a busy page in some object to prevent
	1008	* some other thread from trying to bring in
	1009	* the same page.
	1010	*
	1011	* Note that we cannot hold any locks during the
	1012	* pager access or when waiting for memory, so
	1013	* we use a busy page then.
	1014	*
1c79356b A	1015	* 2) To prevent another thread from racing us down the
	1016	* shadow chain and entering a new page in the top
	1017	* object before we do, we must keep a busy page in
	1018	* the top object while following the shadow chain.
	1019	*
	1020	* 3) We must increment paging_in_progress on any object
2d21ac55 A	1021	* for which we have a busy page before dropping
2d21ac55 A	1022	* the object lock
1c79356b A	1023	*
	1024	* 4) We leave busy pages on the pageout queues.
	1025	* If the pageout daemon comes across a busy page,
	1026	* it will remove the page from the pageout queues.
	1027	*/
	1028
1c79356b A	1029	object = first_object;
	1030	offset = first_offset;
	1031	first_m = VM_PAGE_NULL;
	1032	access_required = fault_type;
	1033
1c79356b	1034	/*
2d21ac55	1035	* default type of fault
1c79356b	1036	*/
2d21ac55	1037	my_fault = DBG_CACHE_HIT_FAULT;
1c79356b A	1038
	1039	while (TRUE) {
	1040	#if TRACEFAULTPAGE
0a7de745	1041	dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b	1042	#endif
39037602 A	1043
	1044	grab_options = 0;
	1045	#if CONFIG_SECLUDED_MEMORY
	1046	if (object->can_grab_secluded) {
	1047	grab_options \|= VM_PAGE_GRAB_SECLUDED;
	1048	}
	1049	#endif /* CONFIG_SECLUDED_MEMORY */
	1050
1c79356b	1051	if (!object->alive) {
0a7de745	1052	/*
2d21ac55 A	1053	* object is no longer valid
	1054	* clean up and return error
	1055	*/
1c79356b	1056	vm_fault_cleanup(object, first_m);
9bccf70c	1057	thread_interrupt_level(interruptible_state);
2d21ac55	1058
0a7de745	1059	return VM_FAULT_MEMORY_ERROR;
1c79356b	1060	}
2d21ac55	1061
b0d623f7 A	1062	if (!object->pager_created && object->phys_contiguous) {
	1063	/*
	1064	* A physically-contiguous object without a pager:
	1065	* must be a "large page" object. We do not deal
	1066	* with VM pages for this object.
	1067	*/
39236c6e	1068	caller_lookup = FALSE;
b0d623f7 A	1069	m = VM_PAGE_NULL;
	1070	goto phys_contig_object;
	1071	}
	1072
	1073	if (object->blocked_access) {
	1074	/*
	1075	* Access to this VM object has been blocked.
	1076	* Replace our "paging_in_progress" reference with
	1077	* an "activity_in_progress" reference and wait for
	1078	* access to be unblocked.
	1079	*/
39236c6e	1080	caller_lookup = FALSE; /* no longer valid after sleep */
b0d623f7 A	1081	vm_object_activity_begin(object);
	1082	vm_object_paging_end(object);
	1083	while (object->blocked_access) {
	1084	vm_object_sleep(object,
0a7de745 A	1085	VM_OBJECT_EVENT_UNBLOCKED,
0a7de745 A	1086	THREAD_UNINT);
b0d623f7 A	1087	}
	1088	vm_fault_page_blocked_access++;
	1089	vm_object_paging_begin(object);
	1090	vm_object_activity_end(object);
	1091	}
	1092
2d21ac55 A	1093	/*
	1094	* See whether the page at 'offset' is resident
	1095	*/
39236c6e A	1096	if (caller_lookup == TRUE) {
	1097	/*
	1098	* The caller has already looked up the page
	1099	* and gave us the result in "result_page".
	1100	* We can use this for the first lookup but
	1101	* it loses its validity as soon as we unlock
	1102	* the object.
	1103	*/
	1104	m = *result_page;
	1105	caller_lookup = FALSE; /* no longer valid after that */
	1106	} else {
f427ee49	1107	m = vm_page_lookup(object, vm_object_trunc_page(offset));
39236c6e	1108	}
1c79356b	1109	#if TRACEFAULTPAGE
0a7de745	1110	dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1c79356b A	1111	#endif
1c79356b A	1112	if (m != VM_PAGE_NULL) {
d9a64523	1113	if (m->vmp_busy) {
0a7de745	1114	/*
2d21ac55 A	1115	* The page is being brought in,
2d21ac55 A	1116	* wait for it and then retry.
2d21ac55	1117	*/
1c79356b	1118	#if TRACEFAULTPAGE
0a7de745	1119	dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b	1120	#endif
316670eb	1121	wait_result = PAGE_SLEEP(object, m, interruptible);
1c79356b	1122
316670eb A	1123	if (wait_result != THREAD_AWAKENED) {
	1124	vm_fault_cleanup(object, first_m);
	1125	thread_interrupt_level(interruptible_state);
6d2010ae	1126
0a7de745 A	1127	if (wait_result == THREAD_RESTART) {
	1128	return VM_FAULT_RETRY;
	1129	} else {
	1130	return VM_FAULT_INTERRUPTED;
	1131	}
1c79356b	1132	}
316670eb	1133	continue;
1c79356b	1134	}
d9a64523 A	1135	if (m->vmp_laundry) {
d9a64523 A	1136	m->vmp_free_when_done = FALSE;
1c79356b	1137
0a7de745	1138	if (!m->vmp_cleaning) {
316670eb	1139	vm_pageout_steal_laundry(m, FALSE);
0a7de745	1140	}
316670eb	1141	}
39037602	1142	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
91447636	1143	/*
2d21ac55	1144	* Guard page: off limits !
91447636	1145	*/
2d21ac55 A	1146	if (fault_type == VM_PROT_NONE) {
	1147	/*
	1148	* The fault is not requesting any
	1149	* access to the guard page, so it must
	1150	* be just to wire or unwire it.
	1151	* Let's pretend it succeeded...
	1152	*/
d9a64523	1153	m->vmp_busy = TRUE;
2d21ac55 A	1154	*result_page = m;
	1155	assert(first_m == VM_PAGE_NULL);
	1156	*top_page = first_m;
0a7de745	1157	if (type_of_fault) {
2d21ac55	1158	*type_of_fault = DBG_GUARD_FAULT;
0a7de745	1159	}
99c3a104	1160	thread_interrupt_level(interruptible_state);
2d21ac55 A	1161	return VM_FAULT_SUCCESS;
	1162	} else {
	1163	/*
	1164	* The fault requests access to the
	1165	* guard page: let's deny that !
	1166	*/
	1167	vm_fault_cleanup(object, first_m);
	1168	thread_interrupt_level(interruptible_state);
	1169	return VM_FAULT_MEMORY_ERROR;
	1170	}
91447636	1171	}
1c79356b	1172
d9a64523	1173	if (m->vmp_error) {
0a7de745	1174	/*
2d21ac55 A	1175	* The page is in error, give up now.
2d21ac55 A	1176	*/
1c79356b	1177	#if TRACEFAULTPAGE
0a7de745	1178	dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1c79356b	1179	#endif
0a7de745 A	1180	if (error_code) {
	1181	*error_code = KERN_MEMORY_ERROR;
	1182	}
1c79356b	1183	VM_PAGE_FREE(m);
2d21ac55	1184
1c79356b	1185	vm_fault_cleanup(object, first_m);
9bccf70c	1186	thread_interrupt_level(interruptible_state);
1c79356b	1187
0a7de745	1188	return VM_FAULT_MEMORY_ERROR;
2d21ac55	1189	}
d9a64523	1190	if (m->vmp_restart) {
0a7de745	1191	/*
2d21ac55 A	1192	* The pager wants us to restart
	1193	* at the top of the chain,
	1194	* typically because it has moved the
	1195	* page to another pager, then do so.
	1196	*/
1c79356b	1197	#if TRACEFAULTPAGE
0a7de745	1198	dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b A	1199	#endif
1c79356b A	1200	VM_PAGE_FREE(m);
2d21ac55	1201
1c79356b	1202	vm_fault_cleanup(object, first_m);
9bccf70c	1203	thread_interrupt_level(interruptible_state);
1c79356b	1204
0a7de745	1205	return VM_FAULT_RETRY;
2d21ac55	1206	}
d9a64523	1207	if (m->vmp_absent) {
0a7de745	1208	/*
2d21ac55 A	1209	* The page isn't busy, but is absent,
	1210	* therefore it's deemed "unavailable".
	1211	*
1c79356b A	1212	* Remove the non-existent page (unless it's
	1213	* in the top object) and move on down to the
	1214	* next object (if there is one).
	1215	*/
	1216	#if TRACEFAULTPAGE
0a7de745	1217	dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1c79356b	1218	#endif
1c79356b	1219	next_object = object->shadow;
1c79356b	1220
2d21ac55	1221	if (next_object == VM_OBJECT_NULL) {
1c79356b A	1222	/*
	1223	* Absent page at bottom of shadow
	1224	* chain; zero fill the page we left
2d21ac55 A	1225	* busy in the first object, and free
2d21ac55 A	1226	* the absent page.
1c79356b	1227	*/
2d21ac55	1228	assert(!must_be_resident);
55e303ae A	1229
55e303ae A	1230	/*
2d21ac55 A	1231	* check for any conditions that prevent
2d21ac55 A	1232	* us from creating a new zero-fill page
5ba3f43e	1233	* vm_fault_check will do all of the
2d21ac55 A	1234	* fault cleanup in the case of an error condition
2d21ac55 A	1235	* including resetting the thread_interrupt_level
55e303ae	1236	*/
04b8595b	1237	error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
55e303ae	1238
0a7de745 A	1239	if (error != VM_FAULT_SUCCESS) {
	1240	return error;
	1241	}
55e303ae	1242
1c79356b	1243	if (object != first_object) {
0a7de745	1244	/*
2d21ac55 A	1245	* free the absent page we just found
2d21ac55 A	1246	*/
1c79356b	1247	VM_PAGE_FREE(m);
2d21ac55 A	1248
	1249	/*
	1250	* drop reference and lock on current object
	1251	*/
1c79356b A	1252	vm_object_paging_end(object);
1c79356b A	1253	vm_object_unlock(object);
2d21ac55 A	1254
2d21ac55 A	1255	/*
5ba3f43e	1256	* grab the original page we
2d21ac55 A	1257	* 'soldered' in place and
	1258	* retake lock on 'first_object'
	1259	*/
1c79356b A	1260	m = first_m;
1c79356b A	1261	first_m = VM_PAGE_NULL;
1c79356b	1262
2d21ac55 A	1263	object = first_object;
2d21ac55 A	1264	offset = first_offset;
0b4e3aa0	1265
1c79356b	1266	vm_object_lock(object);
9bccf70c	1267	} else {
0a7de745	1268	/*
2d21ac55 A	1269	* we're going to use the absent page we just found
	1270	* so convert it to a 'busy' page
	1271	*/
0a7de745	1272	m->vmp_absent = FALSE;
d9a64523	1273	m->vmp_busy = TRUE;
0b4e3aa0	1274	}
0a7de745	1275	if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
d9a64523	1276	m->vmp_absent = TRUE;
0a7de745	1277	}
2d21ac55 A	1278	/*
	1279	* zero-fill the page and put it on
	1280	* the correct paging queue
	1281	*/
	1282	my_fault = vm_fault_zero_page(m, no_zero_fill);
	1283
1c79356b A	1284	break;
1c79356b A	1285	} else {
0a7de745	1286	if (must_be_resident) {
1c79356b	1287	vm_object_paging_end(object);
0a7de745	1288	} else if (object != first_object) {
1c79356b A	1289	vm_object_paging_end(object);
	1290	VM_PAGE_FREE(m);
	1291	} else {
	1292	first_m = m;
d9a64523 A	1293	m->vmp_absent = FALSE;
d9a64523 A	1294	m->vmp_busy = TRUE;
1c79356b	1295
2d21ac55	1296	vm_page_lockspin_queues();
39037602	1297	vm_page_queues_remove(m, FALSE);
1c79356b A	1298	vm_page_unlock_queues();
1c79356b A	1299	}
2d21ac55	1300
6d2010ae A	1301	offset += object->vo_shadow_offset;
	1302	fault_info->lo_offset += object->vo_shadow_offset;
	1303	fault_info->hi_offset += object->vo_shadow_offset;
1c79356b	1304	access_required = VM_PROT_READ;
2d21ac55	1305
1c79356b A	1306	vm_object_lock(next_object);
	1307	vm_object_unlock(object);
	1308	object = next_object;
	1309	vm_object_paging_begin(object);
5ba3f43e	1310
2d21ac55 A	1311	/*
	1312	* reset to default type of fault
	1313	*/
	1314	my_fault = DBG_CACHE_HIT_FAULT;
	1315
1c79356b A	1316	continue;
	1317	}
	1318	}
d9a64523	1319	if ((m->vmp_cleaning)
2d21ac55 A	1320	&& ((object != first_object) \|\| (object->copy != VM_OBJECT_NULL))
2d21ac55 A	1321	&& (fault_type & VM_PROT_WRITE)) {
1c79356b A	1322	/*
	1323	* This is a copy-on-write fault that will
	1324	* cause us to revoke access to this page, but
	1325	* this page is in the process of being cleaned
	1326	* in a clustered pageout. We must wait until
	1327	* the cleaning operation completes before
	1328	* revoking access to the original page,
	1329	* otherwise we might attempt to remove a
	1330	* wired mapping.
	1331	*/
	1332	#if TRACEFAULTPAGE
0a7de745	1333	dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1c79356b	1334	#endif
2d21ac55 A	1335	/*
	1336	* take an extra ref so that object won't die
	1337	*/
	1338	vm_object_reference_locked(object);
	1339
1c79356b	1340	vm_fault_cleanup(object, first_m);
5ba3f43e	1341
1c79356b A	1342	vm_object_lock(object);
1c79356b A	1343	assert(object->ref_count > 0);
2d21ac55	1344
f427ee49	1345	m = vm_page_lookup(object, vm_object_trunc_page(offset));
2d21ac55	1346
d9a64523	1347	if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1c79356b	1348	PAGE_ASSERT_WAIT(m, interruptible);
2d21ac55	1349
1c79356b	1350	vm_object_unlock(object);
9bccf70c	1351	wait_result = thread_block(THREAD_CONTINUE_NULL);
1c79356b	1352	vm_object_deallocate(object);
2d21ac55	1353
1c79356b A	1354	goto backoff;
	1355	} else {
	1356	vm_object_unlock(object);
2d21ac55	1357
1c79356b	1358	vm_object_deallocate(object);
9bccf70c	1359	thread_interrupt_level(interruptible_state);
2d21ac55	1360
0a7de745	1361	return VM_FAULT_RETRY;
1c79356b A	1362	}
1c79356b A	1363	}
d9a64523	1364	if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
b0d623f7	1365	!(fault_info != NULL && fault_info->stealth)) {
0a7de745	1366	/*
2d21ac55 A	1367	* If we were passed a non-NULL pointer for
	1368	* "type_of_fault", then we came from
	1369	* vm_fault... we'll let it deal with
	1370	* this condition, since it
d9a64523	1371	* needs to see m->vmp_speculative to correctly
2d21ac55 A	1372	* account the pageins, otherwise...
	1373	* take it off the speculative queue, we'll
	1374	* let the caller of vm_fault_page deal
	1375	* with getting it onto the correct queue
b0d623f7 A	1376	*
	1377	* If the caller specified in fault_info that
	1378	* it wants a "stealth" fault, we also leave
	1379	* the page in the speculative queue.
2d21ac55	1380	*/
0a7de745 A	1381	vm_page_lockspin_queues();
0a7de745 A	1382	if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
39037602	1383	vm_page_queues_remove(m, FALSE);
0a7de745 A	1384	}
0a7de745 A	1385	vm_page_unlock_queues();
2d21ac55	1386	}
39037602	1387	assert(object == VM_PAGE_OBJECT(m));
1c79356b	1388
39037602	1389	if (object->code_signed) {
2d21ac55 A	1390	/*
	1391	* CODE SIGNING:
	1392	* We just paged in a page from a signed
	1393	* memory object but we don't need to
	1394	* validate it now. We'll validate it if and
	1395	* when it gets mapped into a user address
	1396	* space for the first time or when the page
	1397	* gets copied to another object as a result
	1398	* of a copy-on-write.
	1399	*/
1c79356b	1400	}
2d21ac55	1401
1c79356b	1402	/*
2d21ac55 A	1403	* We mark the page busy and leave it on
	1404	* the pageout queues. If the pageout
	1405	* daemon comes across it, then it will
	1406	* remove the page from the queue, but not the object
1c79356b	1407	*/
1c79356b	1408	#if TRACEFAULTPAGE
0a7de745	1409	dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b	1410	#endif
d9a64523 A	1411	assert(!m->vmp_busy);
d9a64523 A	1412	assert(!m->vmp_absent);
2d21ac55	1413
d9a64523	1414	m->vmp_busy = TRUE;
1c79356b A	1415	break;
1c79356b A	1416	}
5ba3f43e	1417
1c79356b	1418
2d21ac55 A	1419	/*
	1420	* we get here when there is no page present in the object at
	1421	* the offset we're interested in... we'll allocate a page
	1422	* at this point if the pager associated with
	1423	* this object can provide the data or we're the top object...
	1424	* object is locked; m == NULL
	1425	*/
5ba3f43e	1426
39236c6e A	1427	if (must_be_resident) {
	1428	if (fault_type == VM_PROT_NONE &&
	1429	object == kernel_object) {
	1430	/*
	1431	* We've been called from vm_fault_unwire()
	1432	* while removing a map entry that was allocated
	1433	* with KMA_KOBJECT and KMA_VAONLY. This page
	1434	* is not present and there's nothing more to
	1435	* do here (nothing to unwire).
	1436	*/
	1437	vm_fault_cleanup(object, first_m);
	1438	thread_interrupt_level(interruptible_state);
	1439
	1440	return VM_FAULT_MEMORY_ERROR;
	1441	}
	1442
316670eb	1443	goto dont_look_for_page;
39236c6e	1444	}
5ba3f43e A	1445
	1446	/* Don't expect to fault pages into the kernel object. */
	1447	assert(object != kernel_object);
	1448
39236c6e	1449	data_supply = FALSE;
39236c6e	1450
0a7de745	1451	look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
5ba3f43e	1452
1c79356b	1453	#if TRACEFAULTPAGE
0a7de745	1454	dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1c79356b	1455	#endif
316670eb	1456	if (!look_for_page && object == first_object && !object->phys_contiguous) {
1c79356b	1457	/*
316670eb	1458	* Allocate a new page for this object/offset pair as a placeholder
1c79356b	1459	*/
39037602	1460	m = vm_page_grab_options(grab_options);
1c79356b	1461	#if TRACEFAULTPAGE
0a7de745	1462	dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1c79356b A	1463	#endif
	1464	if (m == VM_PAGE_NULL) {
	1465	vm_fault_cleanup(object, first_m);
9bccf70c	1466	thread_interrupt_level(interruptible_state);
2d21ac55	1467
0a7de745	1468	return VM_FAULT_MEMORY_SHORTAGE;
1c79356b	1469	}
316670eb A	1470
316670eb A	1471	if (fault_info && fault_info->batch_pmap_op == TRUE) {
f427ee49 A	1472	vm_page_insert_internal(m, object,
	1473	vm_object_trunc_page(offset),
	1474	VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
316670eb	1475	} else {
f427ee49	1476	vm_page_insert(m, object, vm_object_trunc_page(offset));
316670eb	1477	}
1c79356b	1478	}
316670eb	1479	if (look_for_page) {
0a7de745 A	1480	kern_return_t rc;
0a7de745 A	1481	int my_fault_type;
1c79356b A	1482
	1483	/*
	1484	* If the memory manager is not ready, we
	1485	* cannot make requests.
	1486	*/
	1487	if (!object->pager_ready) {
	1488	#if TRACEFAULTPAGE
0a7de745	1489	dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b	1490	#endif
0a7de745 A	1491	if (m != VM_PAGE_NULL) {
	1492	VM_PAGE_FREE(m);
	1493	}
2d21ac55	1494
2d21ac55 A	1495	/*
	1496	* take an extra ref so object won't die
	1497	*/
	1498	vm_object_reference_locked(object);
1c79356b	1499	vm_fault_cleanup(object, first_m);
2d21ac55	1500
1c79356b A	1501	vm_object_lock(object);
1c79356b A	1502	assert(object->ref_count > 0);
2d21ac55	1503
1c79356b	1504	if (!object->pager_ready) {
2d21ac55 A	1505	wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
2d21ac55 A	1506
1c79356b	1507	vm_object_unlock(object);
0a7de745	1508	if (wait_result == THREAD_WAITING) {
9bccf70c	1509	wait_result = thread_block(THREAD_CONTINUE_NULL);
0a7de745	1510	}
1c79356b	1511	vm_object_deallocate(object);
2d21ac55	1512
1c79356b A	1513	goto backoff;
	1514	} else {
	1515	vm_object_unlock(object);
	1516	vm_object_deallocate(object);
9bccf70c	1517	thread_interrupt_level(interruptible_state);
1c79356b	1518
0a7de745	1519	return VM_FAULT_RETRY;
0b4e3aa0	1520	}
0b4e3aa0	1521	}
2d21ac55	1522	if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1c79356b	1523	/*
2d21ac55 A	1524	* If there are too many outstanding page
	1525	* requests pending on this external object, we
	1526	* wait for them to be resolved now.
1c79356b	1527	*/
1c79356b	1528	#if TRACEFAULTPAGE
0a7de745	1529	dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b	1530	#endif
0a7de745	1531	if (m != VM_PAGE_NULL) {
1c79356b	1532	VM_PAGE_FREE(m);
0a7de745	1533	}
1c79356b	1534	/*
2d21ac55	1535	* take an extra ref so object won't die
1c79356b	1536	*/
2d21ac55	1537	vm_object_reference_locked(object);
1c79356b	1538
1c79356b	1539	vm_fault_cleanup(object, first_m);
2d21ac55	1540
1c79356b A	1541	vm_object_lock(object);
1c79356b A	1542	assert(object->ref_count > 0);
2d21ac55	1543
6d2010ae	1544	if (object->paging_in_progress >= vm_object_pagein_throttle) {
0a7de745	1545	vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
2d21ac55	1546
1c79356b	1547	vm_object_unlock(object);
9bccf70c	1548	wait_result = thread_block(THREAD_CONTINUE_NULL);
1c79356b	1549	vm_object_deallocate(object);
2d21ac55	1550
1c79356b A	1551	goto backoff;
	1552	} else {
	1553	vm_object_unlock(object);
	1554	vm_object_deallocate(object);
9bccf70c	1555	thread_interrupt_level(interruptible_state);
2d21ac55	1556
0a7de745	1557	return VM_FAULT_RETRY;
1c79356b A	1558	}
1c79356b A	1559	}
39037602	1560	if (object->internal) {
fe8ab488	1561	int compressed_count_delta;
39236c6e	1562
39037602 A	1563	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
39037602 A	1564
39236c6e A	1565	if (m == VM_PAGE_NULL) {
	1566	/*
	1567	* Allocate a new page for this object/offset pair as a placeholder
	1568	*/
39037602	1569	m = vm_page_grab_options(grab_options);
39236c6e	1570	#if TRACEFAULTPAGE
0a7de745	1571	dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
39236c6e A	1572	#endif
39236c6e A	1573	if (m == VM_PAGE_NULL) {
39236c6e A	1574	vm_fault_cleanup(object, first_m);
	1575	thread_interrupt_level(interruptible_state);
	1576
0a7de745	1577	return VM_FAULT_MEMORY_SHORTAGE;
39236c6e A	1578	}
39236c6e A	1579
d9a64523	1580	m->vmp_absent = TRUE;
39236c6e	1581	if (fault_info && fault_info->batch_pmap_op == TRUE) {
f427ee49	1582	vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
39236c6e	1583	} else {
f427ee49	1584	vm_page_insert(m, object, vm_object_trunc_page(offset));
39236c6e A	1585	}
39236c6e A	1586	}
d9a64523	1587	assert(m->vmp_busy);
5ba3f43e	1588
d9a64523	1589	m->vmp_absent = TRUE;
39236c6e A	1590	pager = object->pager;
39236c6e A	1591
fe8ab488	1592	assert(object->paging_in_progress > 0);
39236c6e A	1593	vm_object_unlock(object);
39236c6e A	1594
fe8ab488 A	1595	rc = vm_compressor_pager_get(
	1596	pager,
	1597	offset + object->paging_offset,
39037602	1598	VM_PAGE_GET_PHYS_PAGE(m),
fe8ab488 A	1599	&my_fault_type,
	1600	0,
	1601	&compressed_count_delta);
39236c6e	1602
04b8595b	1603	if (type_of_fault == NULL) {
0a7de745	1604	int throttle_delay;
04b8595b A	1605
	1606	/*
	1607	* we weren't called from vm_fault, so we
	1608	* need to apply page creation throttling
	1609	* do it before we re-acquire any locks
	1610	*/
	1611	if (my_fault_type == DBG_COMPRESSOR_FAULT) {
	1612	if ((throttle_delay = vm_page_throttled(TRUE))) {
	1613	VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
	1614	delay(throttle_delay);
	1615	}
	1616	}
	1617	}
39236c6e	1618	vm_object_lock(object);
fe8ab488 A	1619	assert(object->paging_in_progress > 0);
	1620
	1621	vm_compressor_pager_count(
	1622	pager,
	1623	compressed_count_delta,
	1624	FALSE, /* shared_lock */
	1625	object);
39236c6e A	1626
	1627	switch (rc) {
	1628	case KERN_SUCCESS:
d9a64523 A	1629	m->vmp_absent = FALSE;
d9a64523 A	1630	m->vmp_dirty = TRUE;
39037602	1631	if ((object->wimg_bits &
0a7de745	1632	VM_WIMG_MASK) !=
39236c6e A	1633	VM_WIMG_USE_DEFAULT) {
	1634	/*
	1635	* If the page is not cacheable,
	1636	* we can't let its contents
	1637	* linger in the data cache
	1638	* after the decompression.
	1639	*/
	1640	pmap_sync_page_attributes_phys(
39037602	1641	VM_PAGE_GET_PHYS_PAGE(m));
fe8ab488	1642	} else {
d9a64523	1643	m->vmp_written_by_kernel = TRUE;
fe8ab488 A	1644	}
	1645
	1646	/*
	1647	* If the object is purgeable, its
	1648	* owner's purgeable ledgers have been
	1649	* updated in vm_page_insert() but the
	1650	* page was also accounted for in a
	1651	* "compressed purgeable" ledger, so
	1652	* update that now.
	1653	*/
d9a64523	1654	if (((object->purgable !=
0a7de745 A	1655	VM_PURGABLE_DENY) \|\|
0a7de745 A	1656	object->vo_ledger_tag) &&
d9a64523	1657	(object->vo_owner !=
0a7de745	1658	NULL)) {
fe8ab488 A	1659	/*
fe8ab488 A	1660	* One less compressed
d9a64523	1661	* purgeable/tagged page.
fe8ab488	1662	*/
d9a64523	1663	vm_object_owner_compressed_update(
fe8ab488 A	1664	object,
	1665	-1);
	1666	}
	1667
39236c6e A	1668	break;
39236c6e A	1669	case KERN_MEMORY_FAILURE:
d9a64523 A	1670	m->vmp_unusual = TRUE;
	1671	m->vmp_error = TRUE;
	1672	m->vmp_absent = FALSE;
39236c6e A	1673	break;
39236c6e A	1674	case KERN_MEMORY_ERROR:
d9a64523	1675	assert(m->vmp_absent);
39236c6e A	1676	break;
39236c6e A	1677	default:
fe8ab488	1678	panic("vm_fault_page(): unexpected "
0a7de745 A	1679	"error %d from "
	1680	"vm_compressor_pager_get()\n",
	1681	rc);
39236c6e A	1682	}
	1683	PAGE_WAKEUP_DONE(m);
	1684
	1685	rc = KERN_SUCCESS;
	1686	goto data_requested;
	1687	}
	1688	my_fault_type = DBG_PAGEIN_FAULT;
5ba3f43e	1689
2d21ac55	1690	if (m != VM_PAGE_NULL) {
316670eb A	1691	VM_PAGE_FREE(m);
316670eb A	1692	m = VM_PAGE_NULL;
0b4e3aa0	1693	}
1c79356b	1694
1c79356b	1695	#if TRACEFAULTPAGE
0a7de745	1696	dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1c79356b	1697	#endif
2d21ac55	1698
1c79356b	1699	/*
2d21ac55	1700	* It's possible someone called vm_object_destroy while we weren't
5ba3f43e	1701	* holding the object lock. If that has happened, then bail out
2d21ac55	1702	* here.
1c79356b	1703	*/
2d21ac55 A	1704
	1705	pager = object->pager;
	1706
	1707	if (pager == MEMORY_OBJECT_NULL) {
	1708	vm_fault_cleanup(object, first_m);
	1709	thread_interrupt_level(interruptible_state);
	1710	return VM_FAULT_MEMORY_ERROR;
	1711	}
1c79356b A	1712
1c79356b A	1713	/*
2d21ac55 A	1714	* We have an absent page in place for the faulting offset,
2d21ac55 A	1715	* so we can release the object lock.
1c79356b A	1716	*/
1c79356b A	1717
d9a64523	1718	if (object->object_is_shared_cache) {
5ba3f43e A	1719	set_thread_rwlock_boost();
	1720	}
	1721
2d21ac55	1722	vm_object_unlock(object);
1c79356b A	1723
1c79356b A	1724	/*
2d21ac55 A	1725	* If this object uses a copy_call strategy,
	1726	* and we are interested in a copy of this object
	1727	* (having gotten here only by following a
	1728	* shadow chain), then tell the memory manager
	1729	* via a flag added to the desired_access
	1730	* parameter, so that it can detect a race
	1731	* between our walking down the shadow chain
	1732	* and its pushing pages up into a copy of
	1733	* the object that it manages.
1c79356b	1734	*/
0a7de745	1735	if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1c79356b	1736	wants_copy_flag = VM_PROT_WANTS_COPY;
0a7de745	1737	} else {
1c79356b	1738	wants_copy_flag = VM_PROT_NONE;
0a7de745	1739	}
1c79356b	1740
316670eb A	1741	if (object->copy == first_object) {
	1742	/*
	1743	* if we issue the memory_object_data_request in
	1744	* this state, we are subject to a deadlock with
	1745	* the underlying filesystem if it is trying to
	1746	* shrink the file resulting in a push of pages
	1747	* into the copy object... that push will stall
	1748	* on the placeholder page, and if the pushing thread
	1749	* is holding a lock that is required on the pagein
	1750	* path (such as a truncate lock), we'll deadlock...
	1751	* to avoid this potential deadlock, we throw away
	1752	* our placeholder page before calling memory_object_data_request
	1753	* and force this thread to retry the vm_fault_page after
	1754	* we have issued the I/O. the second time through this path
	1755	* we will find the page already in the cache (presumably still
	1756	* busy waiting for the I/O to complete) and then complete
	1757	* the fault w/o having to go through memory_object_data_request again
	1758	*/
	1759	assert(first_m != VM_PAGE_NULL);
39037602	1760	assert(VM_PAGE_OBJECT(first_m) == first_object);
5ba3f43e	1761
316670eb A	1762	vm_object_lock(first_object);
	1763	VM_PAGE_FREE(first_m);
	1764	vm_object_paging_end(first_object);
	1765	vm_object_unlock(first_object);
	1766
	1767	first_m = VM_PAGE_NULL;
	1768	force_fault_retry = TRUE;
	1769
	1770	vm_fault_page_forced_retry++;
	1771	}
	1772
	1773	if (data_already_requested == TRUE) {
	1774	orig_behavior = fault_info->behavior;
	1775	orig_cluster_size = fault_info->cluster_size;
	1776
	1777	fault_info->behavior = VM_BEHAVIOR_RANDOM;
	1778	fault_info->cluster_size = PAGE_SIZE;
	1779	}
2d21ac55 A	1780	/*
	1781	* Call the memory manager to retrieve the data.
	1782	*/
	1783	rc = memory_object_data_request(
	1784	pager,
f427ee49	1785	vm_object_trunc_page(offset) + object->paging_offset,
2d21ac55 A	1786	PAGE_SIZE,
	1787	access_required \| wants_copy_flag,
	1788	(memory_object_fault_info_t)fault_info);
1c79356b	1789
316670eb A	1790	if (data_already_requested == TRUE) {
	1791	fault_info->behavior = orig_behavior;
	1792	fault_info->cluster_size = orig_cluster_size;
0a7de745	1793	} else {
316670eb	1794	data_already_requested = TRUE;
0a7de745	1795	}
316670eb	1796
fe8ab488	1797	DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1c79356b	1798	#if TRACEFAULTPAGE
0a7de745	1799	dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1c79356b	1800	#endif
2d21ac55 A	1801	vm_object_lock(object);
2d21ac55 A	1802
d9a64523	1803	if (object->object_is_shared_cache) {
5ba3f43e A	1804	clear_thread_rwlock_boost();
	1805	}
	1806
0a7de745	1807	data_requested:
1c79356b	1808	if (rc != KERN_SUCCESS) {
1c79356b	1809	vm_fault_cleanup(object, first_m);
9bccf70c	1810	thread_interrupt_level(interruptible_state);
2d21ac55	1811
0a7de745 A	1812	return (rc == MACH_SEND_INTERRUPTED) ?
	1813	VM_FAULT_INTERRUPTED :
	1814	VM_FAULT_MEMORY_ERROR;
b0d623f7 A	1815	} else {
	1816	clock_sec_t tv_sec;
	1817	clock_usec_t tv_usec;
39236c6e A	1818
	1819	if (my_fault_type == DBG_PAGEIN_FAULT) {
	1820	clock_get_system_microtime(&tv_sec, &tv_usec);
	1821	current_thread()->t_page_creation_time = tv_sec;
	1822	current_thread()->t_page_creation_count = 0;
	1823	}
1c79356b	1824	}
6d2010ae	1825	if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1c79356b	1826	vm_fault_cleanup(object, first_m);
9bccf70c	1827	thread_interrupt_level(interruptible_state);
2d21ac55	1828
0a7de745	1829	return VM_FAULT_INTERRUPTED;
1c79356b	1830	}
316670eb	1831	if (force_fault_retry == TRUE) {
316670eb A	1832	vm_fault_cleanup(object, first_m);
	1833	thread_interrupt_level(interruptible_state);
	1834
0a7de745	1835	return VM_FAULT_RETRY;
316670eb	1836	}
2d21ac55	1837	if (m == VM_PAGE_NULL && object->phys_contiguous) {
91447636 A	1838	/*
91447636 A	1839	* No page here means that the object we
5ba3f43e	1840	* initially looked up was "physically
91447636 A	1841	* contiguous" (i.e. device memory). However,
	1842	* with Virtual VRAM, the object might not
	1843	* be backed by that device memory anymore,
	1844	* so we're done here only if the object is
	1845	* still "phys_contiguous".
	1846	* Otherwise, if the object is no longer
	1847	* "phys_contiguous", we need to retry the
	1848	* page fault against the object's new backing
	1849	* store (different memory object).
	1850	*/
0a7de745	1851	phys_contig_object:
b0d623f7	1852	goto done;
91447636	1853	}
2d21ac55 A	1854	/*
	1855	* potentially a pagein fault
	1856	* if we make it through the state checks
	1857	* above, then we'll count it as such
	1858	*/
39236c6e	1859	my_fault = my_fault_type;
91447636 A	1860
	1861	/*
	1862	* Retry with same object/offset, since new data may
	1863	* be in a different page (i.e., m is meaningless at
	1864	* this point).
	1865	*/
1c79356b A	1866	continue;
1c79356b A	1867	}
316670eb	1868	dont_look_for_page:
1c79356b	1869	/*
5ba3f43e	1870	* We get here if the object has no pager, or an existence map
2d21ac55 A	1871	* exists and indicates the page isn't present on the pager
2d21ac55 A	1872	* or we're unwiring a page. If a pager exists, but there
d9a64523	1873	* is no existence map, then the m->vmp_absent case above handles
2d21ac55	1874	* the ZF case when the pager can't provide the page
1c79356b A	1875	*/
1c79356b A	1876	#if TRACEFAULTPAGE
0a7de745	1877	dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1c79356b	1878	#endif
0a7de745	1879	if (object == first_object) {
1c79356b	1880	first_m = m;
0a7de745	1881	} else {
1c79356b	1882	assert(m == VM_PAGE_NULL);
0a7de745	1883	}
1c79356b	1884
1c79356b	1885	next_object = object->shadow;
2d21ac55	1886
1c79356b	1887	if (next_object == VM_OBJECT_NULL) {
1c79356b	1888	/*
2d21ac55 A	1889	* we've hit the bottom of the shadow chain,
2d21ac55 A	1890	* fill the page in the top object with zeros.
1c79356b	1891	*/
2d21ac55	1892	assert(!must_be_resident);
1c79356b A	1893
	1894	if (object != first_object) {
	1895	vm_object_paging_end(object);
	1896	vm_object_unlock(object);
	1897
	1898	object = first_object;
	1899	offset = first_offset;
	1900	vm_object_lock(object);
	1901	}
1c79356b	1902	m = first_m;
39037602	1903	assert(VM_PAGE_OBJECT(m) == object);
1c79356b A	1904	first_m = VM_PAGE_NULL;
1c79356b A	1905
55e303ae	1906	/*
2d21ac55 A	1907	* check for any conditions that prevent
2d21ac55 A	1908	* us from creating a new zero-fill page
5ba3f43e	1909	* vm_fault_check will do all of the
2d21ac55 A	1910	* fault cleanup in the case of an error condition
2d21ac55 A	1911	* including resetting the thread_interrupt_level
55e303ae	1912	*/
04b8595b	1913	error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
55e303ae	1914
0a7de745 A	1915	if (error != VM_FAULT_SUCCESS) {
	1916	return error;
	1917	}
55e303ae	1918
2d21ac55	1919	if (m == VM_PAGE_NULL) {
39037602	1920	m = vm_page_grab_options(grab_options);
1c79356b	1921
2d21ac55 A	1922	if (m == VM_PAGE_NULL) {
	1923	vm_fault_cleanup(object, VM_PAGE_NULL);
	1924	thread_interrupt_level(interruptible_state);
55e303ae	1925
0a7de745	1926	return VM_FAULT_MEMORY_SHORTAGE;
2d21ac55	1927	}
f427ee49	1928	vm_page_insert(m, object, vm_object_trunc_page(offset));
0b4e3aa0	1929	}
0a7de745	1930	if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
d9a64523	1931	m->vmp_absent = TRUE;
0a7de745	1932	}
fe8ab488 A	1933
	1934	my_fault = vm_fault_zero_page(m, no_zero_fill);
	1935
1c79356b	1936	break;
2d21ac55	1937	} else {
0a7de745	1938	/*
2d21ac55 A	1939	* Move on to the next object. Lock the next
	1940	* object before unlocking the current one.
	1941	*/
0a7de745	1942	if ((object != first_object) \|\| must_be_resident) {
1c79356b	1943	vm_object_paging_end(object);
0a7de745	1944	}
2d21ac55	1945
6d2010ae A	1946	offset += object->vo_shadow_offset;
	1947	fault_info->lo_offset += object->vo_shadow_offset;
	1948	fault_info->hi_offset += object->vo_shadow_offset;
1c79356b	1949	access_required = VM_PROT_READ;
2d21ac55	1950
1c79356b A	1951	vm_object_lock(next_object);
1c79356b A	1952	vm_object_unlock(object);
2d21ac55	1953
1c79356b A	1954	object = next_object;
	1955	vm_object_paging_begin(object);
	1956	}
	1957	}
	1958
	1959	/*
	1960	* PAGE HAS BEEN FOUND.
	1961	*
	1962	* This page (m) is:
	1963	* busy, so that we can play with it;
	1964	* not absent, so that nobody else will fill it;
	1965	* possibly eligible for pageout;
	1966	*
	1967	* The top-level page (first_m) is:
	1968	* VM_PAGE_NULL if the page was found in the
	1969	* top-level object;
	1970	* busy, not absent, and ineligible for pageout.
	1971	*
	1972	* The current object (object) is locked. A paging
	1973	* reference is held for the current and top-level
	1974	* objects.
	1975	*/
	1976
	1977	#if TRACEFAULTPAGE
0a7de745	1978	dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1c79356b	1979	#endif
0a7de745	1980	#if EXTRA_ASSERTIONS
d9a64523	1981	assert(m->vmp_busy && !m->vmp_absent);
b0d623f7	1982	assert((first_m == VM_PAGE_NULL) \|\|
0a7de745 A	1983	(first_m->vmp_busy && !first_m->vmp_absent &&
	1984	!first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
	1985	#endif /* EXTRA_ASSERTIONS */
1c79356b	1986
1c79356b	1987	/*
2d21ac55 A	1988	* If the page is being written, but isn't
	1989	* already owned by the top-level object,
	1990	* we have to copy it into a new page owned
	1991	* by the top-level object.
1c79356b	1992	*/
b0d623f7	1993	if (object != first_object) {
1c79356b	1994	#if TRACEFAULTPAGE
0a7de745	1995	dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1c79356b	1996	#endif
0a7de745	1997	if (fault_type & VM_PROT_WRITE) {
1c79356b A	1998	vm_page_t copy_m;
1c79356b A	1999
2d21ac55 A	2000	/*
	2001	* We only really need to copy if we
	2002	* want to write it.
	2003	*/
1c79356b A	2004	assert(!must_be_resident);
	2005
	2006	/*
2d21ac55 A	2007	* If we try to collapse first_object at this
	2008	* point, we may deadlock when we try to get
	2009	* the lock on an intermediate object (since we
	2010	* have the bottom object locked). We can't
	2011	* unlock the bottom object, because the page
	2012	* we found may move (by collapse) if we do.
1c79356b	2013	*
2d21ac55 A	2014	* Instead, we first copy the page. Then, when
	2015	* we have no more use for the bottom object,
	2016	* we unlock it and try to collapse.
1c79356b	2017	*
2d21ac55 A	2018	* Note that we copy the page even if we didn't
2d21ac55 A	2019	* need to... that's the breaks.
1c79356b A	2020	*/
	2021
	2022	/*
2d21ac55	2023	* Allocate a page for the copy
1c79356b	2024	*/
39037602	2025	copy_m = vm_page_grab_options(grab_options);
2d21ac55	2026
1c79356b A	2027	if (copy_m == VM_PAGE_NULL) {
1c79356b A	2028	RELEASE_PAGE(m);
2d21ac55	2029
1c79356b	2030	vm_fault_cleanup(object, first_m);
9bccf70c	2031	thread_interrupt_level(interruptible_state);
1c79356b	2032
0a7de745	2033	return VM_FAULT_MEMORY_SHORTAGE;
2d21ac55	2034	}
2d21ac55	2035
1c79356b A	2036	vm_page_copy(m, copy_m);
	2037
	2038	/*
2d21ac55 A	2039	* If another map is truly sharing this
	2040	* page with us, we have to flush all
	2041	* uses of the original page, since we
	2042	* can't distinguish those which want the
	2043	* original from those which need the
	2044	* new copy.
1c79356b	2045	*
2d21ac55 A	2046	* XXXO If we know that only one map has
	2047	* access to this page, then we could
	2048	* avoid the pmap_disconnect() call.
1c79356b	2049	*/
0a7de745 A	2050	if (m->vmp_pmapped) {
	2051	pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
	2052	}
1c79356b	2053
d9a64523	2054	if (m->vmp_clustered) {
fe8ab488 A	2055	VM_PAGE_COUNT_AS_PAGEIN(m);
	2056	VM_PAGE_CONSUME_CLUSTERED(m);
	2057	}
d9a64523	2058	assert(!m->vmp_cleaning);
1c79356b A	2059
1c79356b A	2060	/*
2d21ac55	2061	* We no longer need the old page or object.
1c79356b	2062	*/
39236c6e A	2063	RELEASE_PAGE(m);
39236c6e A	2064
39037602 A	2065	/*
	2066	* This check helps with marking the object as having a sequential pattern
	2067	* Normally we'll miss doing this below because this fault is about COW to
	2068	* the first_object i.e. bring page in from disk, push to object above but
	2069	* don't update the file object's sequential pattern.
	2070	*/
0a7de745	2071	if (object->internal == FALSE) {
39037602 A	2072	vm_fault_is_sequential(object, offset, fault_info->behavior);
	2073	}
	2074
1c79356b A	2075	vm_object_paging_end(object);
	2076	vm_object_unlock(object);
	2077
2d21ac55	2078	my_fault = DBG_COW_FAULT;
c3c9b80d	2079	counter_inc(&vm_statistics_cow_faults);
2d21ac55	2080	DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1c79356b	2081	current_task()->cow_faults++;
2d21ac55	2082
1c79356b A	2083	object = first_object;
	2084	offset = first_offset;
	2085
	2086	vm_object_lock(object);
2d21ac55 A	2087	/*
	2088	* get rid of the place holder
	2089	* page that we soldered in earlier
	2090	*/
1c79356b A	2091	VM_PAGE_FREE(first_m);
1c79356b A	2092	first_m = VM_PAGE_NULL;
5ba3f43e	2093
2d21ac55 A	2094	/*
	2095	* and replace it with the
	2096	* page we just copied into
	2097	*/
d9a64523	2098	assert(copy_m->vmp_busy);
f427ee49	2099	vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
316670eb	2100	SET_PAGE_DIRTY(copy_m, TRUE);
1c79356b	2101
2d21ac55	2102	m = copy_m;
1c79356b	2103	/*
2d21ac55 A	2104	* Now that we've gotten the copy out of the
	2105	* way, let's try to collapse the top object.
	2106	* But we have to play ugly games with
	2107	* paging_in_progress to do that...
5ba3f43e A	2108	*/
5ba3f43e A	2109	vm_object_paging_end(object);
f427ee49	2110	vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
1c79356b	2111	vm_object_paging_begin(object);
0a7de745 A	2112	} else {
	2113	*protection &= (~VM_PROT_WRITE);
	2114	}
1c79356b	2115	}
1c79356b	2116	/*
2d21ac55 A	2117	* Now check whether the page needs to be pushed into the
	2118	* copy object. The use of asymmetric copy on write for
	2119	* shared temporary objects means that we may do two copies to
	2120	* satisfy the fault; one above to get the page from a
	2121	* shadowed object, and one here to push it into the copy.
1c79356b	2122	*/
2d21ac55	2123	try_failed_count = 0;
1c79356b	2124
b0d623f7	2125	while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
0a7de745 A	2126	vm_object_offset_t copy_offset;
0a7de745 A	2127	vm_page_t copy_m;
1c79356b A	2128
1c79356b A	2129	#if TRACEFAULTPAGE
0a7de745	2130	dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1c79356b A	2131	#endif
1c79356b A	2132	/*
2d21ac55 A	2133	* If the page is being written, but hasn't been
2d21ac55 A	2134	* copied to the copy-object, we have to copy it there.
1c79356b	2135	*/
1c79356b A	2136	if ((fault_type & VM_PROT_WRITE) == 0) {
	2137	*protection &= ~VM_PROT_WRITE;
	2138	break;
	2139	}
	2140
	2141	/*
2d21ac55 A	2142	* If the page was guaranteed to be resident,
2d21ac55 A	2143	* we must have already performed the copy.
1c79356b	2144	*/
0a7de745	2145	if (must_be_resident) {
1c79356b	2146	break;
0a7de745	2147	}
1c79356b A	2148
1c79356b A	2149	/*
2d21ac55	2150	* Try to get the lock on the copy_object.
1c79356b A	2151	*/
1c79356b A	2152	if (!vm_object_lock_try(copy_object)) {
2d21ac55 A	2153	vm_object_unlock(object);
2d21ac55 A	2154	try_failed_count++;
1c79356b	2155
0a7de745	2156	mutex_pause(try_failed_count); /* wait a bit */
1c79356b	2157	vm_object_lock(object);
2d21ac55	2158
1c79356b A	2159	continue;
1c79356b A	2160	}
2d21ac55	2161	try_failed_count = 0;
1c79356b A	2162
1c79356b A	2163	/*
2d21ac55 A	2164	* Make another reference to the copy-object,
	2165	* to keep it from disappearing during the
	2166	* copy.
1c79356b	2167	*/
2d21ac55	2168	vm_object_reference_locked(copy_object);
1c79356b A	2169
1c79356b A	2170	/*
2d21ac55	2171	* Does the page exist in the copy?
1c79356b	2172	*/
6d2010ae	2173	copy_offset = first_offset - copy_object->vo_shadow_offset;
f427ee49	2174	copy_offset = vm_object_trunc_page(copy_offset);
2d21ac55	2175
0a7de745	2176	if (copy_object->vo_size <= copy_offset) {
1c79356b A	2177	/*
	2178	* Copy object doesn't cover this page -- do nothing.
	2179	*/
	2180	;
0a7de745	2181	} else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2d21ac55 A	2182	/*
	2183	* Page currently exists in the copy object
	2184	*/
d9a64523	2185	if (copy_m->vmp_busy) {
1c79356b	2186	/*
2d21ac55 A	2187	* If the page is being brought
2d21ac55 A	2188	* in, wait for it and then retry.
1c79356b A	2189	*/
1c79356b A	2190	RELEASE_PAGE(m);
2d21ac55 A	2191
	2192	/*
	2193	* take an extra ref so object won't die
	2194	*/
	2195	vm_object_reference_locked(copy_object);
1c79356b A	2196	vm_object_unlock(copy_object);
1c79356b A	2197	vm_fault_cleanup(object, first_m);
2d21ac55	2198
1c79356b A	2199	vm_object_lock(copy_object);
1c79356b A	2200	assert(copy_object->ref_count > 0);
2d21ac55	2201	vm_object_lock_assert_exclusive(copy_object);
1c79356b A	2202	copy_object->ref_count--;
	2203	assert(copy_object->ref_count > 0);
	2204	copy_m = vm_page_lookup(copy_object, copy_offset);
5ba3f43e	2205
d9a64523	2206	if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
1c79356b	2207	PAGE_ASSERT_WAIT(copy_m, interruptible);
2d21ac55	2208
1c79356b	2209	vm_object_unlock(copy_object);
9bccf70c	2210	wait_result = thread_block(THREAD_CONTINUE_NULL);
1c79356b	2211	vm_object_deallocate(copy_object);
2d21ac55	2212
1c79356b A	2213	goto backoff;
	2214	} else {
	2215	vm_object_unlock(copy_object);
	2216	vm_object_deallocate(copy_object);
9bccf70c	2217	thread_interrupt_level(interruptible_state);
2d21ac55	2218
0a7de745	2219	return VM_FAULT_RETRY;
1c79356b A	2220	}
1c79356b A	2221	}
0a7de745	2222	} else if (!PAGED_OUT(copy_object, copy_offset)) {
1c79356b A	2223	/*
	2224	* If PAGED_OUT is TRUE, then the page used to exist
	2225	* in the copy-object, and has already been paged out.
	2226	* We don't need to repeat this. If PAGED_OUT is
	2227	* FALSE, then either we don't know (!pager_created,
	2228	* for example) or it hasn't been paged out.
	2229	* (VM_EXTERNAL_STATE_UNKNOWN\|\|VM_EXTERNAL_STATE_ABSENT)
	2230	* We must copy the page to the copy object.
d9a64523	2231	*
2d21ac55	2232	* Allocate a page for the copy
1c79356b A	2233	*/
1c79356b A	2234	copy_m = vm_page_alloc(copy_object, copy_offset);
2d21ac55	2235
1c79356b A	2236	if (copy_m == VM_PAGE_NULL) {
1c79356b A	2237	RELEASE_PAGE(m);
2d21ac55	2238
2d21ac55	2239	vm_object_lock_assert_exclusive(copy_object);
1c79356b A	2240	copy_object->ref_count--;
1c79356b A	2241	assert(copy_object->ref_count > 0);
2d21ac55	2242
1c79356b A	2243	vm_object_unlock(copy_object);
1c79356b A	2244	vm_fault_cleanup(object, first_m);
9bccf70c	2245	thread_interrupt_level(interruptible_state);
1c79356b	2246
0a7de745	2247	return VM_FAULT_MEMORY_SHORTAGE;
2d21ac55	2248	}
1c79356b	2249	/*
2d21ac55	2250	* Must copy page into copy-object.
1c79356b	2251	*/
1c79356b	2252	vm_page_copy(m, copy_m);
5ba3f43e	2253
1c79356b	2254	/*
2d21ac55 A	2255	* If the old page was in use by any users
	2256	* of the copy-object, it must be removed
	2257	* from all pmaps. (We can't know which
	2258	* pmaps use it.)
1c79356b	2259	*/
0a7de745 A	2260	if (m->vmp_pmapped) {
	2261	pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
	2262	}
1c79356b	2263
d9a64523	2264	if (m->vmp_clustered) {
fe8ab488 A	2265	VM_PAGE_COUNT_AS_PAGEIN(m);
	2266	VM_PAGE_CONSUME_CLUSTERED(m);
	2267	}
1c79356b	2268	/*
2d21ac55 A	2269	* If there's a pager, then immediately
	2270	* page out this page, using the "initialize"
	2271	* option. Else, we use the copy.
1c79356b	2272	*/
0a7de745	2273	if ((!copy_object->pager_ready)
39236c6e	2274	\|\| VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
0a7de745	2275	) {
2d21ac55	2276	vm_page_lockspin_queues();
d9a64523	2277	assert(!m->vmp_cleaning);
1c79356b A	2278	vm_page_activate(copy_m);
1c79356b A	2279	vm_page_unlock_queues();
2d21ac55	2280
316670eb	2281	SET_PAGE_DIRTY(copy_m, TRUE);
1c79356b	2282	PAGE_WAKEUP_DONE(copy_m);
316670eb	2283	} else {
d9a64523 A	2284	assert(copy_m->vmp_busy == TRUE);
d9a64523 A	2285	assert(!m->vmp_cleaning);
1c79356b A	2286
1c79356b A	2287	/*
2d21ac55	2288	* dirty is protected by the object lock
1c79356b	2289	*/
316670eb	2290	SET_PAGE_DIRTY(copy_m, TRUE);
1c79356b	2291
2d21ac55 A	2292	/*
	2293	* The page is already ready for pageout:
	2294	* not on pageout queues and busy.
	2295	* Unlock everything except the
	2296	* copy_object itself.
	2297	*/
1c79356b A	2298	vm_object_unlock(object);
	2299
	2300	/*
2d21ac55 A	2301	* Write the page to the copy-object,
2d21ac55 A	2302	* flushing it from the kernel.
1c79356b	2303	*/
1c79356b A	2304	vm_pageout_initialize_page(copy_m);
	2305
	2306	/*
2d21ac55 A	2307	* Since the pageout may have
	2308	* temporarily dropped the
	2309	* copy_object's lock, we
	2310	* check whether we'll have
	2311	* to deallocate the hard way.
1c79356b	2312	*/
2d21ac55	2313	if ((copy_object->shadow != object) \|\| (copy_object->ref_count == 1)) {
1c79356b A	2314	vm_object_unlock(copy_object);
	2315	vm_object_deallocate(copy_object);
	2316	vm_object_lock(object);
2d21ac55	2317
1c79356b A	2318	continue;
1c79356b A	2319	}
1c79356b	2320	/*
2d21ac55 A	2321	* Pick back up the old object's
	2322	* lock. [It is safe to do so,
	2323	* since it must be deeper in the
	2324	* object tree.]
1c79356b	2325	*/
1c79356b A	2326	vm_object_lock(object);
1c79356b A	2327	}
316670eb	2328
1c79356b	2329	/*
2d21ac55 A	2330	* Because we're pushing a page upward
	2331	* in the object tree, we must restart
	2332	* any faults that are waiting here.
	2333	* [Note that this is an expansion of
	2334	* PAGE_WAKEUP that uses the THREAD_RESTART
	2335	* wait result]. Can't turn off the page's
	2336	* busy bit because we're not done with it.
1c79356b	2337	*/
d9a64523 A	2338	if (m->vmp_wanted) {
d9a64523 A	2339	m->vmp_wanted = FALSE;
2d21ac55	2340	thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1c79356b A	2341	}
1c79356b A	2342	}
1c79356b	2343	/*
2d21ac55 A	2344	* The reference count on copy_object must be
	2345	* at least 2: one for our extra reference,
	2346	* and at least one from the outside world
	2347	* (we checked that when we last locked
	2348	* copy_object).
1c79356b	2349	*/
2d21ac55	2350	vm_object_lock_assert_exclusive(copy_object);
1c79356b A	2351	copy_object->ref_count--;
1c79356b A	2352	assert(copy_object->ref_count > 0);
2d21ac55	2353
1c79356b A	2354	vm_object_unlock(copy_object);
	2355
	2356	break;
	2357	}
b0d623f7 A	2358
b0d623f7 A	2359	done:
1c79356b A	2360	*result_page = m;
	2361	*top_page = first_m;
	2362
2d21ac55	2363	if (m != VM_PAGE_NULL) {
39037602 A	2364	assert(VM_PAGE_OBJECT(m) == object);
39037602 A	2365
b0d623f7	2366	retval = VM_FAULT_SUCCESS;
fe8ab488	2367
2d21ac55	2368	if (my_fault == DBG_PAGEIN_FAULT) {
fe8ab488	2369	VM_PAGE_COUNT_AS_PAGEIN(m);
2d21ac55	2370
0a7de745	2371	if (object->internal) {
b0d623f7	2372	my_fault = DBG_PAGEIND_FAULT;
0a7de745	2373	} else {
b0d623f7	2374	my_fault = DBG_PAGEINV_FAULT;
0a7de745	2375	}
2d21ac55	2376
0a7de745	2377	/*
2d21ac55 A	2378	* evaluate access pattern and update state
	2379	* vm_fault_deactivate_behind depends on the
	2380	* state being up to date
	2381	*/
0a7de745	2382	vm_fault_is_sequential(object, offset, fault_info->behavior);
d9a64523	2383	vm_fault_deactivate_behind(object, offset, fault_info->behavior);
d9a64523	2384	} else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
0a7de745	2385	/*
d9a64523 A	2386	* we weren't called from vm_fault, so handle the
	2387	* accounting here for hits in the cache
	2388	*/
0a7de745 A	2389	if (m->vmp_clustered) {
0a7de745 A	2390	VM_PAGE_COUNT_AS_PAGEIN(m);
d9a64523 A	2391	VM_PAGE_CONSUME_CLUSTERED(m);
d9a64523 A	2392	}
0a7de745	2393	vm_fault_is_sequential(object, offset, fault_info->behavior);
2d21ac55	2394	vm_fault_deactivate_behind(object, offset, fault_info->behavior);
39236c6e	2395	} else if (my_fault == DBG_COMPRESSOR_FAULT \|\| my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
cb323159	2396	VM_STAT_DECOMPRESSIONS();
2d21ac55	2397	}
0a7de745 A	2398	if (type_of_fault) {
	2399	*type_of_fault = my_fault;
	2400	}
b0d623f7 A	2401	} else {
	2402	retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
	2403	assert(first_m == VM_PAGE_NULL);
	2404	assert(object == first_object);
	2405	}
2d21ac55	2406
55e303ae A	2407	thread_interrupt_level(interruptible_state);
55e303ae A	2408
1c79356b	2409	#if TRACEFAULTPAGE
0a7de745	2410	dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1c79356b	2411	#endif
b0d623f7	2412	return retval;
1c79356b	2413
2d21ac55	2414	backoff:
9bccf70c	2415	thread_interrupt_level(interruptible_state);
2d21ac55	2416
0a7de745 A	2417	if (wait_result == THREAD_INTERRUPTED) {
	2418	return VM_FAULT_INTERRUPTED;
	2419	}
	2420	return VM_FAULT_RETRY;
1c79356b	2421
0a7de745	2422	#undef RELEASE_PAGE
1c79356b A	2423	}
1c79356b A	2424
2d21ac55	2425
/*
 * Symbols declared here for the code-signing fault path; the extern
 * ones are defined in other translation units.
 */
extern int panic_on_cs_killed;
extern int proc_selfpid(void);
/* Takes an opaque pointer and returns a character pointer. */
extern char *proc_name_address(void *p);

/*
 * Counters, zero-initialized.
 * NOTE(review): presumably they count tainted pages that were rejected
 * vs. accepted when entered into a pmap -- confirm against their users.
 */
unsigned long cs_enter_tainted_rejected = 0;
unsigned long cs_enter_tainted_accepted = 0;
2d21ac55	2431
593a1d5f A	2432	/*
	2433	* CODE SIGNING:
	2434	* When soft faulting a page, we have to validate the page if:
	2435	* 1. the page is being mapped in user space
	2436	* 2. the page hasn't already been found to be "tainted"
	2437	* 3. the page belongs to a code-signed object
	2438	* 4. the page has not been validated yet or has been mapped for write.
	2439	*/
f427ee49 A	2440	static bool
	2441	vm_fault_cs_need_validation(
	2442	pmap_t pmap,
	2443	vm_page_t page,
	2444	vm_object_t page_obj,
	2445	vm_map_size_t fault_page_size,
	2446	vm_map_offset_t fault_phys_offset)
55e303ae	2447	{
5ba3f43e	2448	if (pmap == kernel_pmap) {
f427ee49 A	2449	/* 1 - not user space */
f427ee49 A	2450	return false;
5ba3f43e	2451	}
f427ee49 A	2452	if (!page_obj->code_signed) {
	2453	/* 3 - page does not belong to a code-signed object */
	2454	return false;
2d21ac55	2455	}
f427ee49 A	2456	if (fault_page_size == PAGE_SIZE) {
	2457	/* looking at the whole page */
	2458	assertf(fault_phys_offset == 0,
	2459	"fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
	2460	(uint64_t)fault_page_size,
	2461	(uint64_t)fault_phys_offset);
	2462	if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
	2463	/* 2 - page is all tainted */
	2464	return false;
5ba3f43e	2465	}
f427ee49 A	2466	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
	2467	!page->vmp_wpmapped) {
	2468	/* 4 - already fully validated and never mapped writable */
	2469	return false;
	2470	}
	2471	} else {
	2472	/* looking at a specific sub-page */
	2473	if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
	2474	/* 2 - sub-page was already marked as tainted */
	2475	return false;
	2476	}
	2477	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
	2478	!page->vmp_wpmapped) {
	2479	/* 4 - already validated and never mapped writable */
	2480	return false;
2d21ac55	2481	}
6d2010ae	2482	}
f427ee49 A	2483	/* page needs to be validated */
	2484	return true;
	2485	}
2d21ac55	2486
2d21ac55	2487
f427ee49 A	2488	static bool
	2489	vm_fault_cs_page_immutable(
	2490	vm_page_t m,
	2491	vm_map_size_t fault_page_size,
	2492	vm_map_offset_t fault_phys_offset,
	2493	vm_prot_t prot __unused)
	2494	{
	2495	if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
	2496	/&& ((prot) & VM_PROT_EXECUTE)/) {
	2497	return true;
2d21ac55	2498	}
f427ee49 A	2499	return false;
f427ee49 A	2500	}
2d21ac55	2501
f427ee49 A	2502	static bool
	2503	vm_fault_cs_page_nx(
	2504	vm_page_t m,
	2505	vm_map_size_t fault_page_size,
	2506	vm_map_offset_t fault_phys_offset)
	2507	{
	2508	return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
	2509	}
	2510
	2511	/*
	2512	* Check if the page being entered into the pmap violates code signing.
	2513	*/
	2514	static kern_return_t
	2515	vm_fault_cs_check_violation(
	2516	bool cs_bypass,
	2517	vm_object_t object,
	2518	vm_page_t m,
	2519	pmap_t pmap,
	2520	vm_prot_t prot,
	2521	vm_prot_t caller_prot,
	2522	vm_map_size_t fault_page_size,
	2523	vm_map_offset_t fault_phys_offset,
	2524	vm_object_fault_info_t fault_info,
	2525	bool map_is_switched,
	2526	bool map_is_switch_protected,
	2527	bool *cs_violation)
	2528	{
	2529	#if !PMAP_CS
	2530	#pragma unused(caller_prot)
	2531	#pragma unused(fault_info)
	2532	#endif /* !PMAP_CS */
	2533	int cs_enforcement_enabled;
d9a64523	2534	if (!cs_bypass &&
f427ee49 A	2535	vm_fault_cs_need_validation(pmap, m, object,
f427ee49 A	2536	fault_page_size, fault_phys_offset)) {
39037602	2537	vm_object_lock_assert_exclusive(object);
593a1d5f	2538
f427ee49	2539	if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
593a1d5f A	2540	vm_cs_revalidates++;
	2541	}
	2542
5ba3f43e	2543	/* VM map is locked, so 1 ref will remain on VM object -
b0d623f7	2544	* so no harm if vm_page_validate_cs drops the object lock */
d9a64523	2545
f427ee49	2546	vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
593a1d5f A	2547	}
593a1d5f A	2548
b0d623f7	2549	/* If the map is switched, and is switch-protected, we must protect
5ba3f43e	2550	* some pages from being write-faulted: immutable pages because by
b0d623f7 A	2551	* definition they may not be written, and executable pages because that
	2552	* would provide a way to inject unsigned code.
	2553	* If the page is immutable, we can simply return. However, we can't
	2554	* immediately determine whether a page is executable anywhere. But,
	2555	* we can disconnect it everywhere and remove the executable protection
5ba3f43e	2556	* from the current map. We do that below right before we do the
b0d623f7 A	2557	* PMAP_ENTER.
b0d623f7 A	2558	*/
eb6b6ca3	2559	if (pmap == kernel_pmap) {
f427ee49	2560	/* kernel fault: cs_enforcement does not apply */
eb6b6ca3 A	2561	cs_enforcement_enabled = 0;
eb6b6ca3 A	2562	} else {
f427ee49	2563	cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
eb6b6ca3	2564	}
39236c6e	2565
0a7de745	2566	if (cs_enforcement_enabled && map_is_switched &&
f427ee49 A	2567	map_is_switch_protected &&
f427ee49 A	2568	vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
0a7de745	2569	(prot & VM_PROT_WRITE)) {
b0d623f7 A	2570	return KERN_CODESIGN_ERROR;
	2571	}
	2572
f427ee49 A	2573	if (cs_enforcement_enabled &&
	2574	vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
	2575	(prot & VM_PROT_EXECUTE)) {
0a7de745	2576	if (cs_debug) {
c18c124e	2577	printf("page marked to be NX, not letting it be mapped EXEC\n");
0a7de745	2578	}
c18c124e A	2579	return KERN_CODESIGN_ERROR;
	2580	}
	2581
b0d623f7 A	2582	/* A page could be tainted, or pose a risk of being tainted later.
	2583	* Check whether the receiving process wants it, and make it feel
	2584	* the consequences (that hapens in cs_invalid_page()).
5ba3f43e A	2585	* For CS Enforcement, two other conditions will
5ba3f43e A	2586	* cause that page to be tainted as well:
b0d623f7 A	2587	* - pmapping an unsigned page executable - this means unsigned code;
	2588	* - writeable mapping of a validated page - the content of that page
	2589	* can be changed without the kernel noticing, therefore unsigned
	2590	* code can be created
	2591	*/
d9a64523 A	2592	if (cs_bypass) {
d9a64523 A	2593	/* code-signing is bypassed */
f427ee49 A	2594	*cs_violation = FALSE;
f427ee49 A	2595	} else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
d9a64523	2596	/* tainted page */
f427ee49	2597	*cs_violation = TRUE;
d9a64523 A	2598	} else if (!cs_enforcement_enabled) {
d9a64523 A	2599	/* no further code-signing enforcement */
f427ee49 A	2600	*cs_violation = FALSE;
f427ee49 A	2601	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
0a7de745 A	2602	((prot & VM_PROT_WRITE) \|\|
0a7de745 A	2603	m->vmp_wpmapped)) {
d9a64523 A	2604	/*
	2605	* The page should be immutable, but is in danger of being
	2606	* modified.
	2607	* This is the case where we want policy from the code
	2608	* directory - is the page immutable or not? For now we have
	2609	* to assume that code pages will be immutable, data pages not.
	2610	* We'll assume a page is a code page if it has a code directory
	2611	* and we fault for execution.
	2612	* That is good enough since if we faulted the code page for
	2613	* writing in another map before, it is wpmapped; if we fault
	2614	* it for writing in this map later it will also be faulted for
	2615	* executing at the same time; and if we fault for writing in
	2616	* another map later, we will disconnect it from this pmap so
	2617	* we'll notice the change.
	2618	*/
f427ee49 A	2619	*cs_violation = TRUE;
f427ee49 A	2620	} else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
0a7de745	2621	(prot & VM_PROT_EXECUTE)
0a7de745	2622	) {
f427ee49	2623	*cs_violation = TRUE;
d9a64523	2624	} else {
f427ee49	2625	*cs_violation = FALSE;
d9a64523	2626	}
f427ee49 A	2627	return KERN_SUCCESS;
f427ee49 A	2628	}
d9a64523	2629
f427ee49 A	2630	/*
	2631	* Handles a code signing violation by either rejecting the page or forcing a disconnect.
	2632	* @param must_disconnect This value will be set to true if the caller must disconnect
	2633	* this page.
	2634	* @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
	2635	*/
	2636	static kern_return_t
	2637	vm_fault_cs_handle_violation(
	2638	vm_object_t object,
	2639	vm_page_t m,
	2640	pmap_t pmap,
	2641	vm_prot_t prot,
	2642	vm_map_offset_t vaddr,
	2643	vm_map_size_t fault_page_size,
	2644	vm_map_offset_t fault_phys_offset,
	2645	bool map_is_switched,
	2646	bool map_is_switch_protected,
	2647	bool *must_disconnect)
	2648	{
	2649	#if !MACH_ASSERT
	2650	#pragma unused(pmap)
	2651	#pragma unused(map_is_switch_protected)
	2652	#endif /* !MACH_ASSERT */
	2653	/*
	2654	* We will have a tainted page. Have to handle the special case
	2655	* of a switched map now. If the map is not switched, standard
	2656	* procedure applies - call cs_invalid_page().
	2657	* If the map is switched, the real owner is invalid already.
	2658	* There is no point in invalidating the switching process since
	2659	* it will not be executing from the map. So we don't call
	2660	* cs_invalid_page() in that case.
	2661	*/
	2662	boolean_t reject_page, cs_killed;
	2663	kern_return_t kr;
	2664	if (map_is_switched) {
	2665	assert(pmap == vm_map_pmap(current_thread()->map));
	2666	assert(!(prot & VM_PROT_WRITE) \|\| (map_is_switch_protected == FALSE));
	2667	reject_page = FALSE;
	2668	} else {
	2669	if (cs_debug > 5) {
	2670	printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
	2671	object->code_signed ? "yes" : "no",
	2672	VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
	2673	VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
	2674	m->vmp_wpmapped ? "yes" : "no",
	2675	(int)prot);
b0d623f7	2676	}
f427ee49 A	2677	reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
f427ee49 A	2678	}
5ba3f43e	2679
f427ee49 A	2680	if (reject_page) {
	2681	/* reject the invalid page: abort the page fault */
	2682	int pid;
	2683	const char *procname;
	2684	task_t task;
	2685	vm_object_t file_object, shadow;
	2686	vm_object_offset_t file_offset;
	2687	char pathname, filename;
	2688	vm_size_t pathname_len, filename_len;
	2689	boolean_t truncated_path;
15129b1c	2690	#define __PATH_MAX 1024
f427ee49 A	2691	struct timespec mtime, cs_mtime;
	2692	int shadow_depth;
	2693	os_reason_t codesigning_exit_reason = OS_REASON_NULL;
	2694
	2695	kr = KERN_CODESIGN_ERROR;
	2696	cs_enter_tainted_rejected++;
	2697
	2698	/* get process name and pid */
	2699	procname = "?";
	2700	task = current_task();
	2701	pid = proc_selfpid();
	2702	if (task->bsd_info != NULL) {
	2703	procname = proc_name_address(task->bsd_info);
	2704	}
	2705
	2706	/* get file's VM object */
	2707	file_object = object;
	2708	file_offset = m->vmp_offset;
	2709	for (shadow = file_object->shadow,
	2710	shadow_depth = 0;
	2711	shadow != VM_OBJECT_NULL;
	2712	shadow = file_object->shadow,
	2713	shadow_depth++) {
	2714	vm_object_lock_shared(shadow);
	2715	if (file_object != object) {
	2716	vm_object_unlock(file_object);
	2717	}
	2718	file_offset += file_object->vo_shadow_offset;
	2719	file_object = shadow;
	2720	}
	2721
	2722	mtime.tv_sec = 0;
	2723	mtime.tv_nsec = 0;
	2724	cs_mtime.tv_sec = 0;
	2725	cs_mtime.tv_nsec = 0;
	2726
	2727	/* get file's pathname and/or filename */
	2728	pathname = NULL;
	2729	filename = NULL;
	2730	pathname_len = 0;
	2731	filename_len = 0;
	2732	truncated_path = FALSE;
	2733	/* no pager -> no file -> no pathname, use "<nil>" in that case */
	2734	if (file_object->pager != NULL) {
	2735	pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK);
	2736	if (pathname) {
	2737	pathname[0] = '\0';
	2738	pathname_len = __PATH_MAX;
	2739	filename = pathname + pathname_len;
	2740	filename_len = __PATH_MAX;
	2741
	2742	if (vnode_pager_get_object_name(file_object->pager,
	2743	pathname,
	2744	pathname_len,
	2745	filename,
	2746	filename_len,
	2747	&truncated_path) == KERN_SUCCESS) {
	2748	/* safety first... */
	2749	pathname[__PATH_MAX - 1] = '\0';
	2750	filename[__PATH_MAX - 1] = '\0';
	2751
	2752	vnode_pager_get_object_mtime(file_object->pager,
	2753	&mtime,
	2754	&cs_mtime);
2755	} else {
2756	kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
2757	pathname = NULL;
2758	filename = NULL;
2759	pathname_len = 0;
2760	filename_len = 0;
2761	truncated_path = FALSE;
15129b1c	2762	}
15129b1c	2763	}
f427ee49 A	2764	}
	2765	printf("CODE SIGNING: process %d[%s]: "
	2766	"rejecting invalid page at address 0x%llx "
	2767	"from offset 0x%llx in file \"%s%s%s\" "
	2768	"(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
	2769	"(signed:%d validated:%d tainted:%d nx:%d "
	2770	"wpmapped:%d dirty:%d depth:%d)\n",
	2771	pid, procname, (addr64_t) vaddr,
	2772	file_offset,
	2773	(pathname ? pathname : "<nil>"),
	2774	(truncated_path ? "/.../" : ""),
	2775	(truncated_path ? filename : ""),
	2776	cs_mtime.tv_sec, cs_mtime.tv_nsec,
	2777	((cs_mtime.tv_sec == mtime.tv_sec &&
	2778	cs_mtime.tv_nsec == mtime.tv_nsec)
	2779	? "=="
	2780	: "!="),
	2781	mtime.tv_sec, mtime.tv_nsec,
	2782	object->code_signed,
	2783	VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
	2784	VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
	2785	VMP_CS_NX(m, fault_page_size, fault_phys_offset),
	2786	m->vmp_wpmapped,
	2787	m->vmp_dirty,
	2788	shadow_depth);
15129b1c	2789
f427ee49 A	2790	/*
	2791	* We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
	2792	* did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
	2793	* process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
	2794	* will deal with the segmentation fault.
	2795	*/
	2796	if (cs_killed) {
	2797	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
	2798	pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
15129b1c	2799
f427ee49 A	2800	codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
	2801	if (codesigning_exit_reason == NULL) {
	2802	printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
	2803	} else {
	2804	mach_vm_address_t data_addr = 0;
	2805	struct codesigning_exit_reason_info *ceri = NULL;
	2806	uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
	2807
	2808	if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
	2809	printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
	2810	} else {
	2811	if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
	2812	EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
	2813	ceri = (struct codesigning_exit_reason_info *)data_addr;
	2814	static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
	2815
	2816	ceri->ceri_virt_addr = vaddr;
	2817	ceri->ceri_file_offset = file_offset;
	2818	if (pathname) {
	2819	strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
	2820	} else {
	2821	ceri->ceri_pathname[0] = '\0';
	2822	}
	2823	if (filename) {
	2824	strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
	2825	} else {
	2826	ceri->ceri_filename[0] = '\0';
	2827	}
	2828	ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
	2829	ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
	2830	ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
	2831	ceri->ceri_page_modtime_secs = mtime.tv_sec;
	2832	ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
	2833	ceri->ceri_object_codesigned = (object->code_signed);
	2834	ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
	2835	ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
	2836	ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
	2837	ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
	2838	ceri->ceri_page_slid = 0;
	2839	ceri->ceri_page_dirty = (m->vmp_dirty);
	2840	ceri->ceri_page_shadow_depth = shadow_depth;
cb323159	2841	} else {
f427ee49 A	2842	#if DEBUG \|\| DEVELOPMENT
	2843	panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
	2844	#else
	2845	printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
	2846	#endif /* DEBUG \|\| DEVELOPMENT */
	2847	/* Free the buffer */
	2848	os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
cb323159	2849	}
15129b1c	2850	}
15129b1c	2851	}
f427ee49 A	2852
	2853	set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
	2854	}
	2855	if (panic_on_cs_killed &&
	2856	object->object_is_shared_cache) {
	2857	char *tainted_contents;
	2858	vm_map_offset_t src_vaddr;
	2859	src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
	2860	tainted_contents = kalloc(PAGE_SIZE);
	2861	bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
	2862	printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
	2863	panic("CODE SIGNING: process %d[%s]: "
	2864	"rejecting invalid page (phys#0x%x) at address 0x%llx "
0a7de745 A	2865	"from offset 0x%llx in file \"%s%s%s\" "
0a7de745 A	2866	"(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
f427ee49	2867	"(signed:%d validated:%d tainted:%d nx:%d"
0a7de745	2868	"wpmapped:%d dirty:%d depth:%d)\n",
f427ee49 A	2869	pid, procname,
	2870	VM_PAGE_GET_PHYS_PAGE(m),
	2871	(addr64_t) vaddr,
0a7de745 A	2872	file_offset,
	2873	(pathname ? pathname : "<nil>"),
	2874	(truncated_path ? "/.../" : ""),
	2875	(truncated_path ? filename : ""),
	2876	cs_mtime.tv_sec, cs_mtime.tv_nsec,
	2877	((cs_mtime.tv_sec == mtime.tv_sec &&
	2878	cs_mtime.tv_nsec == mtime.tv_nsec)
	2879	? "=="
	2880	: "!="),
	2881	mtime.tv_sec, mtime.tv_nsec,
	2882	object->code_signed,
f427ee49 A	2883	VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
	2884	VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
	2885	VMP_CS_NX(m, fault_page_size, fault_phys_offset),
0a7de745 A	2886	m->vmp_wpmapped,
	2887	m->vmp_dirty,
	2888	shadow_depth);
f427ee49	2889	}
39037602	2890
f427ee49 A	2891	if (file_object != object) {
	2892	vm_object_unlock(file_object);
	2893	}
	2894	if (pathname_len != 0) {
	2895	kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
	2896	pathname = NULL;
	2897	filename = NULL;
	2898	}
	2899	} else {
	2900	/* proceed with the invalid page */
	2901	kr = KERN_SUCCESS;
	2902	if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
	2903	!object->code_signed) {
39037602	2904	/*
f427ee49 A	2905	* This page has not been (fully) validated but
	2906	* does not belong to a code-signed object
	2907	* so it should not be forcefully considered
	2908	* as tainted.
	2909	* We're just concerned about it here because
	2910	* we've been asked to "execute" it but that
	2911	* does not mean that it should cause other
	2912	* accesses to fail.
	2913	* This happens when a debugger sets a
	2914	* breakpoint and we then execute code in
	2915	* that page. Marking the page as "tainted"
	2916	* would cause any inspection tool ("leaks",
	2917	* "vmmap", "CrashReporter", ...) to get killed
	2918	* due to code-signing violation on that page,
	2919	* even though they're just reading it and not
	2920	* executing from it.
39037602	2921	*/
b0d623f7	2922	} else {
f427ee49 A	2923	/*
	2924	* Page might have been tainted before or not;
	2925	* now it definitively is. If the page wasn't
	2926	* tainted, we must disconnect it from all
	2927	* pmaps later, to force existing mappings
	2928	* through that code path for re-consideration
	2929	* of the validity of that page.
	2930	*/
	2931	if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
	2932	*must_disconnect = TRUE;
	2933	VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
fe8ab488	2934	}
2d21ac55	2935	}
f427ee49 A	2936	cs_enter_tainted_accepted++;
	2937	}
	2938	if (kr != KERN_SUCCESS) {
	2939	if (cs_debug) {
	2940	printf("CODESIGNING: vm_fault_enter(0x%llx): "
	2941	"* INVALID PAGE *\n",
	2942	(long long)vaddr);
	2943	}
39236c6e	2944	#if !SECURE_KERNEL
f427ee49 A	2945	if (cs_enforcement_panic) {
f427ee49 A	2946	panic("CODESIGNING: panicking on invalid page\n");
2d21ac55	2947	}
f427ee49	2948	#endif
2d21ac55	2949	}
f427ee49 A	2950	return kr;
f427ee49 A	2951	}
2d21ac55	2952
f427ee49 A	2953	/*
	2954	* Check that the code signature is valid for the given page being inserted into
	2955	* the pmap.
	2956	*
	2957	* @param must_disconnect This value will be set to true if the caller must disconnect
	2958	* this page.
	2959	* @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
	2960	*/
	2961	static kern_return_t
	2962	vm_fault_validate_cs(
	2963	bool cs_bypass,
	2964	vm_object_t object,
	2965	vm_page_t m,
	2966	pmap_t pmap,
	2967	vm_map_offset_t vaddr,
	2968	vm_prot_t prot,
	2969	vm_prot_t caller_prot,
	2970	vm_map_size_t fault_page_size,
	2971	vm_map_offset_t fault_phys_offset,
	2972	vm_object_fault_info_t fault_info,
	2973	bool *must_disconnect)
	2974	{
	2975	bool map_is_switched, map_is_switch_protected, cs_violation;
	2976	kern_return_t kr;
	2977	/* Validate code signature if necessary. */
	2978	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
	2979	(pmap == vm_map_pmap(current_thread()->map)));
	2980	map_is_switch_protected = current_thread()->map->switch_protect;
	2981	kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
	2982	prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
	2983	map_is_switched, map_is_switch_protected, &cs_violation);
	2984	if (kr != KERN_SUCCESS) {
	2985	return kr;
	2986	}
	2987	if (cs_violation) {
	2988	kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
	2989	fault_page_size, fault_phys_offset,
	2990	map_is_switched, map_is_switch_protected, must_disconnect);
	2991	}
	2992	return kr;
	2993	}
	2994
	2995	/*
	2996	* Enqueue the page on the appropriate paging queue.
	2997	*/
	2998	static void
	2999	vm_fault_enqueue_page(
	3000	vm_object_t object,
	3001	vm_page_t m,
	3002	bool wired,
	3003	bool change_wiring,
	3004	vm_tag_t wire_tag,
	3005	bool no_cache,
	3006	int *type_of_fault,
	3007	kern_return_t kr)
	3008	{
	3009	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) \|\| object != compressor_object);
0a7de745	3010	boolean_t page_queues_locked = FALSE;
f427ee49	3011	boolean_t previously_pmapped = m->vmp_pmapped;
0a7de745 A	3012	#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
	3013	MACRO_BEGIN \
	3014	if (! page_queues_locked) { \
	3015	page_queues_locked = TRUE; \
	3016	vm_page_lockspin_queues(); \
	3017	} \
39236c6e	3018	MACRO_END
0a7de745 A	3019	#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
	3020	MACRO_BEGIN \
	3021	if (page_queues_locked) { \
	3022	page_queues_locked = FALSE; \
	3023	vm_page_unlock_queues(); \
	3024	} \
39236c6e A	3025	MACRO_END
39236c6e A	3026
39037602 A	3027	#if CONFIG_BACKGROUND_QUEUE
	3028	vm_page_update_background_state(m);
	3029	#endif
d9a64523	3030	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
39236c6e A	3031	/*
	3032	* Compressor pages are neither wired
	3033	* nor pageable and should never change.
	3034	*/
39037602	3035	assert(object == compressor_object);
39236c6e	3036	} else if (change_wiring) {
0a7de745	3037	__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
39236c6e A	3038
	3039	if (wired) {
	3040	if (kr == KERN_SUCCESS) {
5ba3f43e	3041	vm_page_wire(m, wire_tag, TRUE);
39236c6e A	3042	}
39236c6e A	3043	} else {
0a7de745	3044	vm_page_unwire(m, TRUE);
39236c6e A	3045	}
39236c6e A	3046	/* we keep the page queues lock, if we need it later */
39236c6e	3047	} else {
39037602 A	3048	if (object->internal == TRUE) {
	3049	/*
	3050	* don't allow anonymous pages on
	3051	* the speculative queues
	3052	*/
	3053	no_cache = FALSE;
	3054	}
0a7de745 A	3055	if (kr != KERN_SUCCESS) {
	3056	__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
	3057	vm_page_deactivate(m);
39236c6e	3058	/* we keep the page queues lock, if we need it later */
d9a64523	3059	} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) \|\|
0a7de745 A	3060	(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) \|\|
	3061	(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) \|\|
	3062	((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
	3063	!VM_PAGE_WIRED(m)) {
5ba3f43e	3064	if (vm_page_local_q &&
39236c6e	3065	(*type_of_fault == DBG_COW_FAULT \|\|
0a7de745 A	3066	*type_of_fault == DBG_ZERO_FILL_FAULT)) {
	3067	struct vpl *lq;
	3068	uint32_t lid;
39236c6e	3069
d9a64523	3070	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
39037602	3071
39236c6e	3072	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
39037602	3073	vm_object_lock_assert_exclusive(object);
39236c6e A	3074
	3075	/*
	3076	* we got a local queue to stuff this
	3077	* new page on...
	3078	* its safe to manipulate local and
	3079	* local_id at this point since we're
	3080	* behind an exclusive object lock and
	3081	* the page is not on any global queue.
	3082	*
	3083	* we'll use the current cpu number to
	3084	* select the queue note that we don't
	3085	* need to disable preemption... we're
39037602	3086	* going to be behind the local queue's
39236c6e A	3087	* lock to do the real work
	3088	*/
	3089	lid = cpu_number();
	3090
f427ee49	3091	lq = zpercpu_get_cpu(vm_page_local_q, lid);
39236c6e A	3092
	3093	VPL_LOCK(&lq->vpl_lock);
	3094
3e170ce0	3095	vm_page_check_pageable_safe(m);
0a7de745	3096	vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
d9a64523 A	3097	m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
d9a64523 A	3098	m->vmp_local_id = lid;
39236c6e	3099	lq->vpl_count++;
5ba3f43e	3100
0a7de745	3101	if (object->internal) {
39236c6e	3102	lq->vpl_internal_count++;
0a7de745	3103	} else {
39236c6e	3104	lq->vpl_external_count++;
0a7de745	3105	}
39236c6e A	3106
	3107	VPL_UNLOCK(&lq->vpl_lock);
	3108
0a7de745	3109	if (lq->vpl_count > vm_page_local_q_soft_limit) {
39236c6e A	3110	/*
	3111	* we're beyond the soft limit
	3112	* for the local queue
	3113	* vm_page_reactivate_local will
	3114	* 'try' to take the global page
	3115	* queue lock... if it can't
	3116	* that's ok... we'll let the
	3117	* queue continue to grow up
	3118	* to the hard limit... at that
	3119	* point we'll wait for the
	3120	* lock... once we've got the
	3121	* lock, we'll transfer all of
	3122	* the pages from the local
	3123	* queue to the global active
	3124	* queue
	3125	*/
	3126	vm_page_reactivate_local(lid, FALSE, FALSE);
	3127	}
	3128	} else {
39236c6e A	3129	__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
	3130
	3131	/*
	3132	* test again now that we hold the
	3133	* page queue lock
	3134	*/
	3135	if (!VM_PAGE_WIRED(m)) {
d9a64523	3136	if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
39037602	3137	vm_page_queues_remove(m, FALSE);
39236c6e	3138
d9a64523 A	3139	VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
d9a64523 A	3140	VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
39236c6e A	3141	}
39236c6e A	3142
0a7de745 A	3143	if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) \|\|
0a7de745 A	3144	no_cache) {
39236c6e A	3145	/*
	3146	* If this is a no_cache mapping
	3147	* and the page has never been
	3148	* mapped before or was
	3149	* previously a no_cache page,
	3150	* then we want to leave pages
	3151	* in the speculative state so
	3152	* that they can be readily
	3153	* recycled if free memory runs
	3154	* low. Otherwise the page is
5ba3f43e	3155	* activated as normal.
39236c6e A	3156	*/
	3157
	3158	if (no_cache &&
	3159	(!previously_pmapped \|\|
0a7de745	3160	m->vmp_no_cache)) {
d9a64523	3161	m->vmp_no_cache = TRUE;
39236c6e	3162
0a7de745	3163	if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
39236c6e	3164	vm_page_speculate(m, FALSE);
0a7de745 A	3165	}
0a7de745 A	3166	} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
39236c6e A	3167	vm_page_activate(m);
	3168	}
	3169	}
	3170	}
	3171	/* we keep the page queues lock, if we need it later */
	3172	}
	3173	}
	3174	}
39236c6e A	3175	/* we're done with the page queues lock, if we ever took it */
39236c6e A	3176	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
f427ee49 A	3177	}
	3178
	3179	/*
	3180	* Sets the pmmpped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
	3181	* @return true if the page needs to be sync'ed via pmap_sync-page_data_physo
	3182	* before being inserted into the pmap.
	3183	*/
	3184	static bool
	3185	vm_fault_enter_set_mapped(
	3186	vm_object_t object,
	3187	vm_page_t m,
	3188	vm_prot_t prot,
	3189	vm_prot_t fault_type)
	3190	{
	3191	bool page_needs_sync = false;
	3192	/*
	3193	* NOTE: we may only hold the vm_object lock SHARED
	3194	* at this point, so we need the phys_page lock to
	3195	* properly serialize updating the pmapped and
	3196	* xpmapped bits
	3197	*/
	3198	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
	3199	ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
	3200
	3201	pmap_lock_phys_page(phys_page);
	3202	m->vmp_pmapped = TRUE;
	3203
	3204	if (!m->vmp_xpmapped) {
	3205	m->vmp_xpmapped = TRUE;
	3206
	3207	pmap_unlock_phys_page(phys_page);
	3208
	3209	if (!object->internal) {
	3210	OSAddAtomic(1, &vm_page_xpmapped_external_count);
	3211	}
	3212
	3213	#if defined(__arm__) \|\| defined(__arm64__)
	3214	page_needs_sync = true;
	3215	#else
	3216	if (object->internal &&
	3217	object->pager != NULL) {
	3218	/*
	3219	* This page could have been
	3220	* uncompressed by the
	3221	* compressor pager and its
	3222	* contents might be only in
	3223	* the data cache.
	3224	* Since it's being mapped for
	3225	* "execute" for the fist time,
	3226	* make sure the icache is in
	3227	* sync.
	3228	*/
	3229	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
	3230	page_needs_sync = true;
	3231	}
	3232	#endif
	3233	} else {
	3234	pmap_unlock_phys_page(phys_page);
	3235	}
	3236	} else {
	3237	if (m->vmp_pmapped == FALSE) {
	3238	ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
	3239
	3240	pmap_lock_phys_page(phys_page);
3241	m->vmp_pmapped = TRUE;
3242	pmap_unlock_phys_page(phys_page);
3243	}
3244	}
3245
3246	if (fault_type & VM_PROT_WRITE) {
3247	if (m->vmp_wpmapped == FALSE) {
3248	vm_object_lock_assert_exclusive(object);
3249	if (!object->internal && object->pager) {
3250	task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3251	}
3252	m->vmp_wpmapped = TRUE;
3253	}
3254	}
3255	return page_needs_sync;
3256	}
39236c6e	3257
f427ee49 A	3258	/*
	3259	* Try to enter the given page into the pmap.
	3260	* Will retry without execute permission iff PMAP_CS is enabled and we encounter
	3261	* a codesigning failure on a non-execute fault.
	3262	*/
	3263	static kern_return_t
	3264	vm_fault_attempt_pmap_enter(
	3265	pmap_t pmap,
	3266	vm_map_offset_t vaddr,
	3267	vm_map_size_t fault_page_size,
	3268	vm_map_offset_t fault_phys_offset,
	3269	vm_page_t m,
	3270	vm_prot_t *prot,
	3271	vm_prot_t caller_prot,
	3272	vm_prot_t fault_type,
	3273	bool wired,
	3274	int pmap_options)
	3275	{
	3276	#if !PMAP_CS
	3277	#pragma unused(caller_prot)
	3278	#endif /* !PMAP_CS */
	3279	kern_return_t kr;
	3280	if (fault_page_size != PAGE_SIZE) {
	3281	DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
	3282	assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
	3283	fault_phys_offset < PAGE_SIZE),
	3284	"0x%llx\n", (uint64_t)fault_phys_offset);
	3285	} else {
	3286	assertf(fault_phys_offset == 0,
	3287	"0x%llx\n", (uint64_t)fault_phys_offset);
	3288	}
39236c6e	3289
f427ee49 A	3290	PMAP_ENTER_OPTIONS(pmap, vaddr,
	3291	fault_phys_offset,
	3292	m, *prot, fault_type, 0,
	3293	wired,
	3294	pmap_options,
	3295	kr);
f427ee49 A	3296	return kr;
	3297	}
	3298
	3299	/*
	3300	* Enter the given page into the pmap.
	3301	* The map must be locked shared.
	3302	* The vm object must NOT be locked.
	3303	*
	3304	* @param need_retry if not null, avoid making a (potentially) blocking call into
	3305	* the pmap layer. When such a call would be necessary, return true in this boolean instead.
	3306	*/
	3307	static kern_return_t
	3308	vm_fault_pmap_enter(
	3309	pmap_t pmap,
	3310	vm_map_offset_t vaddr,
	3311	vm_map_size_t fault_page_size,
	3312	vm_map_offset_t fault_phys_offset,
	3313	vm_page_t m,
	3314	vm_prot_t *prot,
	3315	vm_prot_t caller_prot,
	3316	vm_prot_t fault_type,
	3317	bool wired,
	3318	int pmap_options,
	3319	boolean_t *need_retry)
	3320	{
	3321	kern_return_t kr;
	3322	if (need_retry != NULL) {
0a7de745	3323	/*
f427ee49 A	3324	* Although we don't hold a lock on this object, we hold a lock
	3325	* on the top object in the chain. To prevent a deadlock, we
	3326	* can't allow the pmap layer to block.
2d21ac55	3327	*/
f427ee49 A	3328	pmap_options \|= PMAP_OPTIONS_NOWAIT;
	3329	}
	3330	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
	3331	fault_page_size, fault_phys_offset,
	3332	m, prot, caller_prot, fault_type, wired, pmap_options);
	3333	if (kr == KERN_RESOURCE_SHORTAGE) {
	3334	if (need_retry) {
	3335	/*
	3336	* There's nothing we can do here since we hold the
	3337	* lock on the top object in the chain. The caller
	3338	* will need to deal with this by dropping that lock and retrying.
	3339	*/
	3340	*need_retry = TRUE;
	3341	vm_pmap_enter_retried++;
	3342	}
	3343	}
	3344	return kr;
	3345	}
fe8ab488	3346
f427ee49 A	3347	/*
	3348	* Enter the given page into the pmap.
	3349	* The vm map must be locked shared.
	3350	* The vm object must be locked exclusive, unless this is a soft fault.
	3351	* For a soft fault, the object must be locked shared or exclusive.
	3352	*
	3353	* @param need_retry if not null, avoid making a (potentially) blocking call into
	3354	* the pmap layer. When such a call would be necessary, return true in this boolean instead.
	3355	*/
	3356	static kern_return_t
	3357	vm_fault_pmap_enter_with_object_lock(
	3358	vm_object_t object,
	3359	pmap_t pmap,
	3360	vm_map_offset_t vaddr,
	3361	vm_map_size_t fault_page_size,
	3362	vm_map_offset_t fault_phys_offset,
	3363	vm_page_t m,
	3364	vm_prot_t *prot,
	3365	vm_prot_t caller_prot,
	3366	vm_prot_t fault_type,
	3367	bool wired,
	3368	int pmap_options,
	3369	boolean_t *need_retry)
	3370	{
	3371	kern_return_t kr;
	3372	/*
	3373	* Prevent a deadlock by not
	3374	* holding the object lock if we need to wait for a page in
	3375	* pmap_enter() - <rdar://problem/7138958>
	3376	*/
	3377	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
	3378	fault_page_size, fault_phys_offset,
	3379	m, prot, caller_prot, fault_type, wired, pmap_options \| PMAP_OPTIONS_NOWAIT);
	3380	#if __x86_64__
	3381	if (kr == KERN_INVALID_ARGUMENT &&
	3382	pmap == PMAP_NULL &&
	3383	wired) {
	3384	/*
	3385	* Wiring a page in a pmap-less VM map:
	3386	* VMware's "vmmon" kernel extension does this
	3387	* to grab pages.
	3388	* Let it proceed even though the PMAP_ENTER() failed.
	3389	*/
	3390	kr = KERN_SUCCESS;
	3391	}
	3392	#endif /* __x86_64__ */
	3393
	3394	if (kr == KERN_RESOURCE_SHORTAGE) {
	3395	if (need_retry) {
fe8ab488	3396	/*
f427ee49 A	3397	* this will be non-null in the case where we hold the lock
	3398	* on the top-object in this chain... we can't just drop
	3399	* the lock on the object we're inserting the page into
	3400	* and recall the PMAP_ENTER since we can still cause
	3401	* a deadlock if one of the critical paths tries to
	3402	* acquire the lock on the top-object and we're blocked
	3403	* in PMAP_ENTER waiting for memory... our only recourse
	3404	* is to deal with it at a higher level where we can
	3405	* drop both locks.
fe8ab488	3406	*/
f427ee49 A	3407	*need_retry = TRUE;
	3408	vm_pmap_enter_retried++;
	3409	goto done;
	3410	}
	3411	/*
	3412	* The nonblocking version of pmap_enter did not succeed.
	3413	* and we don't need to drop other locks and retry
	3414	* at the level above us, so
	3415	* use the blocking version instead. Requires marking
	3416	* the page busy and unlocking the object
	3417	*/
	3418	boolean_t was_busy = m->vmp_busy;
5ba3f43e	3419
f427ee49	3420	vm_object_lock_assert_exclusive(object);
fe8ab488	3421
f427ee49 A	3422	m->vmp_busy = TRUE;
f427ee49 A	3423	vm_object_unlock(object);
fe8ab488	3424
f427ee49 A	3425	PMAP_ENTER_OPTIONS(pmap, vaddr,
	3426	fault_phys_offset,
	3427	m, *prot, fault_type,
	3428	0, wired,
	3429	pmap_options, kr);
fe8ab488	3430
f427ee49 A	3431	assert(VM_PAGE_OBJECT(m) == object);
	3432
	3433	/* Take the object lock again. */
	3434	vm_object_lock(object);
	3435
	3436	/* If the page was busy, someone else will wake it up.
	3437	* Otherwise, we have to do it now. */
	3438	assert(m->vmp_busy);
	3439	if (!was_busy) {
	3440	PAGE_WAKEUP_DONE(m);
	3441	}
	3442	vm_pmap_enter_blocked++;
	3443	}
	3444
	3445	done:
	3446	return kr;
	3447	}
	3448
	3449	/*
	3450	* Prepare to enter a page into the pmap by checking CS, protection bits,
	3451	* and setting mapped bits on the page_t.
	3452	* Does not modify the page's paging queue.
	3453	*
	3454	* page queue lock must NOT be held
	3455	* m->vmp_object must be locked
	3456	*
	3457	* NOTE: m->vmp_object could be locked "shared" only if we are called
	3458	* from vm_fault() as part of a soft fault.
	3459	*/
	3460	static kern_return_t
	3461	vm_fault_enter_prepare(
	3462	vm_page_t m,
	3463	pmap_t pmap,
	3464	vm_map_offset_t vaddr,
	3465	vm_prot_t *prot,
	3466	vm_prot_t caller_prot,
	3467	vm_map_size_t fault_page_size,
	3468	vm_map_offset_t fault_phys_offset,
	3469	boolean_t change_wiring,
	3470	vm_prot_t fault_type,
	3471	vm_object_fault_info_t fault_info,
	3472	int *type_of_fault,
	3473	bool *page_needs_data_sync)
	3474	{
	3475	kern_return_t kr;
	3476	bool is_tainted = false;
	3477	vm_object_t object;
	3478	boolean_t cs_bypass = fault_info->cs_bypass;
	3479
	3480	object = VM_PAGE_OBJECT(m);
	3481
	3482	vm_object_lock_assert_held(object);
	3483
	3484	#if KASAN
	3485	if (pmap == kernel_pmap) {
	3486	kasan_notify_address(vaddr, PAGE_SIZE);
	3487	}
5ba3f43e	3488	#endif
f427ee49 A	3489
	3490	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
	3491
	3492	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
	3493	vm_object_lock_assert_exclusive(object);
	3494	} else if ((fault_type & VM_PROT_WRITE) == 0 &&
	3495	!change_wiring &&
	3496	(!m->vmp_wpmapped
	3497	#if VM_OBJECT_ACCESS_TRACKING
	3498	\|\| object->access_tracking
	3499	#endif /* VM_OBJECT_ACCESS_TRACKING */
	3500	)) {
	3501	/*
	3502	* This is not a "write" fault, so we
	3503	* might not have taken the object lock
	3504	* exclusively and we might not be able
	3505	* to update the "wpmapped" bit in
	3506	* vm_fault_enter().
	3507	* Let's just grant read access to
	3508	* the page for now and we'll
	3509	* soft-fault again if we need write
	3510	* access later...
	3511	*/
	3512
	3513	/* This had better not be a JIT page. */
	3514	if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
	3515	*prot &= ~VM_PROT_WRITE;
fe8ab488	3516	} else {
f427ee49 A	3517	assert(cs_bypass);
	3518	}
	3519	}
	3520	if (m->vmp_pmapped == FALSE) {
	3521	if (m->vmp_clustered) {
	3522	if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
	3523	/*
	3524	* found it in the cache, but this
	3525	* is the first fault-in of the page (m->vmp_pmapped == FALSE)
	3526	* so it must have come in as part of
	3527	* a cluster... account 1 pagein against it
	3528	*/
	3529	if (object->internal) {
	3530	*type_of_fault = DBG_PAGEIND_FAULT;
	3531	} else {
	3532	*type_of_fault = DBG_PAGEINV_FAULT;
	3533	}
39037602	3534
f427ee49	3535	VM_PAGE_COUNT_AS_PAGEIN(m);
fe8ab488	3536	}
f427ee49	3537	VM_PAGE_CONSUME_CLUSTERED(m);
fe8ab488	3538	}
f427ee49 A	3539	}
	3540
	3541	if (*type_of_fault != DBG_COW_FAULT) {
	3542	DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
	3543
	3544	if (pmap == kernel_pmap) {
	3545	DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
	3546	}
	3547	}
	3548
	3549	kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
	3550	*prot, caller_prot, fault_page_size, fault_phys_offset,
	3551	fault_info, &is_tainted);
	3552	if (kr == KERN_SUCCESS) {
	3553	/*
	3554	* We either have a good page, or a tainted page that has been accepted by the process.
	3555	* In both cases the page will be entered into the pmap.
	3556	*/
	3557	page_needs_data_sync = vm_fault_enter_set_mapped(object, m, prot, fault_type);
	3558	if ((fault_type & VM_PROT_WRITE) && is_tainted) {
	3559	/*
	3560	* This page is tainted but we're inserting it anyways.
	3561	* Since it's writeable, we need to disconnect it from other pmaps
	3562	* now so those processes can take note.
	3563	*/
6d2010ae	3564
f427ee49 A	3565	/*
	3566	* We can only get here
	3567	* because of the CSE logic
	3568	*/
	3569	assert(pmap_get_vm_map_cs_enforced(pmap));
	3570	pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
	3571	/*
	3572	* If we are faulting for a write, we can clear
	3573	* the execute bit - that will ensure the page is
	3574	* checked again before being executable, which
	3575	* protects against a map switch.
	3576	* This only happens the first time the page
	3577	* gets tainted, so we won't get stuck here
	3578	* to make an already writeable page executable.
	3579	*/
	3580	if (!cs_bypass) {
	3581	assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
	3582	*prot &= ~VM_PROT_EXECUTE;
b0d623f7	3583	}
4a3eedf9	3584	}
39037602	3585	assert(VM_PAGE_OBJECT(m) == object);
d1ecb069	3586
d9a64523 A	3587	#if VM_OBJECT_ACCESS_TRACKING
	3588	if (object->access_tracking) {
	3589	DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
	3590	if (fault_type & VM_PROT_WRITE) {
	3591	object->access_tracking_writes++;
	3592	vm_object_access_tracking_writes++;
	3593	} else {
	3594	object->access_tracking_reads++;
	3595	vm_object_access_tracking_reads++;
	3596	}
	3597	}
	3598	#endif /* VM_OBJECT_ACCESS_TRACKING */
f427ee49	3599	}
d9a64523	3600
f427ee49 A	3601	return kr;
f427ee49 A	3602	}
cb323159	3603
f427ee49 A	3604	/*
	3605	* page queue lock must NOT be held
	3606	* m->vmp_object must be locked
	3607	*
	3608	* NOTE: m->vmp_object could be locked "shared" only if we are called
	3609	* from vm_fault() as part of a soft fault. If so, we must be
	3610	* careful not to modify the VM object in any way that is not
	3611	* legal under a shared lock...
	3612	*/
	3613	kern_return_t
	3614	vm_fault_enter(
	3615	vm_page_t m,
	3616	pmap_t pmap,
	3617	vm_map_offset_t vaddr,
	3618	vm_map_size_t fault_page_size,
	3619	vm_map_offset_t fault_phys_offset,
	3620	vm_prot_t prot,
	3621	vm_prot_t caller_prot,
	3622	boolean_t wired,
	3623	boolean_t change_wiring,
	3624	vm_tag_t wire_tag,
	3625	vm_object_fault_info_t fault_info,
	3626	boolean_t *need_retry,
	3627	int *type_of_fault)
	3628	{
	3629	kern_return_t kr;
	3630	vm_object_t object;
	3631	bool page_needs_data_sync;
	3632	vm_prot_t fault_type;
	3633	int pmap_options = fault_info->pmap_options;
39236c6e	3634
f427ee49 A	3635	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
	3636	assert(m->vmp_fictitious);
	3637	return KERN_SUCCESS;
	3638	}
5ba3f43e	3639
f427ee49	3640	fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
5ba3f43e	3641
f427ee49 A	3642	kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
	3643	fault_page_size, fault_phys_offset, change_wiring, fault_type,
	3644	fault_info, type_of_fault, &page_needs_data_sync);
	3645	object = VM_PAGE_OBJECT(m);
39037602	3646
f427ee49	3647	vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
5ba3f43e	3648
f427ee49 A	3649	if (kr == KERN_SUCCESS) {
	3650	if (page_needs_data_sync) {
	3651	pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
d1ecb069	3652	}
5ba3f43e	3653
f427ee49 A	3654	kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
	3655	fault_page_size, fault_phys_offset, m,
	3656	&prot, caller_prot, fault_type, wired, pmap_options, need_retry);
2d21ac55 A	3657	}
2d21ac55 A	3658
2d21ac55	3659	return kr;
55e303ae A	3660	}
55e303ae A	3661
fe8ab488	3662	void
cb323159	3663	vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
fe8ab488 A	3664	{
fe8ab488 A	3665	if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
5ba3f43e	3666	vm_fault(current_map(), /* map */
0a7de745	3667	vaddr, /* vaddr */
cb323159	3668	prot, /* fault_type */
0a7de745 A	3669	FALSE, /* change_wiring */
	3670	VM_KERN_MEMORY_NONE, /* tag - not wiring */
	3671	THREAD_UNINT, /* interruptible */
	3672	NULL, /* caller_pmap */
	3673	0 /* caller_pmap_addr */);
fe8ab488 A	3674	}
	3675	}
	3676
2d21ac55	3677
1c79356b A	3678	/*
	3679	* Routine: vm_fault
	3680	* Purpose:
	3681	* Handle page faults, including pseudo-faults
	3682	* used to change the wiring status of pages.
	3683	* Returns:
	3684	* Explicit continuations have been removed.
	3685	* Implementation:
	3686	* vm_fault and vm_fault_page save mucho state
	3687	* in the moral equivalent of a closure. The state
	3688	* structure is allocated when first entering vm_fault
	3689	* and deallocated when leaving vm_fault.
	3690	*/
	3691
0a7de745	3692	extern uint64_t get_current_unique_pid(void);
91447636	3693
2d21ac55 A	3694	unsigned long vm_fault_collapse_total = 0;
	3695	unsigned long vm_fault_collapse_skipped = 0;
	3696
39236c6e	3697
1c79356b	3698	kern_return_t
5ba3f43e	3699	vm_fault_external(
0a7de745 A	3700	vm_map_t map,
	3701	vm_map_offset_t vaddr,
	3702	vm_prot_t fault_type,
	3703	boolean_t change_wiring,
	3704	int interruptible,
	3705	pmap_t caller_pmap,
	3706	vm_map_offset_t caller_pmap_addr)
fe8ab488	3707	{
f427ee49 A	3708	return vm_fault_internal(map, vaddr, fault_type, change_wiring,
f427ee49 A	3709	change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
0a7de745 A	3710	interruptible, caller_pmap, caller_pmap_addr,
0a7de745 A	3711	NULL);
fe8ab488 A	3712	}
fe8ab488 A	3713
5ba3f43e A	3714	kern_return_t
5ba3f43e A	3715	vm_fault(
0a7de745 A	3716	vm_map_t map,
	3717	vm_map_offset_t vaddr,
	3718	vm_prot_t fault_type,
	3719	boolean_t change_wiring,
	3720	vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
	3721	int interruptible,
	3722	pmap_t caller_pmap,
	3723	vm_map_offset_t caller_pmap_addr)
5ba3f43e A	3724	{
5ba3f43e A	3725	return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
0a7de745 A	3726	interruptible, caller_pmap, caller_pmap_addr,
0a7de745 A	3727	NULL);
5ba3f43e	3728	}
3e170ce0	3729
cb323159 A	3730	static boolean_t
	3731	current_proc_is_privileged(void)
	3732	{
	3733	return csproc_get_platform_binary(current_proc());
	3734	}
	3735
	3736	uint64_t vm_copied_on_read = 0;
	3737
f427ee49 A	3738	/*
	3739	* Cleanup after a vm_fault_enter.
	3740	* At this point, the fault should either have failed (kr != KERN_SUCCESS)
	3741	* or the page should be in the pmap and on the correct paging queue.
	3742	*
	3743	* Precondition:
	3744	* map must be locked shared.
	3745	* m_object must be locked.
	3746	* If top_object != VM_OBJECT_NULL, it must be locked.
	3747	* real_map must be locked.
	3748	*
	3749	* Postcondition:
	3750	* map will be unlocked
	3751	* m_object will be unlocked
	3752	* top_object will be unlocked
	3753	* If real_map != map, it will be unlocked
	3754	*/
	3755	static void
	3756	vm_fault_complete(
	3757	vm_map_t map,
	3758	vm_map_t real_map,
	3759	vm_object_t object,
	3760	vm_object_t m_object,
	3761	vm_page_t m,
	3762	vm_map_offset_t offset,
	3763	vm_map_offset_t trace_real_vaddr,
	3764	vm_object_fault_info_t fault_info,
	3765	vm_prot_t caller_prot,
	3766	#if CONFIG_DTRACE
	3767	vm_map_offset_t real_vaddr,
	3768	#else
	3769	__unused vm_map_offset_t real_vaddr,
	3770	#endif /* CONFIG_DTRACE */
	3771	int type_of_fault,
	3772	boolean_t need_retry,
	3773	kern_return_t kr,
	3774	ppnum_t *physpage_p,
	3775	vm_prot_t prot,
	3776	vm_object_t top_object,
	3777	boolean_t need_collapse,
	3778	vm_map_offset_t cur_offset,
	3779	vm_prot_t fault_type,
	3780	vm_object_t *written_on_object,
	3781	memory_object_t *written_on_pager,
	3782	vm_object_offset_t *written_on_offset)
	3783	{
	3784	int event_code = 0;
	3785	vm_map_lock_assert_shared(map);
	3786	vm_object_lock_assert_held(m_object);
	3787	if (top_object != VM_OBJECT_NULL) {
	3788	vm_object_lock_assert_held(top_object);
	3789	}
	3790	vm_map_lock_assert_held(real_map);
	3791
	3792	if (m_object->internal) {
	3793	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
	3794	} else if (m_object->object_is_shared_cache) {
	3795	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
	3796	} else {
	3797	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
	3798	}
	3799
	3800	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) \| (caller_prot << 8) \| type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
	3801	if (need_retry == FALSE) {
3802	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
3803	}
3804	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
3805	if (kr == KERN_SUCCESS &&
3806	physpage_p != NULL) {
3807	/* for vm_map_wire_and_extract() */
3808	*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3809	if (prot & VM_PROT_WRITE) {
3810	vm_object_lock_assert_exclusive(m_object);
3811	m->vmp_dirty = TRUE;
3812	}
3813	}
3814
3815	if (top_object != VM_OBJECT_NULL) {
3816	/*
3817	* It's safe to drop the top object
3818	* now that we've done our
3819	* vm_fault_enter(). Any other fault
3820	* in progress for that virtual
3821	* address will either find our page
3822	* and translation or put in a new page
3823	* and translation.
3824	*/
3825	vm_object_unlock(top_object);
3826	top_object = VM_OBJECT_NULL;
3827	}
3828
3829	if (need_collapse == TRUE) {
3830	vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
3831	}
3832
3833	if (need_retry == FALSE &&
3834	(type_of_fault == DBG_PAGEIND_FAULT \|\| type_of_fault == DBG_PAGEINV_FAULT \|\| type_of_fault == DBG_CACHE_HIT_FAULT)) {
3835	/*
3836	* evaluate access pattern and update state
3837	* vm_fault_deactivate_behind depends on the
3838	* state being up to date
3839	*/
3840	vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
3841
3842	vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
3843	}
3844	/*
3845	* That's it, clean up and return.
3846	*/
3847	if (m->vmp_busy) {
3848	vm_object_lock_assert_exclusive(m_object);
3849	PAGE_WAKEUP_DONE(m);
3850	}
3851
3852	if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
3853	vm_object_paging_begin(m_object);
3854
3855	assert(*written_on_object == VM_OBJECT_NULL);
3856	*written_on_object = m_object;
3857	*written_on_pager = m_object->pager;
3858	*written_on_offset = m_object->paging_offset + m->vmp_offset;
3859	}
3860	vm_object_unlock(object);
3861
3862	vm_map_unlock_read(map);
3863	if (real_map != map) {
3864	vm_map_unlock(real_map);
3865	}
3866	}
3867
3868	static inline int
3869	vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3870	{
3871	if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3872	return DBG_COR_FAULT;
3873	}
3874	return type_of_fault;
3875	}
3876
fe8ab488 A	3877	kern_return_t
fe8ab488 A	3878	vm_fault_internal(
0a7de745 A	3879	vm_map_t map,
	3880	vm_map_offset_t vaddr,
	3881	vm_prot_t caller_prot,
	3882	boolean_t change_wiring,
	3883	vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
	3884	int interruptible,
	3885	pmap_t caller_pmap,
	3886	vm_map_offset_t caller_pmap_addr,
	3887	ppnum_t *physpage_p)
1c79356b	3888	{
0a7de745 A	3889	vm_map_version_t version; /* Map version for verificiation */
	3890	boolean_t wired; /* Should mapping be wired down? */
	3891	vm_object_t object; /* Top-level object */
	3892	vm_object_offset_t offset; /* Top-level offset */
	3893	vm_prot_t prot; /* Protection for mapping */
	3894	vm_object_t old_copy_object; /* Saved copy object */
	3895	vm_page_t result_page; /* Result of vm_fault_page */
	3896	vm_page_t top_page; /* Placeholder page */
	3897	kern_return_t kr;
	3898
	3899	vm_page_t m; /* Fast access to result_page */
	3900	kern_return_t error_code;
	3901	vm_object_t cur_object;
	3902	vm_object_t m_object = NULL;
	3903	vm_object_offset_t cur_offset;
	3904	vm_page_t cur_m;
	3905	vm_object_t new_object;
1c79356b	3906	int type_of_fault;
0a7de745 A	3907	pmap_t pmap;
	3908	wait_interrupt_t interruptible_state;
	3909	vm_map_t real_map = map;
	3910	vm_map_t original_map = map;
f427ee49	3911	bool object_locks_dropped = FALSE;
0a7de745 A	3912	vm_prot_t fault_type;
0a7de745 A	3913	vm_prot_t original_fault_type;
d9a64523	3914	struct vm_object_fault_info fault_info = {};
f427ee49	3915	bool need_collapse = FALSE;
0a7de745 A	3916	boolean_t need_retry = FALSE;
0a7de745 A	3917	boolean_t *need_retry_ptr = NULL;
f427ee49 A	3918	uint8_t object_lock_type = 0;
f427ee49 A	3919	uint8_t cur_object_lock_type;
0a7de745 A	3920	vm_object_t top_object = VM_OBJECT_NULL;
	3921	vm_object_t written_on_object = VM_OBJECT_NULL;
	3922	memory_object_t written_on_pager = NULL;
	3923	vm_object_offset_t written_on_offset = 0;
	3924	int throttle_delay;
	3925	int compressed_count_delta;
f427ee49 A	3926	uint8_t grab_options;
	3927	bool need_copy;
	3928	bool need_copy_on_read;
0a7de745 A	3929	vm_map_offset_t trace_vaddr;
0a7de745 A	3930	vm_map_offset_t trace_real_vaddr;
f427ee49 A	3931	vm_map_size_t fault_page_size;
	3932	vm_map_size_t fault_page_mask;
	3933	vm_map_offset_t fault_phys_offset;
0a7de745	3934	vm_map_offset_t real_vaddr;
f427ee49	3935	bool resilient_media_retry = FALSE;
cb323159 A	3936	vm_object_t resilient_media_object = VM_OBJECT_NULL;
cb323159 A	3937	vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
f427ee49 A	3938	bool page_needs_data_sync = false;
	3939	/*
	3940	* Was the VM object contended when vm_map_lookup_locked locked it?
	3941	* If so, the zero fill path will drop the lock
	3942	* NB: Ideally we would always drop the lock rather than rely on
	3943	* this heuristic, but vm_object_unlock currently takes > 30 cycles.
	3944	*/
	3945	bool object_is_contended = false;
1c79356b	3946
39037602	3947	real_vaddr = vaddr;
d190cdc3	3948	trace_real_vaddr = vaddr;
f427ee49 A	3949
	3950	if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
	3951	fault_phys_offset = (vm_map_offset_t)-1;
	3952	fault_page_size = VM_MAP_PAGE_SIZE(original_map);
	3953	fault_page_mask = VM_MAP_PAGE_MASK(original_map);
	3954	if (fault_page_size < PAGE_SIZE) {
	3955	DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
	3956	vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
	3957	}
	3958	} else {
	3959	fault_phys_offset = 0;
	3960	fault_page_size = PAGE_SIZE;
	3961	fault_page_mask = PAGE_MASK;
	3962	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	3963	}
de355530	3964
d190cdc3	3965	if (map == kernel_map) {
5ba3f43e A	3966	trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
5ba3f43e A	3967	trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
d190cdc3 A	3968	} else {
	3969	trace_vaddr = vaddr;
	3970	}
	3971
5ba3f43e	3972	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
0a7de745 A	3973	(MACHDBG_CODE(DBG_MACH_VM, 2)) \| DBG_FUNC_START,
	3974	((uint64_t)trace_vaddr >> 32),
	3975	trace_vaddr,
	3976	(map == kernel_map),
	3977	0,
	3978	0);
1c79356b	3979
0c530ab8	3980	if (get_preemption_level() != 0) {
0a7de745 A	3981	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	3982	(MACHDBG_CODE(DBG_MACH_VM, 2)) \| DBG_FUNC_END,
	3983	((uint64_t)trace_vaddr >> 32),
	3984	trace_vaddr,
	3985	KERN_FAILURE,
	3986	0,
	3987	0);
	3988
	3989	return KERN_FAILURE;
9bccf70c	3990	}
5ba3f43e	3991
d9a64523	3992	thread_t cthread = current_thread();
f427ee49	3993	bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
d9a64523 A	3994	uint64_t fstart = 0;
	3995
	3996	if (rtfault) {
	3997	fstart = mach_continuous_time();
	3998	}
	3999
9bccf70c	4000	interruptible_state = thread_interrupt_level(interruptible);
1c79356b	4001
3e170ce0 A	4002	fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3e170ce0 A	4003
c3c9b80d A	4004	counter_inc(&vm_statistics_faults);
c3c9b80d A	4005	counter_inc(&current_task()->faults);
2d21ac55 A	4006	original_fault_type = fault_type;
2d21ac55 A	4007
cb323159	4008	need_copy = FALSE;
0a7de745	4009	if (fault_type & VM_PROT_WRITE) {
cb323159 A	4010	need_copy = TRUE;
	4011	}
	4012
f427ee49	4013	if (need_copy \|\| change_wiring) {
0a7de745 A	4014	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4015	} else {
	4016	object_lock_type = OBJECT_LOCK_SHARED;
	4017	}
2d21ac55 A	4018
	4019	cur_object_lock_type = OBJECT_LOCK_SHARED;
	4020
5ba3f43e A	4021	if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
	4022	if (compressor_map) {
	4023	if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
	4024	panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void ) vaddr, caller_prot, (void ) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
5ba3f43e A	4025	}
	4026	}
	4027	}
2d21ac55	4028	RetryFault:
d9a64523 A	4029	assert(written_on_object == VM_OBJECT_NULL);
d9a64523 A	4030
1c79356b A	4031	/*
	4032	* assume we will hit a page in the cache
	4033	* otherwise, explicitly override with
	4034	* the real fault type once we determine it
	4035	*/
	4036	type_of_fault = DBG_CACHE_HIT_FAULT;
	4037
1c79356b A	4038	/*
	4039	* Find the backing store object and offset into
	4040	* it to begin the search.
	4041	*/
0c530ab8	4042	fault_type = original_fault_type;
1c79356b A	4043	map = original_map;
1c79356b A	4044	vm_map_lock_read(map);
1c79356b	4045
cb323159 A	4046	if (resilient_media_retry) {
	4047	/*
	4048	* If we have to insert a fake zero-filled page to hide
	4049	* a media failure to provide the real page, we need to
	4050	* resolve any pending copy-on-write on this mapping.
	4051	* VM_PROT_COPY tells vm_map_lookup_locked() to deal
	4052	* with that even if this is not a "write" fault.
	4053	*/
	4054	need_copy = TRUE;
	4055	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4056	}
	4057
	4058	kr = vm_map_lookup_locked(&map, vaddr,
	4059	(fault_type \| (need_copy ? VM_PROT_COPY : 0)),
0a7de745 A	4060	object_lock_type, &version,
	4061	&object, &offset, &prot, &wired,
	4062	&fault_info,
f427ee49 A	4063	&real_map,
f427ee49 A	4064	&object_is_contended);
1c79356b A	4065
	4066	if (kr != KERN_SUCCESS) {
	4067	vm_map_unlock_read(map);
	4068	goto done;
	4069	}
f427ee49 A	4070
f427ee49 A	4071
2d21ac55 A	4072	pmap = real_map->pmap;
2d21ac55 A	4073	fault_info.interruptible = interruptible;
b0d623f7	4074	fault_info.stealth = FALSE;
6d2010ae	4075	fault_info.io_sync = FALSE;
0b4c1975	4076	fault_info.mark_zf_absent = FALSE;
316670eb	4077	fault_info.batch_pmap_op = FALSE;
1c79356b	4078
cb323159 A	4079	if (resilient_media_retry) {
	4080	/*
	4081	* We're retrying this fault after having detected a media
	4082	* failure from a "resilient_media" mapping.
	4083	* Check that the mapping is still pointing at the object
	4084	* that just failed to provide a page.
	4085	*/
	4086	assert(resilient_media_object != VM_OBJECT_NULL);
	4087	assert(resilient_media_offset != (vm_object_offset_t)-1);
	4088	if (object != VM_OBJECT_NULL &&
	4089	object == resilient_media_object &&
	4090	offset == resilient_media_offset &&
	4091	fault_info.resilient_media) {
	4092	/*
	4093	* This mapping still points at the same object
	4094	* and is still "resilient_media": proceed in
	4095	* "recovery-from-media-failure" mode, where we'll
	4096	* insert a zero-filled page in the top object.
	4097	*/
	4098	// printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
	4099	} else {
	4100	/* not recovering: reset state */
	4101	// printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
	4102	resilient_media_retry = FALSE;
	4103	/* release our extra reference on failed object */
	4104	// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
	4105	vm_object_deallocate(resilient_media_object);
	4106	resilient_media_object = VM_OBJECT_NULL;
	4107	resilient_media_offset = (vm_object_offset_t)-1;
	4108	}
	4109	} else {
	4110	assert(resilient_media_object == VM_OBJECT_NULL);
	4111	resilient_media_offset = (vm_object_offset_t)-1;
	4112	}
	4113
1c79356b	4114	/*
2d21ac55 A	4115	* If the page is wired, we must fault for the current protection
2d21ac55 A	4116	* value, to avoid further faults.
1c79356b	4117	*/
2d21ac55	4118	if (wired) {
1c79356b	4119	fault_type = prot \| VM_PROT_WRITE;
cb323159 A	4120	}
cb323159 A	4121	if (wired \|\| need_copy) {
2d21ac55 A	4122	/*
	4123	* since we're treating this fault as a 'write'
	4124	* we must hold the top object lock exclusively
	4125	*/
	4126	if (object_lock_type == OBJECT_LOCK_SHARED) {
0a7de745	4127	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	4128
2d21ac55 A	4129	if (vm_object_lock_upgrade(object) == FALSE) {
0a7de745	4130	/*
2d21ac55 A	4131	* couldn't upgrade, so explicitly
	4132	* take the lock exclusively
	4133	*/
0a7de745	4134	vm_object_lock(object);
2d21ac55 A	4135	}
	4136	}
	4137	}
1c79356b	4138
0a7de745	4139	#if VM_FAULT_CLASSIFY
1c79356b A	4140	/*
	4141	* Temporary data gathering code
	4142	*/
	4143	vm_fault_classify(object, offset, fault_type);
	4144	#endif
	4145	/*
	4146	* Fast fault code. The basic idea is to do as much as
	4147	* possible while holding the map lock and object locks.
	4148	* Busy pages are not used until the object lock has to
	4149	* be dropped to do something (copy, zero fill, pmap enter).
	4150	* Similarly, paging references aren't acquired until that
	4151	* point, and object references aren't used.
	4152	*
	4153	* If we can figure out what to do
	4154	* (zero fill, copy on write, pmap enter) while holding
	4155	* the locks, then it gets done. Otherwise, we give up,
	4156	* and use the original fault path (which doesn't hold
	4157	* the map lock, and relies on busy pages).
	4158	* The give up cases include:
0a7de745	4159	* - Have to talk to pager.
1c79356b A	4160	* - Page is busy, absent or in error.
	4161	* - Pager has locked out desired access.
	4162	* - Fault needs to be restarted.
	4163	* - Have to push page into copy object.
	4164	*
	4165	* The code is an infinite loop that moves one level down
	4166	* the shadow chain each time. cur_object and cur_offset
0a7de745	4167	* refer to the current object being examined. object and offset
1c79356b A	4168	* are the original object from the map. The loop is at the
	4169	* top level if and only if object and cur_object are the same.
	4170	*
	4171	* Invariants: Map lock is held throughout. Lock is held on
	4172	* original object and cur_object (if different) when
	4173	* continuing or exiting loop.
	4174	*
	4175	*/
	4176
5ba3f43e A	4177	#if defined(__arm64__)
	4178	/*
	4179	* Fail if reading an execute-only page in a
	4180	* pmap that enforces execute-only protection.
	4181	*/
	4182	if (fault_type == VM_PROT_READ &&
0a7de745 A	4183	(prot & VM_PROT_EXECUTE) &&
	4184	!(prot & VM_PROT_READ) &&
	4185	pmap_enforces_execute_only(pmap)) {
	4186	vm_object_unlock(object);
	4187	vm_map_unlock_read(map);
	4188	if (real_map != map) {
	4189	vm_map_unlock(real_map);
	4190	}
	4191	kr = KERN_PROTECTION_FAILURE;
	4192	goto done;
5ba3f43e A	4193	}
5ba3f43e A	4194	#endif
1c79356b	4195
f427ee49 A	4196	fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
f427ee49 A	4197
1c79356b	4198	/*
2d21ac55 A	4199	* If this page is to be inserted in a copy delay object
	4200	* for writing, and if the object has a copy, then the
	4201	* copy delay strategy is implemented in the slow fault page.
1c79356b	4202	*/
2d21ac55	4203	if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
0a7de745 A	4204	object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
	4205	goto handle_copy_delay;
	4206	}
2d21ac55	4207
1c79356b A	4208	cur_object = object;
	4209	cur_offset = offset;
	4210
39037602 A	4211	grab_options = 0;
	4212	#if CONFIG_SECLUDED_MEMORY
	4213	if (object->can_grab_secluded) {
	4214	grab_options \|= VM_PAGE_GRAB_SECLUDED;
	4215	}
	4216	#endif /* CONFIG_SECLUDED_MEMORY */
	4217
1c79356b	4218	while (TRUE) {
b0d623f7	4219	if (!cur_object->pager_created &&
0a7de745	4220	cur_object->phys_contiguous) { /* superpage */
b0d623f7	4221	break;
0a7de745	4222	}
b0d623f7 A	4223
	4224	if (cur_object->blocked_access) {
	4225	/*
	4226	* Access to this VM object has been blocked.
	4227	* Let the slow path handle it.
	4228	*/
	4229	break;
	4230	}
	4231
f427ee49	4232	m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
39037602	4233	m_object = NULL;
2d21ac55	4234
1c79356b	4235	if (m != VM_PAGE_NULL) {
39037602 A	4236	m_object = cur_object;
39037602 A	4237
d9a64523	4238	if (m->vmp_busy) {
0a7de745	4239	wait_result_t result;
143cc14e	4240
2d21ac55 A	4241	/*
	4242	* in order to do the PAGE_ASSERT_WAIT, we must
	4243	* have object that 'm' belongs to locked exclusively
	4244	*/
	4245	if (object != cur_object) {
2d21ac55	4246	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
0a7de745	4247	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	4248
2d21ac55 A	4249	if (vm_object_lock_upgrade(cur_object) == FALSE) {
0a7de745	4250	/*
2d21ac55	4251	* couldn't upgrade so go do a full retry
39236c6e A	4252	* immediately since we can no longer be
	4253	* certain about cur_object (since we
	4254	* don't hold a reference on it)...
	4255	* first drop the top object lock
2d21ac55	4256	*/
39236c6e A	4257	vm_object_unlock(object);
39236c6e A	4258
0a7de745 A	4259	vm_map_unlock_read(map);
	4260	if (real_map != map) {
	4261	vm_map_unlock(real_map);
	4262	}
2d21ac55 A	4263
	4264	goto RetryFault;
	4265	}
	4266	}
	4267	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
0a7de745	4268	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	4269
2d21ac55 A	4270	if (vm_object_lock_upgrade(object) == FALSE) {
0a7de745	4271	/*
2d21ac55 A	4272	* couldn't upgrade, so explicitly take the lock
	4273	* exclusively and go relookup the page since we
	4274	* will have dropped the object lock and
	4275	* a different thread could have inserted
	4276	* a page at this offset
	4277	* no need for a full retry since we're
	4278	* at the top level of the object chain
	4279	*/
0a7de745	4280	vm_object_lock(object);
2d21ac55 A	4281
	4282	continue;
	4283	}
	4284	}
d9a64523	4285	if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
39236c6e	4286	/*
d9a64523	4287	* m->vmp_busy == TRUE and the object is locked exclusively
39236c6e A	4288	* if m->pageout_queue == TRUE after we acquire the
	4289	* queues lock, we are guaranteed that it is stable on
	4290	* the pageout queue and therefore reclaimable
	4291	*
	4292	* NOTE: this is only true for the internal pageout queue
	4293	* in the compressor world
	4294	*/
39037602 A	4295	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
39037602 A	4296
39236c6e A	4297	vm_page_lock_queues();
39236c6e A	4298
d9a64523	4299	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
39236c6e A	4300	vm_pageout_throttle_up(m);
	4301	vm_page_unlock_queues();
	4302
	4303	PAGE_WAKEUP_DONE(m);
	4304	goto reclaimed_from_pageout;
	4305	}
	4306	vm_page_unlock_queues();
	4307	}
0a7de745	4308	if (object != cur_object) {
39236c6e	4309	vm_object_unlock(object);
0a7de745	4310	}
39236c6e	4311
143cc14e	4312	vm_map_unlock_read(map);
0a7de745 A	4313	if (real_map != map) {
	4314	vm_map_unlock(real_map);
	4315	}
143cc14e	4316
143cc14e	4317	result = PAGE_ASSERT_WAIT(m, interruptible);
1c79356b	4318
143cc14e A	4319	vm_object_unlock(cur_object);
	4320
	4321	if (result == THREAD_WAITING) {
0a7de745	4322	result = thread_block(THREAD_CONTINUE_NULL);
143cc14e	4323	}
0a7de745 A	4324	if (result == THREAD_AWAKENED \|\| result == THREAD_RESTART) {
	4325	goto RetryFault;
	4326	}
143cc14e A	4327
	4328	kr = KERN_ABORTED;
	4329	goto done;
	4330	}
39236c6e	4331	reclaimed_from_pageout:
d9a64523	4332	if (m->vmp_laundry) {
316670eb A	4333	if (object != cur_object) {
	4334	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
	4335	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4336
	4337	vm_object_unlock(object);
	4338	vm_object_unlock(cur_object);
	4339
	4340	vm_map_unlock_read(map);
0a7de745	4341	if (real_map != map) {
316670eb	4342	vm_map_unlock(real_map);
0a7de745	4343	}
316670eb A	4344
	4345	goto RetryFault;
	4346	}
316670eb	4347	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
316670eb A	4348	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4349
	4350	if (vm_object_lock_upgrade(object) == FALSE) {
	4351	/*
	4352	* couldn't upgrade, so explicitly take the lock
	4353	* exclusively and go relookup the page since we
	4354	* will have dropped the object lock and
	4355	* a different thread could have inserted
	4356	* a page at this offset
	4357	* no need for a full retry since we're
	4358	* at the top level of the object chain
	4359	*/
	4360	vm_object_lock(object);
	4361
	4362	continue;
	4363	}
	4364	}
316670eb A	4365	vm_pageout_steal_laundry(m, FALSE);
	4366	}
	4367
39037602	4368	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2d21ac55 A	4369	/*
	4370	* Guard page: let the slow path deal with it
	4371	*/
	4372	break;
	4373	}
d9a64523	4374	if (m->vmp_unusual && (m->vmp_error \|\| m->vmp_restart \|\| m->vmp_private \|\| m->vmp_absent)) {
0a7de745	4375	/*
2d21ac55	4376	* Unusual case... let the slow path deal with it
1c79356b A	4377	*/
	4378	break;
	4379	}
39037602	4380	if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
0a7de745	4381	if (object != cur_object) {
b0d623f7	4382	vm_object_unlock(object);
0a7de745	4383	}
b0d623f7	4384	vm_map_unlock_read(map);
0a7de745 A	4385	if (real_map != map) {
	4386	vm_map_unlock(real_map);
	4387	}
b0d623f7 A	4388	vm_object_unlock(cur_object);
	4389	kr = KERN_MEMORY_ERROR;
	4390	goto done;
	4391	}
39037602	4392	assert(m_object == VM_PAGE_OBJECT(m));
6d2010ae	4393
f427ee49 A	4394	if (vm_fault_cs_need_validation(map->pmap, m, m_object,
f427ee49 A	4395	PAGE_SIZE, 0) \|\|
fe8ab488	4396	(physpage_p != NULL && (prot & VM_PROT_WRITE))) {
cb323159	4397	upgrade_lock_and_retry:
2d21ac55	4398	/*
4a3eedf9	4399	* We might need to validate this page
2d21ac55 A	4400	* against its code signature, so we
	4401	* want to hold the VM object exclusively.
	4402	*/
0a7de745	4403	if (object != cur_object) {
2d21ac55 A	4404	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
	4405	vm_object_unlock(object);
	4406	vm_object_unlock(cur_object);
	4407
0a7de745	4408	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	4409
2d21ac55 A	4410	vm_map_unlock_read(map);
0a7de745	4411	if (real_map != map) {
2d21ac55	4412	vm_map_unlock(real_map);
0a7de745	4413	}
2d21ac55 A	4414
	4415	goto RetryFault;
	4416	}
2d21ac55	4417	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
0a7de745	4418	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	4419
2d21ac55 A	4420	if (vm_object_lock_upgrade(object) == FALSE) {
0a7de745	4421	/*
2d21ac55 A	4422	* couldn't upgrade, so explicitly take the lock
	4423	* exclusively and go relookup the page since we
	4424	* will have dropped the object lock and
	4425	* a different thread could have inserted
	4426	* a page at this offset
	4427	* no need for a full retry since we're
	4428	* at the top level of the object chain
	4429	*/
0a7de745	4430	vm_object_lock(object);
2d21ac55 A	4431
	4432	continue;
	4433	}
	4434	}
	4435	}
1c79356b A	4436	/*
	4437	* Two cases of map in faults:
	4438	* - At top level w/o copy object.
	4439	* - Read fault anywhere.
	4440	* --> must disallow write.
	4441	*/
	4442
4a3eedf9	4443	if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2d21ac55	4444	goto FastPmapEnter;
4a3eedf9	4445	}
1c79356b	4446
cb323159 A	4447	if (!need_copy &&
	4448	!fault_info.no_copy_on_read &&
	4449	cur_object != object &&
	4450	!cur_object->internal &&
	4451	!cur_object->pager_trusted &&
	4452	vm_protect_privileged_from_untrusted &&
	4453	!((prot & VM_PROT_EXECUTE) &&
	4454	cur_object->code_signed &&
f427ee49	4455	pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
cb323159 A	4456	current_proc_is_privileged()) {
	4457	/*
	4458	* We're faulting on a page in "object" and
	4459	* went down the shadow chain to "cur_object"
	4460	* to find out that "cur_object"'s pager
	4461	* is not "trusted", i.e. we can not trust it
	4462	* to always return the same contents.
	4463	* Since the target is a "privileged" process,
	4464	* let's treat this as a copy-on-read fault, as
	4465	* if it was a copy-on-write fault.
	4466	* Once "object" gets a copy of this page, it
	4467	* won't have to rely on "cur_object" to
	4468	* provide the contents again.
	4469	*
	4470	* This is done by setting "need_copy" and
	4471	* retrying the fault from the top with the
	4472	* appropriate locking.
	4473	*
	4474	* Special case: if the mapping is executable
	4475	* and the untrusted object is code-signed and
	4476	* the process is "cs_enforced", we do not
	4477	* copy-on-read because that would break
	4478	* code-signing enforcement expectations (an
	4479	* executable page must belong to a code-signed
	4480	* object) and we can rely on code-signing
	4481	* to re-validate the page if it gets evicted
	4482	* and paged back in.
	4483	*/
	4484	// printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
	4485	vm_copied_on_read++;
	4486	need_copy = TRUE;
	4487
	4488	vm_object_unlock(object);
	4489	vm_object_unlock(cur_object);
	4490	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4491	vm_map_unlock_read(map);
	4492	if (real_map != map) {
	4493	vm_map_unlock(real_map);
	4494	}
	4495	goto RetryFault;
	4496	}
	4497
	4498	if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
f427ee49	4499	if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5ba3f43e A	4500	prot &= ~VM_PROT_WRITE;
	4501	} else {
	4502	/*
	4503	* For a protection that the pmap cares
	4504	* about, we must hand over the full
	4505	* set of protections (so that the pmap
	4506	* layer can apply any desired policy).
	4507	* This means that cs_bypass must be
	4508	* set, as this can force us to pass
	4509	* RWX.
	4510	*/
	4511	assert(fault_info.cs_bypass);
	4512	}
39037602	4513
0a7de745 A	4514	if (object != cur_object) {
0a7de745 A	4515	/*
c910b4d9 A	4516	* We still need to hold the top object
	4517	* lock here to prevent a race between
	4518	* a read fault (taking only "shared"
	4519	* locks) and a write fault (taking
	4520	* an "exclusive" lock on the top
	4521	* object).
	4522	* Otherwise, as soon as we release the
	4523	* top lock, the write fault could
	4524	* proceed and actually complete before
	4525	* the read fault, and the copied page's
	4526	* translation could then be overwritten
	4527	* by the read fault's translation for
	4528	* the original page.
	4529	*
	4530	* Let's just record what the top object
	4531	* is and we'll release it later.
2d21ac55	4532	*/
c910b4d9	4533	top_object = object;
2d21ac55 A	4534
	4535	/*
	4536	* switch to the object that has the new page
	4537	*/
1c79356b	4538	object = cur_object;
2d21ac55	4539	object_lock_type = cur_object_lock_type;
1c79356b	4540	}
1c79356b	4541	FastPmapEnter:
39037602 A	4542	assert(m_object == VM_PAGE_OBJECT(m));
39037602 A	4543
1c79356b	4544	/*
2d21ac55 A	4545	* prepare for the pmap_enter...
	4546	* object and map are both locked
	4547	* m contains valid data
d9a64523	4548	* object == m->vmp_object
2d21ac55 A	4549	* cur_object == NULL or it's been unlocked
2d21ac55 A	4550	* no paging references on either object or cur_object
1c79356b	4551	*/
0a7de745	4552	if (top_object != VM_OBJECT_NULL \|\| object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
39236c6e	4553	need_retry_ptr = &need_retry;
0a7de745	4554	} else {
39236c6e	4555	need_retry_ptr = NULL;
0a7de745	4556	}
39236c6e	4557
f427ee49 A	4558	if (fault_page_size < PAGE_SIZE) {
	4559	DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
	4560	assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
	4561	fault_phys_offset < PAGE_SIZE),
	4562	"0x%llx\n", (uint64_t)fault_phys_offset);
	4563	} else {
	4564	assertf(fault_phys_offset == 0,
	4565	"0x%llx\n", (uint64_t)fault_phys_offset);
	4566	}
	4567
2d21ac55	4568	if (caller_pmap) {
0a7de745 A	4569	kr = vm_fault_enter(m,
	4570	caller_pmap,
	4571	caller_pmap_addr,
f427ee49 A	4572	fault_page_size,
f427ee49 A	4573	fault_phys_offset,
0a7de745 A	4574	prot,
	4575	caller_prot,
	4576	wired,
	4577	change_wiring,
	4578	wire_tag,
	4579	&fault_info,
	4580	need_retry_ptr,
	4581	&type_of_fault);
9bccf70c	4582	} else {
0a7de745 A	4583	kr = vm_fault_enter(m,
	4584	pmap,
	4585	vaddr,
f427ee49 A	4586	fault_page_size,
f427ee49 A	4587	fault_phys_offset,
0a7de745 A	4588	prot,
	4589	caller_prot,
	4590	wired,
	4591	change_wiring,
	4592	wire_tag,
	4593	&fault_info,
	4594	need_retry_ptr,
	4595	&type_of_fault);
9bccf70c	4596	}
1c79356b	4597
f427ee49 A	4598	vm_fault_complete(
	4599	map,
	4600	real_map,
	4601	object,
	4602	m_object,
	4603	m,
	4604	offset,
	4605	trace_real_vaddr,
	4606	&fault_info,
	4607	caller_prot,
	4608	real_vaddr,
	4609	vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
	4610	need_retry,
	4611	kr,
	4612	physpage_p,
	4613	prot,
	4614	top_object,
	4615	need_collapse,
	4616	cur_offset,
	4617	fault_type,
	4618	&written_on_object,
	4619	&written_on_pager,
	4620	&written_on_offset);
	4621	top_object = VM_OBJECT_NULL;
316670eb A	4622	if (need_retry == TRUE) {
	4623	/*
	4624	* vm_fault_enter couldn't complete the PMAP_ENTER...
	4625	* at this point we don't hold any locks so it's safe
	4626	* to ask the pmap layer to expand the page table to
	4627	* accommodate this mapping... once expanded, we'll
	4628	* re-drive the fault which should result in vm_fault_enter
	4629	* being able to successfully enter the mapping this time around
	4630	*/
fe8ab488 A	4631	(void)pmap_enter_options(
	4632	pmap, vaddr, 0, 0, 0, 0, 0,
	4633	PMAP_OPTIONS_NOENTER, NULL);
5ba3f43e	4634
316670eb A	4635	need_retry = FALSE;
	4636	goto RetryFault;
	4637	}
2d21ac55	4638	goto done;
1c79356b	4639	}
1c79356b	4640	/*
2d21ac55	4641	* COPY ON WRITE FAULT
b0d623f7 A	4642	*/
	4643	assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
	4644
0a7de745	4645	/*
2d21ac55 A	4646	* If objects match, then
	4647	* object->copy must not be NULL (else control
	4648	* would be in previous code block), and we
	4649	* have a potential push into the copy object
	4650	* with which we can't cope here.
1c79356b	4651	*/
2d21ac55	4652	if (cur_object == object) {
0a7de745	4653	/*
2d21ac55 A	4654	* must take the slow path to
	4655	* deal with the copy push
	4656	*/
1c79356b	4657	break;
2d21ac55	4658	}
5ba3f43e	4659
1c79356b	4660	/*
2d21ac55 A	4661	* This is now a shadow based copy on write
	4662	* fault -- it requires a copy up the shadow
	4663	* chain.
6d2010ae	4664	*/
39037602	4665	assert(m_object == VM_PAGE_OBJECT(m));
5ba3f43e	4666
6d2010ae	4667	if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
f427ee49 A	4668	vm_fault_cs_need_validation(NULL, m, m_object,
f427ee49 A	4669	PAGE_SIZE, 0)) {
cb323159	4670	goto upgrade_lock_and_retry;
6d2010ae A	4671	}
	4672
	4673	/*
2d21ac55 A	4674	* Allocate a page in the original top level
	4675	* object. Give up if allocate fails. Also
	4676	* need to remember current page, as it's the
	4677	* source of the copy.
1c79356b	4678	*
5ba3f43e	4679	* at this point we hold locks on both
2d21ac55 A	4680	* object and cur_object... no need to take
	4681	* paging refs or mark pages BUSY since
	4682	* we don't drop either object lock until
	4683	* the page has been copied and inserted
1c79356b A	4684	*/
1c79356b A	4685	cur_m = m;
39037602 A	4686	m = vm_page_grab_options(grab_options);
39037602 A	4687	m_object = NULL;
2d21ac55	4688
1c79356b	4689	if (m == VM_PAGE_NULL) {
0a7de745	4690	/*
2d21ac55 A	4691	* no free page currently available...
	4692	* must take the slow path
	4693	*/
1c79356b A	4694	break;
1c79356b A	4695	}
1c79356b	4696	/*
2d21ac55	4697	* Now do the copy. Mark the source page busy...
1c79356b A	4698	*
	4699	* NOTE: This code holds the map lock across
	4700	* the page copy.
	4701	*/
1c79356b	4702	vm_page_copy(cur_m, m);
f427ee49 A	4703	vm_page_insert(m, object, vm_object_trunc_page(offset));
	4704	if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
	4705	DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
	4706	}
39037602	4707	m_object = object;
316670eb	4708	SET_PAGE_DIRTY(m, FALSE);
1c79356b A	4709
1c79356b A	4710	/*
2d21ac55	4711	* Now cope with the source page and object
1c79356b	4712	*/
0a7de745 A	4713	if (object->ref_count > 1 && cur_m->vmp_pmapped) {
0a7de745 A	4714	pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
f427ee49 A	4715	} else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
	4716	/*
	4717	* We've copied the full 16K page but we're
	4718	* about to call vm_fault_enter() only for
	4719	* the 4K chunk we're faulting on. The other
	4720	* three 4K chunks in that page could still
	4721	* be pmapped in this pmap.
	4722	* Since the VM object layer thinks that the
	4723	* entire page has been dealt with and the
	4724	* original page might no longer be needed,
	4725	* it might collapse/bypass the original VM
	4726	* object and free its pages, which would be
	4727	* bad (and would trigger pmap_verify_free()
	4728	* assertions) if the other 4K chunks are still
	4729	* pmapped.
	4730	*/
	4731	/*
	4732	* XXX FBDP TODO4K: to be revisited
	4733	* Technically, we need to pmap_disconnect()
	4734	* only the target pmap's mappings for the 4K
	4735	* chunks of this 16K VM page. If other pmaps
	4736	* have PTEs on these chunks, that means that
	4737	* the associated VM map must have a reference
	4738	* on the VM object, so no need to worry about
	4739	* those.
	4740	* pmap_protect() for each 4K chunk would be
	4741	* better but we'd have to check which chunks
	4742	* are actually mapped before and after this
	4743	* one.
	4744	* A full-blown pmap_disconnect() is easier
	4745	* for now but not efficient.
	4746	*/
	4747	DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
	4748	pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
0a7de745	4749	}
5ba3f43e	4750
d9a64523	4751	if (cur_m->vmp_clustered) {
fe8ab488 A	4752	VM_PAGE_COUNT_AS_PAGEIN(cur_m);
fe8ab488 A	4753	VM_PAGE_CONSUME_CLUSTERED(cur_m);
39037602	4754	vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
fe8ab488	4755	}
2d21ac55	4756	need_collapse = TRUE;
1c79356b	4757
2d21ac55 A	4758	if (!cur_object->internal &&
2d21ac55 A	4759	cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
0a7de745	4760	/*
2d21ac55 A	4761	* The object from which we've just
	4762	* copied a page is most probably backed
	4763	* by a vnode. We don't want to waste too
	4764	* much time trying to collapse the VM objects
	4765	* and create a bottleneck when several tasks
	4766	* map the same file.
	4767	*/
0a7de745 A	4768	if (cur_object->copy == object) {
0a7de745 A	4769	/*
2d21ac55 A	4770	* Shared mapping or no COW yet.
	4771	* We can never collapse a copy
	4772	* object into its backing object.
	4773	*/
0a7de745	4774	need_collapse = FALSE;
2d21ac55	4775	} else if (cur_object->copy == object->shadow &&
0a7de745 A	4776	object->shadow->resident_page_count == 0) {
0a7de745 A	4777	/*
2d21ac55 A	4778	* Shared mapping after a COW occurred.
2d21ac55 A	4779	*/
0a7de745	4780	need_collapse = FALSE;
2d21ac55 A	4781	}
2d21ac55 A	4782	}
1c79356b A	4783	vm_object_unlock(cur_object);
1c79356b A	4784
0a7de745 A	4785	if (need_collapse == FALSE) {
	4786	vm_fault_collapse_skipped++;
	4787	}
2d21ac55 A	4788	vm_fault_collapse_total++;
	4789
	4790	type_of_fault = DBG_COW_FAULT;
c3c9b80d	4791	counter_inc(&vm_statistics_cow_faults);
2d21ac55 A	4792	DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2d21ac55 A	4793	current_task()->cow_faults++;
1c79356b A	4794
1c79356b A	4795	goto FastPmapEnter;
2d21ac55	4796	} else {
1c79356b	4797	/*
2d21ac55	4798	* No page at cur_object, cur_offset... m == NULL
1c79356b	4799	*/
1c79356b	4800	if (cur_object->pager_created) {
f427ee49	4801	vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
39236c6e	4802
0a7de745 A	4803	if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
0a7de745 A	4804	int my_fault_type;
f427ee49 A	4805	uint8_t c_flags = C_DONT_BLOCK;
f427ee49 A	4806	bool insert_cur_object = FALSE;
39236c6e	4807
0a7de745	4808	/*
2d21ac55	4809	* May have to talk to a pager...
39236c6e A	4810	* if so, take the slow path by
	4811	* doing a 'break' from the while (TRUE) loop
	4812	*
	4813	* external_state will only be set to VM_EXTERNAL_STATE_EXISTS
	4814	* if the compressor is active and the page exists there
2d21ac55	4815	*/
0a7de745	4816	if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
39236c6e	4817	break;
0a7de745	4818	}
39236c6e A	4819
	4820	if (map == kernel_map \|\| real_map == kernel_map) {
	4821	/*
	4822	* can't call into the compressor with the kernel_map
	4823	* lock held, since the compressor may try to operate
	4824	* on the kernel map in order to return an empty c_segment
	4825	*/
	4826	break;
	4827	}
	4828	if (object != cur_object) {
0a7de745	4829	if (fault_type & VM_PROT_WRITE) {
39236c6e	4830	c_flags \|= C_KEEP;
0a7de745	4831	} else {
39236c6e	4832	insert_cur_object = TRUE;
0a7de745	4833	}
39236c6e A	4834	}
39236c6e A	4835	if (insert_cur_object == TRUE) {
39236c6e	4836	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
39236c6e A	4837	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4838
	4839	if (vm_object_lock_upgrade(cur_object) == FALSE) {
	4840	/*
	4841	* couldn't upgrade so go do a full retry
	4842	* immediately since we can no longer be
	4843	* certain about cur_object (since we
	4844	* don't hold a reference on it)...
	4845	* first drop the top object lock
	4846	*/
	4847	vm_object_unlock(object);
	4848
	4849	vm_map_unlock_read(map);
0a7de745	4850	if (real_map != map) {
39236c6e	4851	vm_map_unlock(real_map);
0a7de745	4852	}
39236c6e A	4853
	4854	goto RetryFault;
	4855	}
	4856	}
	4857	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
39236c6e A	4858	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
	4859
	4860	if (object != cur_object) {
	4861	/*
	4862	* we can't go for the upgrade on the top
	4863	* lock since the upgrade may block waiting
	4864	* for readers to drain... since we hold
	4865	* cur_object locked at this point, waiting
	4866	* for the readers to drain would represent
	4867	* a lock order inversion since the lock order
	4868	* for objects is the reference order in the
	4869	* shadow chain
	4870	*/
	4871	vm_object_unlock(object);
	4872	vm_object_unlock(cur_object);
	4873
	4874	vm_map_unlock_read(map);
0a7de745	4875	if (real_map != map) {
39236c6e	4876	vm_map_unlock(real_map);
0a7de745	4877	}
39236c6e A	4878
	4879	goto RetryFault;
	4880	}
	4881	if (vm_object_lock_upgrade(object) == FALSE) {
	4882	/*
	4883	* couldn't upgrade, so explicitly take the lock
	4884	* exclusively and go relookup the page since we
	4885	* will have dropped the object lock and
	4886	* a different thread could have inserted
	4887	* a page at this offset
	4888	* no need for a full retry since we're
	4889	* at the top level of the object chain
	4890	*/
	4891	vm_object_lock(object);
5ba3f43e	4892
39236c6e A	4893	continue;
	4894	}
	4895	}
39037602 A	4896	m = vm_page_grab_options(grab_options);
39037602 A	4897	m_object = NULL;
39236c6e A	4898
	4899	if (m == VM_PAGE_NULL) {
	4900	/*
	4901	* no free page currently available...
	4902	* must take the slow path
	4903	*/
	4904	break;
	4905	}
fe8ab488 A	4906
	4907	/*
	4908	* The object is and remains locked
	4909	* so no need to take a
	4910	* "paging_in_progress" reference.
	4911	*/
f427ee49	4912	bool shared_lock;
fe8ab488	4913	if ((object == cur_object &&
0a7de745	4914	object_lock_type == OBJECT_LOCK_EXCLUSIVE) \|\|
fe8ab488	4915	(object != cur_object &&
0a7de745	4916	cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
fe8ab488 A	4917	shared_lock = FALSE;
	4918	} else {
	4919	shared_lock = TRUE;
	4920	}
	4921
	4922	kr = vm_compressor_pager_get(
	4923	cur_object->pager,
f427ee49 A	4924	(vm_object_trunc_page(cur_offset)
f427ee49 A	4925	+ cur_object->paging_offset),
39037602	4926	VM_PAGE_GET_PHYS_PAGE(m),
fe8ab488 A	4927	&my_fault_type,
	4928	c_flags,
	4929	&compressed_count_delta);
	4930
	4931	vm_compressor_pager_count(
	4932	cur_object->pager,
	4933	compressed_count_delta,
	4934	shared_lock,
	4935	cur_object);
	4936
	4937	if (kr != KERN_SUCCESS) {
39037602 A	4938	vm_page_release(m, FALSE);
39037602 A	4939	m = VM_PAGE_NULL;
f427ee49 A	4940	}
	4941	/*
	4942	* If vm_compressor_pager_get() returns
	4943	* KERN_MEMORY_FAILURE, then the
	4944	* compressed data is permanently lost,
	4945	* so return this error immediately.
	4946	*/
	4947	if (kr == KERN_MEMORY_FAILURE) {
	4948	if (object != cur_object) {
	4949	vm_object_unlock(cur_object);
	4950	}
	4951	vm_object_unlock(object);
	4952	vm_map_unlock_read(map);
	4953	if (real_map != map) {
	4954	vm_map_unlock(real_map);
	4955	}
	4956	goto done;
	4957	} else if (kr != KERN_SUCCESS) {
39236c6e A	4958	break;
39236c6e A	4959	}
d9a64523	4960	m->vmp_dirty = TRUE;
39236c6e	4961
fe8ab488 A	4962	/*
	4963	* If the object is purgeable, its
	4964	* owner's purgeable ledgers will be
	4965	* updated in vm_page_insert() but the
	4966	* page was also accounted for in a
	4967	* "compressed purgeable" ledger, so
	4968	* update that now.
	4969	*/
	4970	if (object != cur_object &&
	4971	!insert_cur_object) {
	4972	/*
	4973	* We're not going to insert
	4974	* the decompressed page into
	4975	* the object it came from.
	4976	*
	4977	* We're dealing with a
	4978	* copy-on-write fault on
	4979	* "object".
	4980	* We're going to decompress
	4981	* the page directly into the
	4982	* target "object" while
	4983	* keeping the compressed
	4984	* page for "cur_object", so
	4985	* no ledger update in that
	4986	* case.
	4987	*/
d9a64523	4988	} else if (((cur_object->purgable ==
0a7de745 A	4989	VM_PURGABLE_DENY) &&
	4990	(!cur_object->vo_ledger_tag)) \|\|
	4991	(cur_object->vo_owner ==
	4992	NULL)) {
fe8ab488 A	4993	/*
fe8ab488 A	4994	* "cur_object" is not purgeable
d9a64523 A	4995	* and is not ledger-tagged, or
	4996	* there's no owner for it,
	4997	* so no owner's ledgers to
	4998	* update.
fe8ab488 A	4999	*/
	5000	} else {
	5001	/*
	5002	* One less compressed
d9a64523	5003	* purgeable/tagged page for
fe8ab488 A	5004	* cur_object's owner.
fe8ab488 A	5005	*/
d9a64523	5006	vm_object_owner_compressed_update(
fe8ab488 A	5007	cur_object,
	5008	-1);
	5009	}
	5010
	5011	if (insert_cur_object) {
f427ee49	5012	vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
39037602	5013	m_object = cur_object;
fe8ab488	5014	} else {
f427ee49	5015	vm_page_insert(m, object, vm_object_trunc_page(offset));
39037602	5016	m_object = object;
fe8ab488	5017	}
39236c6e	5018
39037602	5019	if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
0a7de745	5020	/*
39236c6e A	5021	* If the page is not cacheable,
	5022	* we can't let its contents
	5023	* linger in the data cache
	5024	* after the decompression.
	5025	*/
39037602	5026	pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
39236c6e	5027	}
fe8ab488	5028
39236c6e A	5029	type_of_fault = my_fault_type;
39236c6e A	5030
cb323159	5031	VM_STAT_DECOMPRESSIONS();
39236c6e A	5032
	5033	if (cur_object != object) {
	5034	if (insert_cur_object) {
	5035	top_object = object;
	5036	/*
	5037	* switch to the object that has the new page
	5038	*/
	5039	object = cur_object;
	5040	object_lock_type = cur_object_lock_type;
	5041	} else {
	5042	vm_object_unlock(cur_object);
	5043	cur_object = object;
	5044	}
	5045	}
	5046	goto FastPmapEnter;
2d21ac55	5047	}
1c79356b	5048	/*
2d21ac55 A	5049	* existence map present and indicates
2d21ac55 A	5050	* that the pager doesn't have this page
1c79356b	5051	*/
1c79356b	5052	}
cb323159 A	5053	if (cur_object->shadow == VM_OBJECT_NULL \|\|
cb323159 A	5054	resilient_media_retry) {
2d21ac55 A	5055	/*
	5056	* Zero fill fault. Page gets
	5057	* inserted into the original object.
	5058	*/
b0d623f7	5059	if (cur_object->shadow_severed \|\|
39037602 A	5060	VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) \|\|
	5061	cur_object == compressor_object \|\|
	5062	cur_object == kernel_object \|\|
	5063	cur_object == vm_submap_object) {
0a7de745 A	5064	if (object != cur_object) {
	5065	vm_object_unlock(cur_object);
	5066	}
1c79356b	5067	vm_object_unlock(object);
2d21ac55	5068
1c79356b	5069	vm_map_unlock_read(map);
0a7de745	5070	if (real_map != map) {
91447636	5071	vm_map_unlock(real_map);
0a7de745	5072	}
1c79356b	5073
2d21ac55 A	5074	kr = KERN_MEMORY_ERROR;
	5075	goto done;
	5076	}
0a7de745	5077	if (cur_object != object) {
2d21ac55	5078	vm_object_unlock(cur_object);
1c79356b	5079
2d21ac55	5080	cur_object = object;
55e303ae	5081	}
2d21ac55	5082	if (object_lock_type == OBJECT_LOCK_SHARED) {
0a7de745	5083	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	5084
2d21ac55 A	5085	if (vm_object_lock_upgrade(object) == FALSE) {
0a7de745	5086	/*
2d21ac55 A	5087	* couldn't upgrade so do a full retry on the fault
	5088	* since we dropped the object lock which
	5089	* could allow another thread to insert
	5090	* a page at this offset
	5091	*/
0a7de745 A	5092	vm_map_unlock_read(map);
	5093	if (real_map != map) {
	5094	vm_map_unlock(real_map);
	5095	}
2d21ac55 A	5096
	5097	goto RetryFault;
	5098	}
1c79356b	5099	}
cb323159 A	5100	if (!object->internal) {
	5101	panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
	5102	}
f427ee49	5103	m = vm_page_alloc(object, vm_object_trunc_page(offset));
39037602	5104	m_object = NULL;
2d21ac55	5105
1c79356b	5106	if (m == VM_PAGE_NULL) {
0a7de745	5107	/*
2d21ac55 A	5108	* no free page currently available...
	5109	* must take the slow path
	5110	*/
1c79356b A	5111	break;
1c79356b A	5112	}
39037602	5113	m_object = object;
1c79356b	5114
1c79356b	5115	/*
f427ee49 A	5116	* Zeroing the page and entering it into the pmap
f427ee49 A	5117	* represents a significant amount of the zero fill fault handler's work.
1c79356b	5118	*
f427ee49 A	5119	* To improve fault scalability, we'll drop the object lock, if it appears contended,
	5120	* now that we've inserted the page into the vm object.
	5121	* Before dropping the lock, we need to check protection bits and set the
	5122	* mapped bits on the page. Then we can mark the page busy, drop the lock,
	5123	* zero it, and do the pmap enter. We'll need to reacquire the lock
	5124	* to clear the busy bit and wake up any waiters.
1c79356b	5125	*/
f427ee49 A	5126	vm_fault_cs_clear(m);
	5127	m->vmp_pmapped = TRUE;
	5128	if (map->no_zero_fill) {
	5129	type_of_fault = DBG_NZF_PAGE_FAULT;
	5130	} else {
	5131	type_of_fault = DBG_ZERO_FILL_FAULT;
	5132	}
	5133	{
	5134	pmap_t destination_pmap;
	5135	vm_map_offset_t destination_pmap_vaddr;
	5136	vm_prot_t enter_fault_type;
	5137	if (caller_pmap) {
	5138	destination_pmap = caller_pmap;
	5139	destination_pmap_vaddr = caller_pmap_addr;
	5140	} else {
	5141	destination_pmap = pmap;
	5142	destination_pmap_vaddr = vaddr;
	5143	}
	5144	if (change_wiring) {
	5145	enter_fault_type = VM_PROT_NONE;
	5146	} else {
	5147	enter_fault_type = caller_prot;
	5148	}
	5149	kr = vm_fault_enter_prepare(m,
	5150	destination_pmap,
	5151	destination_pmap_vaddr,
	5152	&prot,
	5153	caller_prot,
	5154	fault_page_size,
	5155	fault_phys_offset,
	5156	change_wiring,
	5157	enter_fault_type,
	5158	&fault_info,
	5159	&type_of_fault,
	5160	&page_needs_data_sync);
	5161	if (kr != KERN_SUCCESS) {
	5162	goto zero_fill_cleanup;
	5163	}
143cc14e	5164
f427ee49 A	5165	if (object_is_contended) {
	5166	/*
	5167	* At this point the page is in the vm object, but not on a paging queue.
	5168	* Since it's accessible to another thread but its contents are invalid
	5169	* (it hasn't been zeroed) mark it busy before dropping the object lock.
	5170	*/
	5171	m->vmp_busy = TRUE;
	5172	vm_object_unlock(object);
	5173	}
	5174	if (type_of_fault == DBG_ZERO_FILL_FAULT) {
	5175	/*
	5176	* Now zero fill page...
	5177	* the page is probably going to
	5178	* be written soon, so don't bother
	5179	* to clear the modified bit
	5180	*
	5181	* NOTE: This code holds the map
	5182	* lock across the zero fill.
	5183	*/
	5184	vm_page_zero_fill(m);
c3c9b80d	5185	counter_inc(&vm_statistics_zero_fill_count);
f427ee49 A	5186	DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
	5187	}
	5188	if (page_needs_data_sync) {
	5189	pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
	5190	}
	5191
	5192	if (top_object != VM_OBJECT_NULL) {
	5193	need_retry_ptr = &need_retry;
	5194	} else {
	5195	need_retry_ptr = NULL;
	5196	}
	5197	if (object_is_contended) {
	5198	kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
	5199	fault_page_size, fault_phys_offset,
	5200	m, &prot, caller_prot, enter_fault_type, wired,
	5201	fault_info.pmap_options, need_retry_ptr);
	5202	vm_object_lock(object);
	5203	} else {
	5204	kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
	5205	fault_page_size, fault_phys_offset,
	5206	m, &prot, caller_prot, enter_fault_type, wired,
	5207	fault_info.pmap_options, need_retry_ptr);
	5208	}
	5209	}
	5210	zero_fill_cleanup:
	5211	if (!VM_DYNAMIC_PAGING_ENABLED() &&
	5212	(object->purgable == VM_PURGABLE_DENY \|\|
	5213	object->purgable == VM_PURGABLE_NONVOLATILE \|\|
	5214	object->purgable == VM_PURGABLE_VOLATILE)) {
	5215	vm_page_lockspin_queues();
	5216	if (!VM_DYNAMIC_PAGING_ENABLED()) {
	5217	vm_fault_enqueue_throttled_locked(m);
	5218	}
	5219	vm_page_unlock_queues();
	5220	}
	5221	vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
	5222
	5223	vm_fault_complete(
	5224	map,
	5225	real_map,
	5226	object,
	5227	m_object,
	5228	m,
	5229	offset,
	5230	trace_real_vaddr,
	5231	&fault_info,
	5232	caller_prot,
	5233	real_vaddr,
	5234	type_of_fault,
	5235	need_retry,
	5236	kr,
	5237	physpage_p,
	5238	prot,
	5239	top_object,
	5240	need_collapse,
	5241	cur_offset,
	5242	fault_type,
	5243	&written_on_object,
	5244	&written_on_pager,
	5245	&written_on_offset);
	5246	top_object = VM_OBJECT_NULL;
	5247	if (need_retry == TRUE) {
	5248	/*
	5249	* vm_fault_enter couldn't complete the PMAP_ENTER...
5250	* at this point we don't hold any locks so it's safe
5251	* to ask the pmap layer to expand the page table to
5252	* accommodate this mapping... once expanded, we'll
5253	* re-drive the fault which should result in vm_fault_enter
5254	* being able to successfully enter the mapping this time around
5255	*/
5256	(void)pmap_enter_options(
5257	pmap, vaddr, 0, 0, 0, 0, 0,
5258	PMAP_OPTIONS_NOENTER, NULL);
5259
5260	need_retry = FALSE;
5261	goto RetryFault;
5262	}
5263	goto done;
0a7de745	5264	}
1c79356b	5265	/*
2d21ac55	5266	* On to the next level in the shadow chain
1c79356b	5267	*/
6d2010ae	5268	cur_offset += cur_object->vo_shadow_offset;
1c79356b	5269	new_object = cur_object->shadow;
f427ee49	5270	fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
2d21ac55 A	5271
	5272	/*
	5273	* take the new_object's lock with the indicated state
	5274	*/
0a7de745 A	5275	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
	5276	vm_object_lock_shared(new_object);
	5277	} else {
	5278	vm_object_lock(new_object);
	5279	}
2d21ac55	5280
0a7de745	5281	if (cur_object != object) {
1c79356b	5282	vm_object_unlock(cur_object);
0a7de745	5283	}
2d21ac55	5284
1c79356b A	5285	cur_object = new_object;
	5286
	5287	continue;
	5288	}
	5289	}
1c79356b	5290	/*
2d21ac55 A	5291	* Cleanup from fast fault failure. Drop any object
2d21ac55 A	5292	* lock other than original and drop map lock.
1c79356b	5293	*/
0a7de745	5294	if (object != cur_object) {
1c79356b	5295	vm_object_unlock(cur_object);
0a7de745	5296	}
2d21ac55 A	5297
	5298	/*
	5299	* must own the object lock exclusively at this point
	5300	*/
	5301	if (object_lock_type == OBJECT_LOCK_SHARED) {
0a7de745	5302	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2d21ac55 A	5303
2d21ac55 A	5304	if (vm_object_lock_upgrade(object) == FALSE) {
0a7de745	5305	/*
2d21ac55 A	5306	* couldn't upgrade, so explicitly
	5307	* take the lock exclusively
	5308	* no need to retry the fault at this
	5309	* point since "vm_fault_page" will
	5310	* completely re-evaluate the state
	5311	*/
0a7de745	5312	vm_object_lock(object);
2d21ac55	5313	}
1c79356b	5314	}
143cc14e	5315
2d21ac55 A	5316	handle_copy_delay:
2d21ac55 A	5317	vm_map_unlock_read(map);
0a7de745	5318	if (real_map != map) {
91447636	5319	vm_map_unlock(real_map);
0a7de745	5320	}
1c79356b	5321
813fb2f6	5322	if (__improbable(object == compressor_object \|\|
0a7de745 A	5323	object == kernel_object \|\|
0a7de745 A	5324	object == vm_submap_object)) {
813fb2f6 A	5325	/*
	5326	* These objects are explicitly managed and populated by the
	5327	* kernel. The virtual ranges backed by these objects should
	5328	* either have wired pages or "holes" that are not supposed to
	5329	* be accessed at all until they get explicitly populated.
	5330	* We should never have to resolve a fault on a mapping backed
	5331	* by one of these VM objects and providing a zero-filled page
	5332	* would be wrong here, so let's fail the fault and let the
	5333	* caller crash or recover.
	5334	*/
	5335	vm_object_unlock(object);
	5336	kr = KERN_MEMORY_ERROR;
	5337	goto done;
	5338	}
	5339
39037602 A	5340	assert(object != compressor_object);
	5341	assert(object != kernel_object);
	5342	assert(object != vm_submap_object);
	5343
cb323159 A	5344	if (resilient_media_retry) {
	5345	/*
	5346	* We could get here if we failed to get a free page
	5347	* to zero-fill and had to take the slow path again.
	5348	* Reset our "recovery-from-failed-media" state.
	5349	*/
	5350	assert(resilient_media_object != VM_OBJECT_NULL);
	5351	assert(resilient_media_offset != (vm_object_offset_t)-1);
	5352	/* release our extra reference on failed object */
	5353	// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
	5354	vm_object_deallocate(resilient_media_object);
	5355	resilient_media_object = VM_OBJECT_NULL;
	5356	resilient_media_offset = (vm_object_offset_t)-1;
	5357	resilient_media_retry = FALSE;
	5358	}
	5359
0a7de745	5360	/*
2d21ac55 A	5361	* Make a reference to this object to
	5362	* prevent its disposal while we are messing with
	5363	* it. Once we have the reference, the map is free
	5364	* to be diddled. Since objects reference their
	5365	* shadows (and copies), they will stay around as well.
1c79356b	5366	*/
2d21ac55	5367	vm_object_reference_locked(object);
1c79356b A	5368	vm_object_paging_begin(object);
1c79356b A	5369
cb323159	5370	set_thread_pagein_error(cthread, 0);
2d21ac55	5371	error_code = 0;
55e303ae	5372
39236c6e	5373	result_page = VM_PAGE_NULL;
1c79356b	5374	kr = vm_fault_page(object, offset, fault_type,
0a7de745 A	5375	(change_wiring && !wired),
	5376	FALSE, /* page not looked up */
	5377	&prot, &result_page, &top_page,
	5378	&type_of_fault,
	5379	&error_code, map->no_zero_fill,
	5380	FALSE, &fault_info);
1c79356b A	5381
1c79356b A	5382	/*
2d21ac55 A	5383	* if kr != VM_FAULT_SUCCESS, then the paging reference
	5384	* has been dropped and the object unlocked... the ref_count
	5385	* is still held
	5386	*
	5387	* if kr == VM_FAULT_SUCCESS, then the paging reference
	5388	* is still held along with the ref_count on the original object
	5389	*
b0d623f7	5390	* the object is returned locked with a paging reference
2d21ac55	5391	*
5ba3f43e	5392	* if top_page != NULL, then it's BUSY and the
2d21ac55 A	5393	* object it belongs to has a paging reference
2d21ac55 A	5394	* but is returned unlocked
1c79356b	5395	*/
b0d623f7 A	5396	if (kr != VM_FAULT_SUCCESS &&
b0d623f7 A	5397	kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
cb323159 A	5398	if (kr == VM_FAULT_MEMORY_ERROR &&
	5399	fault_info.resilient_media) {
	5400	assertf(object->internal, "object %p", object);
	5401	/*
	5402	* This fault failed but the mapping was
	5403	* "media resilient", so we'll retry the fault in
	5404	* recovery mode to get a zero-filled page in the
	5405	* top object.
	5406	* Keep the reference on the failing object so
	5407	* that we can check that the mapping is still
	5408	* pointing to it when we retry the fault.
	5409	*/
	5410	// printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
	5411	assert(!resilient_media_retry); /* no double retry */
	5412	assert(resilient_media_object == VM_OBJECT_NULL);
	5413	assert(resilient_media_offset == (vm_object_offset_t)-1);
	5414	resilient_media_retry = TRUE;
	5415	resilient_media_object = object;
	5416	resilient_media_offset = offset;
	5417	// printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset);
	5418	goto RetryFault;
	5419	} else {
	5420	/*
	5421	* we didn't succeed, lose the object reference
	5422	* immediately.
	5423	*/
	5424	vm_object_deallocate(object);
	5425	object = VM_OBJECT_NULL; /* no longer valid */
	5426	}
1c79356b	5427
2d21ac55 A	5428	/*
	5429	* See why we failed, and take corrective action.
	5430	*/
	5431	switch (kr) {
1c79356b	5432	case VM_FAULT_MEMORY_SHORTAGE:
5ba3f43e	5433	if (vm_page_wait((change_wiring) ?
0a7de745 A	5434	THREAD_UNINT :
0a7de745 A	5435	THREAD_ABORTSAFE)) {
1c79356b	5436	goto RetryFault;
0a7de745	5437	}
f427ee49	5438	OS_FALLTHROUGH;
1c79356b A	5439	case VM_FAULT_INTERRUPTED:
	5440	kr = KERN_ABORTED;
	5441	goto done;
	5442	case VM_FAULT_RETRY:
	5443	goto RetryFault;
1c79356b	5444	case VM_FAULT_MEMORY_ERROR:
0a7de745	5445	if (error_code) {
1c79356b	5446	kr = error_code;
0a7de745	5447	} else {
1c79356b	5448	kr = KERN_MEMORY_ERROR;
0a7de745	5449	}
1c79356b	5450	goto done;
b0d623f7 A	5451	default:
b0d623f7 A	5452	panic("vm_fault: unexpected error 0x%x from "
0a7de745	5453	"vm_fault_page()\n", kr);
2d21ac55	5454	}
1c79356b	5455	}
1c79356b	5456	m = result_page;
39037602	5457	m_object = NULL;
1c79356b	5458
2d21ac55	5459	if (m != VM_PAGE_NULL) {
39037602	5460	m_object = VM_PAGE_OBJECT(m);
0b4e3aa0	5461	assert((change_wiring && !wired) ?
0a7de745 A	5462	(top_page == VM_PAGE_NULL) :
0a7de745 A	5463	((top_page == VM_PAGE_NULL) == (m_object == object)));
0b4e3aa0	5464	}
1c79356b A	5465
1c79356b A	5466	/*
2d21ac55 A	5467	* What to do with the resulting page from vm_fault_page
2d21ac55 A	5468	* if it doesn't get entered into the physical map:
1c79356b	5469	*/
0a7de745 A	5470	#define RELEASE_PAGE(m) \
	5471	MACRO_BEGIN \
	5472	PAGE_WAKEUP_DONE(m); \
	5473	if ( !VM_PAGE_PAGEABLE(m)) { \
	5474	vm_page_lockspin_queues(); \
	5475	if ( !VM_PAGE_PAGEABLE(m)) \
	5476	vm_page_activate(m); \
	5477	vm_page_unlock_queues(); \
	5478	} \
1c79356b A	5479	MACRO_END
1c79356b A	5480
5ba3f43e A	5481
5ba3f43e A	5482	object_locks_dropped = FALSE;
1c79356b	5483	/*
2d21ac55	5484	* We must verify that the maps have not changed
5ba3f43e A	5485	* since our last lookup. vm_map_verify() needs the
	5486	* map lock (shared) but we are holding object locks.
	5487	* So we do a try_lock() first and, if that fails, we
	5488	* drop the object locks and go in for the map lock again.
1c79356b	5489	*/
5ba3f43e	5490	if (!vm_map_try_lock_read(original_map)) {
5ba3f43e A	5491	if (m != VM_PAGE_NULL) {
	5492	old_copy_object = m_object->copy;
	5493	vm_object_unlock(m_object);
	5494	} else {
	5495	old_copy_object = VM_OBJECT_NULL;
	5496	vm_object_unlock(object);
	5497	}
	5498
	5499	object_locks_dropped = TRUE;
	5500
	5501	vm_map_lock_read(original_map);
b0d623f7	5502	}
2d21ac55	5503
1c79356b	5504	if ((map != original_map) \|\| !vm_map_verify(map, &version)) {
5ba3f43e A	5505	if (object_locks_dropped == FALSE) {
	5506	if (m != VM_PAGE_NULL) {
	5507	old_copy_object = m_object->copy;
	5508	vm_object_unlock(m_object);
	5509	} else {
	5510	old_copy_object = VM_OBJECT_NULL;
	5511	vm_object_unlock(object);
	5512	}
0a7de745	5513
5ba3f43e A	5514	object_locks_dropped = TRUE;
	5515	}
	5516
	5517	/*
	5518	* no object locks are held at this point
	5519	*/
0a7de745 A	5520	vm_object_t retry_object;
	5521	vm_object_offset_t retry_offset;
	5522	vm_prot_t retry_prot;
1c79356b A	5523
1c79356b A	5524	/*
2d21ac55 A	5525	* To avoid trying to write_lock the map while another
	5526	* thread has it read_locked (in vm_map_pageable), we
	5527	* do not try for write permission. If the page is
	5528	* still writable, we will get write permission. If it
	5529	* is not, or has been marked needs_copy, we enter the
	5530	* mapping without write permission, and will merely
	5531	* take another fault.
1c79356b A	5532	*/
1c79356b A	5533	map = original_map;
2d21ac55	5534
1c79356b	5535	kr = vm_map_lookup_locked(&map, vaddr,
0a7de745 A	5536	fault_type & ~VM_PROT_WRITE,
	5537	OBJECT_LOCK_EXCLUSIVE, &version,
	5538	&retry_object, &retry_offset, &retry_prot,
	5539	&wired,
	5540	&fault_info,
f427ee49 A	5541	&real_map,
f427ee49 A	5542	NULL);
91447636	5543	pmap = real_map->pmap;
1c79356b A	5544
	5545	if (kr != KERN_SUCCESS) {
	5546	vm_map_unlock_read(map);
2d21ac55 A	5547
2d21ac55 A	5548	if (m != VM_PAGE_NULL) {
39037602 A	5549	assert(VM_PAGE_OBJECT(m) == m_object);
39037602 A	5550
0a7de745	5551	/*
2d21ac55 A	5552	* retake the lock so that
	5553	* we can drop the paging reference
	5554	* in vm_fault_cleanup and do the
	5555	* PAGE_WAKEUP_DONE in RELEASE_PAGE
	5556	*/
39037602	5557	vm_object_lock(m_object);
2d21ac55	5558
0b4e3aa0	5559	RELEASE_PAGE(m);
2d21ac55	5560
39037602	5561	vm_fault_cleanup(m_object, top_page);
0b4e3aa0	5562	} else {
0a7de745	5563	/*
2d21ac55 A	5564	* retake the lock so that
	5565	* we can drop the paging reference
	5566	* in vm_fault_cleanup
	5567	*/
0a7de745	5568	vm_object_lock(object);
2d21ac55	5569
0a7de745	5570	vm_fault_cleanup(object, top_page);
0b4e3aa0	5571	}
2d21ac55 A	5572	vm_object_deallocate(object);
2d21ac55 A	5573
1c79356b A	5574	goto done;
1c79356b A	5575	}
1c79356b	5576	vm_object_unlock(retry_object);
1c79356b	5577
2d21ac55	5578	if ((retry_object != object) \|\| (retry_offset != offset)) {
1c79356b	5579	vm_map_unlock_read(map);
0a7de745	5580	if (real_map != map) {
91447636	5581	vm_map_unlock(real_map);
0a7de745	5582	}
2d21ac55 A	5583
2d21ac55 A	5584	if (m != VM_PAGE_NULL) {
39037602 A	5585	assert(VM_PAGE_OBJECT(m) == m_object);
39037602 A	5586
0a7de745	5587	/*
2d21ac55 A	5588	* retake the lock so that
	5589	* we can drop the paging reference
	5590	* in vm_fault_cleanup and do the
	5591	* PAGE_WAKEUP_DONE in RELEASE_PAGE
	5592	*/
0a7de745	5593	vm_object_lock(m_object);
2d21ac55	5594
0b4e3aa0	5595	RELEASE_PAGE(m);
2d21ac55	5596
39037602	5597	vm_fault_cleanup(m_object, top_page);
0b4e3aa0	5598	} else {
0a7de745	5599	/*
2d21ac55 A	5600	* retake the lock so that
	5601	* we can drop the paging reference
	5602	* in vm_fault_cleanup
	5603	*/
0a7de745	5604	vm_object_lock(object);
2d21ac55	5605
0a7de745	5606	vm_fault_cleanup(object, top_page);
0b4e3aa0	5607	}
2d21ac55 A	5608	vm_object_deallocate(object);
2d21ac55 A	5609
1c79356b A	5610	goto RetryFault;
1c79356b A	5611	}
1c79356b	5612	/*
2d21ac55 A	5613	* Check whether the protection has changed or the object
2d21ac55 A	5614	* has been copied while we left the map unlocked.
1c79356b	5615	*/
f427ee49	5616	if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5ba3f43e A	5617	/* If the pmap layer cares, pass the full set. */
	5618	prot = retry_prot;
	5619	} else {
	5620	prot &= retry_prot;
	5621	}
0b4e3aa0	5622	}
1c79356b	5623
5ba3f43e A	5624	if (object_locks_dropped == TRUE) {
	5625	if (m != VM_PAGE_NULL) {
	5626	vm_object_lock(m_object);
	5627
	5628	if (m_object->copy != old_copy_object) {
	5629	/*
	5630	* The copy object changed while the top-level object
	5631	* was unlocked, so take away write permission.
	5632	*/
f427ee49	5633	assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5ba3f43e A	5634	prot &= ~VM_PROT_WRITE;
5ba3f43e A	5635	}
0a7de745	5636	} else {
5ba3f43e	5637	vm_object_lock(object);
0a7de745	5638	}
5ba3f43e A	5639
	5640	object_locks_dropped = FALSE;
	5641	}
1c79356b	5642
cb323159 A	5643	if (!need_copy &&
	5644	!fault_info.no_copy_on_read &&
	5645	m != VM_PAGE_NULL &&
	5646	VM_PAGE_OBJECT(m) != object &&
	5647	!VM_PAGE_OBJECT(m)->pager_trusted &&
	5648	vm_protect_privileged_from_untrusted &&
	5649	!((prot & VM_PROT_EXECUTE) &&
	5650	VM_PAGE_OBJECT(m)->code_signed &&
f427ee49	5651	pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
cb323159 A	5652	current_proc_is_privileged()) {
	5653	/*
	5654	* We found the page we want in an "untrusted" VM object
	5655	* down the shadow chain. Since the target is "privileged"
	5656	* we want to perform a copy-on-read of that page, so that the
	5657	* mapped object gets a stable copy and does not have to
	5658	* rely on the "untrusted" object to provide the same
	5659	* contents if the page gets reclaimed and has to be paged
	5660	* in again later on.
	5661	*
	5662	* Special case: if the mapping is executable and the untrusted
	5663	* object is code-signed and the process is "cs_enforced", we
	5664	* do not copy-on-read because that would break code-signing
	5665	* enforcement expectations (an executable page must belong
	5666	* to a code-signed object) and we can rely on code-signing
	5667	* to re-validate the page if it gets evicted and paged back in.
	5668	*/
	5669	// printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
	5670	vm_copied_on_read++;
	5671	need_copy_on_read = TRUE;
	5672	need_copy = TRUE;
	5673	} else {
	5674	need_copy_on_read = FALSE;
	5675	}
	5676
1c79356b	5677	/*
2d21ac55 A	5678	* If we want to wire down this page, but no longer have
2d21ac55 A	5679	* adequate permissions, we must start all over.
cb323159	5680	* If we decided to copy-on-read, we must also start all over.
1c79356b	5681	*/
cb323159 A	5682	if ((wired && (fault_type != (prot \| VM_PROT_WRITE))) \|\|
cb323159 A	5683	need_copy_on_read) {
5ba3f43e	5684	vm_map_unlock_read(map);
0a7de745	5685	if (real_map != map) {
91447636	5686	vm_map_unlock(real_map);
0a7de745	5687	}
1c79356b	5688
2d21ac55	5689	if (m != VM_PAGE_NULL) {
39037602 A	5690	assert(VM_PAGE_OBJECT(m) == m_object);
39037602 A	5691
2d21ac55	5692	RELEASE_PAGE(m);
91447636	5693
39037602	5694	vm_fault_cleanup(m_object, top_page);
0a7de745 A	5695	} else {
	5696	vm_fault_cleanup(object, top_page);
	5697	}
0b4e3aa0	5698
2d21ac55	5699	vm_object_deallocate(object);
55e303ae	5700
2d21ac55 A	5701	goto RetryFault;
	5702	}
	5703	if (m != VM_PAGE_NULL) {
55e303ae	5704	/*
2d21ac55 A	5705	* Put this page into the physical map.
	5706	* We had to do the unlock above because pmap_enter
	5707	* may cause other faults. The page may be on
	5708	* the pageout queues. If the pageout daemon comes
	5709	* across the page, it will remove it from the queues.
55e303ae	5710	*/
f427ee49 A	5711	if (fault_page_size < PAGE_SIZE) {
	5712	DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
	5713	assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
	5714	fault_phys_offset < PAGE_SIZE),
	5715	"0x%llx\n", (uint64_t)fault_phys_offset);
	5716	} else {
	5717	assertf(fault_phys_offset == 0,
	5718	"0x%llx\n", (uint64_t)fault_phys_offset);
	5719	}
2d21ac55 A	5720	if (caller_pmap) {
2d21ac55 A	5721	kr = vm_fault_enter(m,
0a7de745 A	5722	caller_pmap,
0a7de745 A	5723	caller_pmap_addr,
f427ee49 A	5724	fault_page_size,
f427ee49 A	5725	fault_phys_offset,
0a7de745 A	5726	prot,
	5727	caller_prot,
	5728	wired,
	5729	change_wiring,
	5730	wire_tag,
	5731	&fault_info,
	5732	NULL,
	5733	&type_of_fault);
2d21ac55 A	5734	} else {
2d21ac55 A	5735	kr = vm_fault_enter(m,
0a7de745 A	5736	pmap,
0a7de745 A	5737	vaddr,
f427ee49 A	5738	fault_page_size,
f427ee49 A	5739	fault_phys_offset,
0a7de745 A	5740	prot,
	5741	caller_prot,
	5742	wired,
	5743	change_wiring,
	5744	wire_tag,
	5745	&fault_info,
	5746	NULL,
	5747	&type_of_fault);
2d21ac55	5748	}
39037602 A	5749	assert(VM_PAGE_OBJECT(m) == m_object);
39037602 A	5750
0a7de745 A	5751	{
0a7de745 A	5752	int event_code = 0;
39037602	5753
0a7de745 A	5754	if (m_object->internal) {
	5755	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
	5756	} else if (m_object->object_is_shared_cache) {
	5757	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
	5758	} else {
	5759	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
	5760	}
39037602	5761
f427ee49	5762	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) \| (caller_prot << 8) \| vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0);
94ff46dc	5763	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0);
39037602	5764
0a7de745 A	5765	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
0a7de745 A	5766	}
2d21ac55 A	5767	if (kr != KERN_SUCCESS) {
2d21ac55 A	5768	/* abort this page fault */
5ba3f43e	5769	vm_map_unlock_read(map);
0a7de745	5770	if (real_map != map) {
2d21ac55	5771	vm_map_unlock(real_map);
0a7de745	5772	}
2d21ac55	5773	PAGE_WAKEUP_DONE(m);
39037602	5774	vm_fault_cleanup(m_object, top_page);
2d21ac55 A	5775	vm_object_deallocate(object);
2d21ac55 A	5776	goto done;
0b4e3aa0	5777	}
fe8ab488 A	5778	if (physpage_p != NULL) {
fe8ab488 A	5779	/* for vm_map_wire_and_extract() */
39037602	5780	*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
fe8ab488	5781	if (prot & VM_PROT_WRITE) {
39037602	5782	vm_object_lock_assert_exclusive(m_object);
d9a64523	5783	m->vmp_dirty = TRUE;
fe8ab488 A	5784	}
fe8ab488 A	5785	}
0b4e3aa0	5786	} else {
0a7de745 A	5787	vm_map_entry_t entry;
	5788	vm_map_offset_t laddr;
	5789	vm_map_offset_t ldelta, hdelta;
143cc14e	5790
5ba3f43e	5791	/*
0b4e3aa0	5792	* do a pmap block mapping from the physical address
5ba3f43e	5793	* in the object
0b4e3aa0	5794	*/
9bccf70c	5795
0a7de745	5796	if (real_map != map) {
91447636	5797	vm_map_unlock(real_map);
0a7de745	5798	}
2d21ac55	5799
9bccf70c A	5800	if (original_map != map) {
	5801	vm_map_unlock_read(map);
	5802	vm_map_lock_read(original_map);
	5803	map = original_map;
	5804	}
91447636	5805	real_map = map;
9bccf70c A	5806
	5807	laddr = vaddr;
	5808	hdelta = 0xFFFFF000;
	5809	ldelta = 0xFFFFF000;
	5810
2d21ac55	5811	while (vm_map_lookup_entry(map, laddr, &entry)) {
0a7de745	5812	if (ldelta > (laddr - entry->vme_start)) {
9bccf70c	5813	ldelta = laddr - entry->vme_start;
0a7de745 A	5814	}
0a7de745 A	5815	if (hdelta > (entry->vme_end - laddr)) {
9bccf70c	5816	hdelta = entry->vme_end - laddr;
0a7de745	5817	}
2d21ac55	5818	if (entry->is_sub_map) {
5ba3f43e	5819	laddr = ((laddr - entry->vme_start)
0a7de745	5820	+ VME_OFFSET(entry));
3e170ce0	5821	vm_map_lock_read(VME_SUBMAP(entry));
2d21ac55	5822
0a7de745	5823	if (map != real_map) {
9bccf70c	5824	vm_map_unlock_read(map);
0a7de745	5825	}
2d21ac55	5826	if (entry->use_pmap) {
91447636	5827	vm_map_unlock_read(real_map);
3e170ce0	5828	real_map = VME_SUBMAP(entry);
9bccf70c	5829	}
3e170ce0	5830	map = VME_SUBMAP(entry);
9bccf70c A	5831	} else {
	5832	break;
	5833	}
	5834	}
	5835
5ba3f43e	5836	if (vm_map_lookup_entry(map, laddr, &entry) &&
3e170ce0 A	5837	(VME_OBJECT(entry) != NULL) &&
3e170ce0 A	5838	(VME_OBJECT(entry) == object)) {
f427ee49	5839	uint16_t superpage;
2d21ac55	5840
3e170ce0	5841	if (!object->pager_created &&
4bd07ac2 A	5842	object->phys_contiguous &&
	5843	VME_OFFSET(entry) == 0 &&
	5844	(entry->vme_end - entry->vme_start == object->vo_size) &&
0a7de745	5845	VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
3e170ce0 A	5846	superpage = VM_MEM_SUPERPAGE;
	5847	} else {
	5848	superpage = 0;
	5849	}
fe8ab488 A	5850
	5851	if (superpage && physpage_p) {
	5852	/* for vm_map_wire_and_extract() */
3e170ce0	5853	*physpage_p = (ppnum_t)
0a7de745 A	5854	((((vm_map_offset_t)
	5855	object->vo_shadow_offset)
	5856	+ VME_OFFSET(entry)
	5857	+ (laddr - entry->vme_start))
	5858	>> PAGE_SHIFT);
fe8ab488 A	5859	}
fe8ab488 A	5860
2d21ac55 A	5861	if (caller_pmap) {
	5862	/*
	5863	* Set up a block mapped area
	5864	*/
fe8ab488	5865	assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5ba3f43e	5866	kr = pmap_map_block(caller_pmap,
0a7de745 A	5867	(addr64_t)(caller_pmap_addr - ldelta),
	5868	(ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
	5869	VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
	5870	(uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
	5871	(VM_WIMG_MASK & (int)object->wimg_bits) \| superpage, 0);
5ba3f43e A	5872
	5873	if (kr != KERN_SUCCESS) {
	5874	goto cleanup;
	5875	}
	5876	} else {
2d21ac55 A	5877	/*
	5878	* Set up a block mapped area
	5879	*/
fe8ab488	5880	assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5ba3f43e	5881	kr = pmap_map_block(real_map->pmap,
0a7de745 A	5882	(addr64_t)(vaddr - ldelta),
	5883	(ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
	5884	VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
	5885	(uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
	5886	(VM_WIMG_MASK & (int)object->wimg_bits) \| superpage, 0);
5ba3f43e A	5887
	5888	if (kr != KERN_SUCCESS) {
	5889	goto cleanup;
	5890	}
9bccf70c A	5891	}
9bccf70c A	5892	}
0b4e3aa0	5893	}
1c79356b	5894
5ba3f43e A	5895	/*
	5896	* Success
	5897	*/
	5898	kr = KERN_SUCCESS;
	5899
	5900	/*
	5901	* TODO: could most of the done cases just use cleanup?
	5902	*/
	5903	cleanup:
1c79356b	5904	/*
2d21ac55	5905	* Unlock everything, and return
1c79356b	5906	*/
5ba3f43e	5907	vm_map_unlock_read(map);
0a7de745	5908	if (real_map != map) {
91447636	5909	vm_map_unlock(real_map);
0a7de745	5910	}
2d21ac55 A	5911
2d21ac55 A	5912	if (m != VM_PAGE_NULL) {
39037602 A	5913	assert(VM_PAGE_OBJECT(m) == m_object);
39037602 A	5914
d9a64523	5915	if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
d9a64523 A	5916	vm_object_paging_begin(m_object);
	5917
	5918	assert(written_on_object == VM_OBJECT_NULL);
	5919	written_on_object = m_object;
	5920	written_on_pager = m_object->pager;
	5921	written_on_offset = m_object->paging_offset + m->vmp_offset;
	5922	}
0b4e3aa0	5923	PAGE_WAKEUP_DONE(m);
1c79356b	5924
39037602	5925	vm_fault_cleanup(m_object, top_page);
0a7de745 A	5926	} else {
	5927	vm_fault_cleanup(object, top_page);
	5928	}
1c79356b	5929
2d21ac55 A	5930	vm_object_deallocate(object);
2d21ac55 A	5931
0a7de745	5932	#undef RELEASE_PAGE
91447636	5933
2d21ac55	5934	done:
9bccf70c	5935	thread_interrupt_level(interruptible_state);
1c79356b	5936
cb323159 A	5937	if (resilient_media_object != VM_OBJECT_NULL) {
	5938	assert(resilient_media_retry);
	5939	assert(resilient_media_offset != (vm_object_offset_t)-1);
	5940	/* release extra reference on failed object */
	5941	// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
	5942	vm_object_deallocate(resilient_media_object);
	5943	resilient_media_object = VM_OBJECT_NULL;
	5944	resilient_media_offset = (vm_object_offset_t)-1;
	5945	resilient_media_retry = FALSE;
	5946	}
	5947	assert(!resilient_media_retry);
	5948
39236c6e	5949	/*
04b8595b	5950	* Only I/O throttle on faults which cause a pagein/swapin.
39236c6e A	5951	*/
	5952	if ((type_of_fault == DBG_PAGEIND_FAULT) \|\| (type_of_fault == DBG_PAGEINV_FAULT) \|\| (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
	5953	throttle_lowpri_io(1);
04b8595b A	5954	} else {
04b8595b A	5955	if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
04b8595b	5956	if ((throttle_delay = vm_page_throttled(TRUE))) {
04b8595b	5957	if (vm_debug_events) {
0a7de745	5958	if (type_of_fault == DBG_COMPRESSOR_FAULT) {
04b8595b	5959	VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
0a7de745	5960	} else if (type_of_fault == DBG_COW_FAULT) {
04b8595b	5961	VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
0a7de745	5962	} else {
04b8595b	5963	VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
0a7de745	5964	}
04b8595b A	5965	}
	5966	delay(throttle_delay);
	5967	}
	5968	}
	5969	}
d9a64523 A	5970
d9a64523 A	5971	if (written_on_object) {
d9a64523 A	5972	vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
	5973
	5974	vm_object_lock(written_on_object);
	5975	vm_object_paging_end(written_on_object);
	5976	vm_object_unlock(written_on_object);
	5977
	5978	written_on_object = VM_OBJECT_NULL;
	5979	}
	5980
	5981	if (rtfault) {
	5982	vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
	5983	}
	5984
5ba3f43e	5985	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
0a7de745 A	5986	(MACHDBG_CODE(DBG_MACH_VM, 2)) \| DBG_FUNC_END,
	5987	((uint64_t)trace_vaddr >> 32),
	5988	trace_vaddr,
	5989	kr,
f427ee49	5990	vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
0a7de745 A	5991	0);
0a7de745 A	5992
f427ee49 A	5993	if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
	5994	DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
	5995	}
	5996
0a7de745	5997	return kr;
1c79356b A	5998	}
	5999
	6000	/*
	6001	* vm_fault_wire:
	6002	*
	6003	* Wire down a range of virtual addresses in a map.
		*
		* The range is the one described by "entry", i.e.
		* [entry->vme_start, entry->vme_end). The entry must be marked
		* "in_transition" (asserted below).
		*
		* Parameters:
		*	map		map handed to the fault routines; its page
		*			size also caps the stepping granularity
		*	entry		map entry whose address range gets wired
		*	prot		protection passed to the fault routines
		*	wire_tag	VM tag passed to the fault routines
		*	pmap		physical map to enter the wired pages into
		*	pmap_addr	address in "pmap" that corresponds to
		*			entry->vme_start
		*	physpage_p	passed through unchanged to
		*			vm_fault_wire_fast() and vm_fault_internal()
		*
		* Returns:
		*	KERN_SUCCESS if every page in the range was wired, or if
		*	the entry is backed by a "phys_contiguous" object (in which
		*	case nothing is done at all).
		*	Otherwise, the error returned by vm_fault_internal() for the
		*	first page that could not be wired; the pages wired before
		*	that one are unwired again before returning.
	6004	*/
	6005	kern_return_t
	6006	vm_fault_wire(
0a7de745 A	6007	vm_map_t map,
0a7de745 A	6008	vm_map_entry_t entry,
3e170ce0	6009	vm_prot_t prot,
5ba3f43e	6010	vm_tag_t wire_tag,
0a7de745 A	6011	pmap_t pmap,
	6012	vm_map_offset_t pmap_addr,
	6013	ppnum_t *physpage_p)
1c79356b	6014	{
0a7de745 A	6015	vm_map_offset_t va;
	6016	vm_map_offset_t end_addr = entry->vme_end;
	6017	kern_return_t rc;
f427ee49	6018	vm_map_size_t effective_page_size;
1c79356b A	6019
	6020	assert(entry->in_transition);
	6021
		/*
		* Nothing to do for a non-submap entry backed by a
		* "phys_contiguous" object; vm_fault_unwire() has the matching
		* early return.
		*/
5ba3f43e A	6022	if ((VME_OBJECT(entry) != NULL) &&
5ba3f43e A	6023	!entry->is_sub_map &&
3e170ce0	6024	VME_OBJECT(entry)->phys_contiguous) {
9bccf70c A	6025	return KERN_SUCCESS;
	6026	}
	6027
1c79356b A	6028	/*
	6029	* Inform the physical mapping system that the
	6030	* range of addresses may not fault, so that
	6031	* page tables and such can be locked down as well.
	6032	*/
	6033
5ba3f43e	6034	pmap_pageable(pmap, pmap_addr,
0a7de745	6035	pmap_addr + (end_addr - entry->vme_start), FALSE);
1c79356b A	6036
	6037	/*
	6038	* We simulate a fault to get the page and enter it
	6039	* in the physical map.
	6040	*/
	6041
		/*
		* Step by the smaller of the map's page size and the kernel's
		* PAGE_SIZE so that every page of the range is visited.
		*/
f427ee49 A	6042	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	6043	for (va = entry->vme_start;
	6044	va < end_addr;
	6045	va += effective_page_size) {
		/* try the stripped-down fast path first... */
5ba3f43e	6046	rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
0a7de745 A	6047	pmap_addr + (va - entry->vme_start),
0a7de745 A	6048	physpage_p);
fe8ab488	6049	if (rc != KERN_SUCCESS) {
		/*
		* ... and fall back to the full fault handler if the fast
		* path could not handle this page. The wait is
		* uninterruptible only when wiring into the kernel pmap.
		*/
5ba3f43e	6050	rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
0a7de745 A	6051	((pmap == kernel_pmap)
	6052	? THREAD_UNINT
	6053	: THREAD_ABORTSAFE),
	6054	pmap,
	6055	(pmap_addr +
	6056	(va - entry->vme_start)),
	6057	physpage_p);
2d21ac55	6058	DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
1c79356b A	6059	}
	6060
	6061	if (rc != KERN_SUCCESS) {
		/*
		* Roll back: work on a private copy of the entry that ends at
		* the failing address, so that only [vme_start, va) is unwired
		* and the caller's entry is left untouched.
		*/
0a7de745	6062	struct vm_map_entry tmp_entry = *entry;
1c79356b A	6063
	6064	/* unwire wired pages */
	6065	tmp_entry.vme_end = va;
5ba3f43e	6066	vm_fault_unwire(map,
0a7de745	6067	&tmp_entry, FALSE, pmap, pmap_addr);
1c79356b A	6068
	6069	return rc;
	6070	}
	6071	}
	6072	return KERN_SUCCESS;
	6073	}
	6074
	6075	/*
	6076	* vm_fault_unwire:
	6077	*
	6078	* Unwire a range of virtual addresses in a map.
		*
		* The range is the one described by "entry", i.e.
		* [entry->vme_start, entry->vme_end).
		*
		* Parameters:
		*	map		map the entry belongs to
		*	entry		map entry whose address range gets unwired
		*	deallocate	if TRUE, each page found is disconnected from
		*			all pmaps and freed instead of being unwired
		*	pmap		physical map whose wiring is updated (may be
		*			NULL, in which case pmap_change_wiring() is
		*			skipped)
		*	pmap_addr	address in "pmap" that corresponds to
		*			entry->vme_start
		*
		* No value is returned: an unexpected vm_fault_page() failure
		* panics, while the two tolerated VM_FAULT_MEMORY_ERROR cases
		* (object no longer alive, or kernel_object) just skip the page.
	6079	*/
	6080	void
	6081	vm_fault_unwire(
0a7de745 A	6082	vm_map_t map,
	6083	vm_map_entry_t entry,
	6084	boolean_t deallocate,
	6085	pmap_t pmap,
	6086	vm_map_offset_t pmap_addr)
1c79356b	6087	{
0a7de745 A	6088	vm_map_offset_t va;
	6089	vm_map_offset_t end_addr = entry->vme_end;
	6090	vm_object_t object;
d9a64523	6091	struct vm_object_fault_info fault_info = {};
5ba3f43e	6092	unsigned int unwired_pages;
f427ee49	6093	vm_map_size_t effective_page_size;
1c79356b	6094
		/* a submap entry has no VM object of its own */
3e170ce0	6095	object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
1c79356b	6096
2d21ac55 A	6097	/*
	6098	* If it's marked phys_contiguous, then vm_fault_wire() didn't actually
	6099	* do anything since such memory is wired by default. So we don't have
	6100	* anything to undo here.
	6101	*/
	6102
0a7de745	6103	if (object != VM_OBJECT_NULL && object->phys_contiguous) {
2d21ac55	6104	return;
0a7de745	6105	}
2d21ac55 A	6106
		/*
		* Fault parameters used for every vm_fault_page() call below,
		* derived from the map entry.
		*/
	6107	fault_info.interruptible = THREAD_UNINT;
	6108	fault_info.behavior = entry->behavior;
3e170ce0	6109	fault_info.user_tag = VME_ALIAS(entry);
fe8ab488 A	6110	if (entry->iokit_acct \|\|
	6111	(!entry->is_sub_map && !entry->use_pmap)) {
	6112	fault_info.pmap_options \|= PMAP_OPTIONS_ALT_ACCT;
	6113	}
3e170ce0 A	6114	fault_info.lo_offset = VME_OFFSET(entry);
3e170ce0 A	6115	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
2d21ac55	6116	fault_info.no_cache = entry->no_cache;
b0d623f7	6117	fault_info.stealth = TRUE;
2d21ac55	6118
		/* number of pages found wired; used for tag accounting at the end */
5ba3f43e A	6119	unwired_pages = 0;
5ba3f43e A	6120
1c79356b A	6121	/*
	6122	* Since the pages are wired down, we must be able to
	6123	* get their mappings from the physical map system.
	6124	*/
	6125
f427ee49 A	6126	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	6127	for (va = entry->vme_start;
	6128	va < end_addr;
	6129	va += effective_page_size) {
1c79356b	6130	if (object == VM_OBJECT_NULL) {
		/*
		* No object to look the page up in (submap entry, or entry
		* without an object): clear the wiring in the pmap and go
		* through vm_fault() with VM_PROT_NONE.
		*/
593a1d5f	6131	if (pmap) {
5ba3f43e	6132	pmap_change_wiring(pmap,
0a7de745	6133	pmap_addr + (va - entry->vme_start), FALSE);
593a1d5f	6134	}
		/*
		* NOTE(review): unlike pmap_change_wiring() above, this passes
		* "pmap_addr" without the (va - entry->vme_start) offset --
		* confirm that this is intentional.
		*/
5ba3f43e	6135	(void) vm_fault(map, va, VM_PROT_NONE,
0a7de745	6136	TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
1c79356b	6137	} else {
0a7de745 A	6138	vm_prot_t prot;
	6139	vm_page_t result_page;
	6140	vm_page_t top_page;
	6141	vm_object_t result_object;
1c79356b A	6142	vm_fault_return_t result;
1c79356b A	6143
d9a64523 A	6144	/* cap cluster size at maximum UPL size */
	6145	upl_size_t cluster_size;
	6146	if (os_sub_overflow(end_addr, va, &cluster_size)) {
		/* remaining range does not fit in a upl_size_t: clamp it */
	6147	cluster_size = 0 - (upl_size_t)PAGE_SIZE;
b0d623f7	6148	}
d9a64523	6149	fault_info.cluster_size = cluster_size;
2d21ac55	6150
		/*
		* Look the page up in the object, retaking the object lock
		* and the paging reference on every attempt for as long as
		* vm_fault_page() asks for a retry.
		*/
1c79356b A	6151	do {
	6152	prot = VM_PROT_NONE;
	6153
	6154	vm_object_lock(object);
	6155	vm_object_paging_begin(object);
39236c6e	6156	result_page = VM_PAGE_NULL;
0a7de745	6157	result = vm_fault_page(
2d21ac55	6158	object,
3e170ce0	6159	(VME_OFFSET(entry) +
0a7de745	6160	(va - entry->vme_start)),
2d21ac55	6161	VM_PROT_NONE, TRUE,
39236c6e	6162	FALSE, /* page not looked up */
2d21ac55 A	6163	&prot, &result_page, &top_page,
2d21ac55 A	6164	(int *)0,
5ba3f43e	6165	NULL, map->no_zero_fill,
2d21ac55	6166	FALSE, &fault_info);
1c79356b A	6167	} while (result == VM_FAULT_RETRY);
1c79356b A	6168
2d21ac55 A	6169	/*
	6170	* If this was a mapping to a file on a device that has been forcibly
	6171	* unmounted, then we won't get a page back from vm_fault_page(). Just
	6172	* move on to the next one in case the remaining pages are mapped from
	6173	* different objects. During a forced unmount, the object is terminated
	6174	* so the alive flag will be false if this happens. A forced unmount will
5ba3f43e	6175	* occur when an external disk is unplugged before the user does an
2d21ac55 A	6176	* eject, so we don't want to panic in that situation.
	6177	*/
	6178
0a7de745	6179	if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
2d21ac55	6180	continue;
0a7de745	6181	}
2d21ac55	6182
39236c6e A	6183	if (result == VM_FAULT_MEMORY_ERROR &&
	6184	object == kernel_object) {
	6185	/*
	6186	* This must have been allocated with
	6187	* KMA_KOBJECT and KMA_VAONLY and there's
	6188	* no physical page at this offset.
	6189	* We're done (no page to free).
	6190	*/
	6191	assert(deallocate);
	6192	continue;
	6193	}
	6194
		/* any other failure is unexpected for a wired page */
0a7de745	6195	if (result != VM_FAULT_SUCCESS) {
1c79356b	6196	panic("vm_fault_unwire: failure");
0a7de745	6197	}
1c79356b	6198
39037602	6199	result_object = VM_PAGE_OBJECT(result_page);
2d21ac55	6200
1c79356b	6201	if (deallocate) {
		/*
		* Remove the page from all pmaps and free it, counting it if
		* it was still wired.
		*/
39037602	6202	assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
0a7de745	6203	vm_page_fictitious_addr);
39037602	6204	pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5ba3f43e A	6205	if (VM_PAGE_WIRED(result_page)) {
	6206	unwired_pages++;
	6207	}
1c79356b A	6208	VM_PAGE_FREE(result_page);
1c79356b A	6209	} else {
		/* guard pages are skipped: no pmap wiring is changed for them */
0a7de745	6210	if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
5ba3f43e	6211	pmap_change_wiring(pmap,
6d2010ae	6212	pmap_addr + (va - entry->vme_start), FALSE);
0a7de745	6213	}
6d2010ae A	6214
6d2010ae A	6215
		/* drop the wiring on the page itself, under the page queues lock */
b0d623f7 A	6216	if (VM_PAGE_WIRED(result_page)) {
b0d623f7 A	6217	vm_page_lockspin_queues();
0b4c1975	6218	vm_page_unwire(result_page, TRUE);
b0d623f7	6219	vm_page_unlock_queues();
5ba3f43e	6220	unwired_pages++;
b0d623f7	6221	}
		/*
		* NOTE(review): the flag is cleared right after the first
		* page is zeroed, so the remaining pages of the range are not
		* zeroed by this loop -- confirm that this is intended.
		*/
0a7de745	6222	if (entry->zero_wired_pages) {
39037602	6223	pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
b0d623f7 A	6224	entry->zero_wired_pages = FALSE;
	6225	}
	6226
1c79356b A	6227	PAGE_WAKEUP_DONE(result_page);
1c79356b A	6228	}
1c79356b A	6229	vm_fault_cleanup(result_object, top_page);
	6230	}
	6231	}
	6232
	6233	/*
	6234	* Inform the physical mapping system that the range
	6235	* of addresses may fault, so that page tables and
	6236	* such may be unwired themselves.
	6237	*/
	6238
5ba3f43e	6239	pmap_pageable(pmap, pmap_addr,
0a7de745	6240	pmap_addr + (end_addr - entry->vme_start), TRUE);
1c79356b	6241
		/*
		* For kernel_object mappings, subtract the pages counted above
		* from the size accounted to the entry's VM tag.
		*/
5ba3f43e	6242	if (kernel_object == object) {
f427ee49 A	6243	/*
	6244	* Would like to make user_tag in vm_object_fault_info
	6245	* vm_tag_t (unsigned short) but user_tag derives its value from
	6246	* VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
	6247	* to an _unsigned int_ which is used by non-fault_info paths throughout the
	6248	* code at many places.
	6249	*
	6250	* So, for now, an explicit truncation to unsigned short (vm_tag_t).
	6251	*/
	6252	assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
	6253	"VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
	6254	vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
5ba3f43e	6255	}
1c79356b A	6256	}
	6257
	6258	/*
	6259	* vm_fault_wire_fast:
	6260	*
	6261	* Handle common case of a wire down page fault at the given address.
	6262	* If successful, the page is inserted into the associated physical map.
	6263	* The map entry is passed in to avoid the overhead of a map lookup.
	6264	*
	6265	* NOTE: the given address should be truncated to the
	6266	* proper page address.
	6267	*
	6268	* KERN_SUCCESS is returned if the page fault is handled; otherwise,
	6269	* a standard error specifying why the fault is fatal is returned.
	6270	*
	6271	* The map in question must be referenced, and remains so.
	6272	* Caller has a read lock on the map.
	6273	*
	6274	* This is a stripped version of vm_fault() for wiring pages. Anything
	6275	* other than the common case will return KERN_FAILURE, and the caller
	6276	* is expected to call vm_fault().
	6277	*/
3e170ce0	6278	static kern_return_t
1c79356b	6279	vm_fault_wire_fast(
0a7de745 A	6280	__unused vm_map_t map,
0a7de745 A	6281	vm_map_offset_t va,
5ba3f43e	6282	__unused vm_prot_t caller_prot,
0a7de745 A	6283	vm_tag_t wire_tag,
	6284	vm_map_entry_t entry,
	6285	pmap_t pmap,
	6286	vm_map_offset_t pmap_addr,
	6287	ppnum_t *physpage_p)
1c79356b	6288	{
0a7de745 A	6289	vm_object_t object;
	6290	vm_object_offset_t offset;
	6291	vm_page_t m;
	6292	vm_prot_t prot;
	6293	thread_t thread = current_thread();
	6294	int type_of_fault;
	6295	kern_return_t kr;
f427ee49 A	6296	vm_map_size_t fault_page_size;
f427ee49 A	6297	vm_map_offset_t fault_phys_offset;
d9a64523	6298	struct vm_object_fault_info fault_info = {};
1c79356b	6299
c3c9b80d	6300	counter_inc(&vm_statistics_faults);
1c79356b	6301
0a7de745	6302	if (thread != THREAD_NULL && thread->task != TASK_NULL) {
c3c9b80d	6303	counter_inc(&thread->task->faults);
0a7de745	6304	}
1c79356b A	6305
	6306	/*
	6307	* Recovery actions
	6308	*/
	6309
0a7de745 A	6310	#undef RELEASE_PAGE
	6311	#define RELEASE_PAGE(m) { \
	6312	PAGE_WAKEUP_DONE(m); \
	6313	vm_page_lockspin_queues(); \
	6314	vm_page_unwire(m, TRUE); \
	6315	vm_page_unlock_queues(); \
1c79356b A	6316	}
	6317
	6318
0a7de745 A	6319	#undef UNLOCK_THINGS
	6320	#define UNLOCK_THINGS { \
	6321	vm_object_paging_end(object); \
	6322	vm_object_unlock(object); \
1c79356b A	6323	}
1c79356b A	6324
0a7de745 A	6325	#undef UNLOCK_AND_DEALLOCATE
	6326	#define UNLOCK_AND_DEALLOCATE { \
	6327	UNLOCK_THINGS; \
	6328	vm_object_deallocate(object); \
1c79356b A	6329	}
	6330	/*
	6331	* Give up and have caller do things the hard way.
	6332	*/
	6333
0a7de745 A	6334	#define GIVE_UP { \
	6335	UNLOCK_AND_DEALLOCATE; \
	6336	return(KERN_FAILURE); \
1c79356b A	6337	}
	6338
	6339
	6340	/*
	6341	* If this entry is not directly to a vm_object, bail out.
	6342	*/
fe8ab488 A	6343	if (entry->is_sub_map) {
fe8ab488 A	6344	assert(physpage_p == NULL);
0a7de745	6345	return KERN_FAILURE;
fe8ab488	6346	}
1c79356b A	6347
	6348	/*
	6349	* Find the backing store object and offset into it.
	6350	*/
	6351
3e170ce0 A	6352	object = VME_OBJECT(entry);
3e170ce0 A	6353	offset = (va - entry->vme_start) + VME_OFFSET(entry);
1c79356b A	6354	prot = entry->protection;
1c79356b A	6355
0a7de745	6356	/*
1c79356b A	6357	* Make a reference to this object to prevent its
	6358	* disposal while we are messing with it.
	6359	*/
	6360
	6361	vm_object_lock(object);
2d21ac55	6362	vm_object_reference_locked(object);
ff6e181a	6363	vm_object_paging_begin(object);
1c79356b A	6364
	6365	/*
	6366	* INVARIANTS (through entire routine):
	6367	*
	6368	* 1) At all times, we must either have the object
	6369	* lock or a busy page in some object to prevent
	6370	* some other thread from trying to bring in
	6371	* the same page.
	6372	*
	6373	* 2) Once we have a busy page, we must remove it from
	6374	* the pageout queues, so that the pageout daemon
	6375	* will not grab it away.
	6376	*
	6377	*/
	6378
	6379	/*
	6380	* Look for page in top-level object. If it's not there or
	6381	* there's something going on, give up.
	6382	*/
f427ee49	6383	m = vm_page_lookup(object, vm_object_trunc_page(offset));
d9a64523	6384	if ((m == VM_PAGE_NULL) \|\| (m->vmp_busy) \|\|
0a7de745	6385	(m->vmp_unusual && (m->vmp_error \|\| m->vmp_restart \|\| m->vmp_absent))) {
1c79356b A	6386	GIVE_UP;
1c79356b A	6387	}
d9a64523	6388	if (m->vmp_fictitious &&
39037602	6389	VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2d21ac55 A	6390	/*
	6391	* Guard pages are fictitious pages and are never
	6392	* entered into a pmap, so let's say it's been wired...
	6393	*/
	6394	kr = KERN_SUCCESS;
	6395	goto done;
	6396	}
	6397
1c79356b A	6398	/*
1c79356b A	6399	* Wire the page down now. All bail outs beyond this
5ba3f43e	6400	* point must unwire the page.
1c79356b A	6401	*/
1c79356b A	6402
2d21ac55	6403	vm_page_lockspin_queues();
5ba3f43e	6404	vm_page_wire(m, wire_tag, TRUE);
1c79356b A	6405	vm_page_unlock_queues();
	6406
	6407	/*
	6408	* Mark page busy for other threads.
	6409	*/
d9a64523 A	6410	assert(!m->vmp_busy);
	6411	m->vmp_busy = TRUE;
	6412	assert(!m->vmp_absent);
1c79356b A	6413
	6414	/*
	6415	* Give up if the page is being written and there's a copy object
	6416	*/
	6417	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
	6418	RELEASE_PAGE(m);
	6419	GIVE_UP;
	6420	}
	6421
d9a64523 A	6422	fault_info.user_tag = VME_ALIAS(entry);
	6423	fault_info.pmap_options = 0;
	6424	if (entry->iokit_acct \|\|
	6425	(!entry->is_sub_map && !entry->use_pmap)) {
	6426	fault_info.pmap_options \|= PMAP_OPTIONS_ALT_ACCT;
	6427	}
	6428
f427ee49 A	6429	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	6430	fault_phys_offset = offset - vm_object_trunc_page(offset);
	6431
1c79356b A	6432	/*
1c79356b A	6433	* Put this page into the physical map.
1c79356b	6434	*/
2d21ac55 A	6435	type_of_fault = DBG_CACHE_HIT_FAULT;
2d21ac55 A	6436	kr = vm_fault_enter(m,
0a7de745 A	6437	pmap,
0a7de745 A	6438	pmap_addr,
f427ee49 A	6439	fault_page_size,
f427ee49 A	6440	fault_phys_offset,
0a7de745 A	6441	prot,
	6442	prot,
	6443	TRUE, /* wired */
	6444	FALSE, /* change_wiring */
	6445	wire_tag,
	6446	&fault_info,
	6447	NULL,
	6448	&type_of_fault);
39037602 A	6449	if (kr != KERN_SUCCESS) {
	6450	RELEASE_PAGE(m);
	6451	GIVE_UP;
	6452	}
2d21ac55 A	6453
2d21ac55 A	6454	done:
1c79356b A	6455	/*
	6456	* Unlock everything, and return
	6457	*/
	6458
fe8ab488 A	6459	if (physpage_p) {
	6460	/* for vm_map_wire_and_extract() */
	6461	if (kr == KERN_SUCCESS) {
39037602 A	6462	assert(object == VM_PAGE_OBJECT(m));
39037602 A	6463	*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
fe8ab488	6464	if (prot & VM_PROT_WRITE) {
39037602	6465	vm_object_lock_assert_exclusive(object);
d9a64523	6466	m->vmp_dirty = TRUE;
fe8ab488 A	6467	}
	6468	} else {
	6469	*physpage_p = 0;
	6470	}
	6471	}
	6472
1c79356b A	6473	PAGE_WAKEUP_DONE(m);
	6474	UNLOCK_AND_DEALLOCATE;
	6475
2d21ac55	6476	return kr;
1c79356b A	6477	}
	6478
	6479	/*
	6480	* Routine: vm_fault_copy_cleanup
	6481	* Purpose:
	6482	* Release a page used by vm_fault_copy.
	6483	*/
	6484
3e170ce0	6485	static void
1c79356b	6486	vm_fault_copy_cleanup(
0a7de745 A	6487	vm_page_t page,
0a7de745 A	6488	vm_page_t top_page)
1c79356b	6489	{
0a7de745	6490	vm_object_t object = VM_PAGE_OBJECT(page);
1c79356b A	6491
	6492	vm_object_lock(object);
	6493	PAGE_WAKEUP_DONE(page);
0a7de745	6494	if (!VM_PAGE_PAGEABLE(page)) {
b0d623f7	6495	vm_page_lockspin_queues();
0a7de745	6496	if (!VM_PAGE_PAGEABLE(page)) {
b0d623f7	6497	vm_page_activate(page);
39037602	6498	}
b0d623f7 A	6499	vm_page_unlock_queues();
b0d623f7 A	6500	}
1c79356b A	6501	vm_fault_cleanup(object, top_page);
	6502	}
	6503
3e170ce0	6504	static void
1c79356b	6505	vm_fault_copy_dst_cleanup(
0a7de745	6506	vm_page_t page)
1c79356b	6507	{
0a7de745	6508	vm_object_t object;
1c79356b A	6509
1c79356b A	6510	if (page != VM_PAGE_NULL) {
39037602	6511	object = VM_PAGE_OBJECT(page);
1c79356b	6512	vm_object_lock(object);
2d21ac55	6513	vm_page_lockspin_queues();
0b4c1975	6514	vm_page_unwire(page, TRUE);
1c79356b	6515	vm_page_unlock_queues();
5ba3f43e	6516	vm_object_paging_end(object);
1c79356b A	6517	vm_object_unlock(object);
	6518	}
	6519	}
	6520
	6521	/*
	6522	* Routine: vm_fault_copy
	6523	*
	6524	* Purpose:
	6525	* Copy pages from one virtual memory object to another --
	6526	* neither the source nor destination pages need be resident.
	6527	*
	6528	* Before actually copying a page, the version associated with
	6529	* the destination address map wil be verified.
	6530	*
	6531	* In/out conditions:
	6532	* The caller must hold a reference, but not a lock, to
	6533	* each of the source and destination objects and to the
	6534	* destination map.
	6535	*
	6536	* Results:
	6537	* Returns KERN_SUCCESS if no errors were encountered in
	6538	* reading or writing the data. Returns KERN_INTERRUPTED if
	6539	* the operation was interrupted (only possible if the
	6540	* "interruptible" argument is asserted). Other return values
	6541	* indicate a permanent error in copying the data.
	6542	*
	6543	* The actual amount of data copied will be returned in the
	6544	* "copy_size" argument. In the event that the destination map
	6545	* verification failed, this amount may be less than the amount
	6546	* requested.
	6547	*/
	6548	kern_return_t
	6549	vm_fault_copy(
0a7de745 A	6550	vm_object_t src_object,
	6551	vm_object_offset_t src_offset,
	6552	vm_map_size_t copy_size, / INOUT */
	6553	vm_object_t dst_object,
	6554	vm_object_offset_t dst_offset,
	6555	vm_map_t dst_map,
	6556	vm_map_version_t *dst_version,
	6557	int interruptible)
1c79356b	6558	{
0a7de745	6559	vm_page_t result_page;
5ba3f43e	6560
0a7de745 A	6561	vm_page_t src_page;
	6562	vm_page_t src_top_page;
	6563	vm_prot_t src_prot;
1c79356b	6564
0a7de745 A	6565	vm_page_t dst_page;
	6566	vm_page_t dst_top_page;
	6567	vm_prot_t dst_prot;
1c79356b	6568
0a7de745 A	6569	vm_map_size_t amount_left;
	6570	vm_object_t old_copy_object;
	6571	vm_object_t result_page_object = NULL;
	6572	kern_return_t error = 0;
	6573	vm_fault_return_t result;
1c79356b	6574
0a7de745	6575	vm_map_size_t part_size;
d9a64523 A	6576	struct vm_object_fault_info fault_info_src = {};
d9a64523 A	6577	struct vm_object_fault_info fault_info_dst = {};
1c79356b A	6578
	6579	/*
	6580	* In order not to confuse the clustered pageins, align
	6581	* the different offsets on a page boundary.
	6582	*/
1c79356b	6583
0a7de745 A	6584	#define RETURN(x) \
	6585	MACRO_BEGIN \
	6586	*copy_size -= amount_left; \
	6587	MACRO_RETURN(x); \
1c79356b A	6588	MACRO_END
1c79356b A	6589
91447636	6590	amount_left = *copy_size;
2d21ac55 A	6591
	6592	fault_info_src.interruptible = interruptible;
	6593	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
2d21ac55 A	6594	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
2d21ac55 A	6595	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
b0d623f7	6596	fault_info_src.stealth = TRUE;
2d21ac55 A	6597
	6598	fault_info_dst.interruptible = interruptible;
	6599	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
2d21ac55 A	6600	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
2d21ac55 A	6601	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
b0d623f7	6602	fault_info_dst.stealth = TRUE;
2d21ac55	6603
1c79356b A	6604	do { /* while (amount_left > 0) */
	6605	/*
	6606	* There may be a deadlock if both source and destination
	6607	* pages are the same. To avoid this deadlock, the copy must
	6608	* start by getting the destination page in order to apply
	6609	* COW semantics if any.
	6610	*/
	6611
0a7de745	6612	RetryDestinationFault:;
1c79356b	6613
0a7de745	6614	dst_prot = VM_PROT_WRITE \| VM_PROT_READ;
1c79356b A	6615
	6616	vm_object_lock(dst_object);
	6617	vm_object_paging_begin(dst_object);
	6618
d9a64523 A	6619	/* cap cluster size at maximum UPL size */
	6620	upl_size_t cluster_size;
	6621	if (os_convert_overflow(amount_left, &cluster_size)) {
	6622	cluster_size = 0 - (upl_size_t)PAGE_SIZE;
b0d623f7	6623	}
d9a64523	6624	fault_info_dst.cluster_size = cluster_size;
2d21ac55	6625
39236c6e	6626	dst_page = VM_PAGE_NULL;
b0d623f7	6627	result = vm_fault_page(dst_object,
0a7de745 A	6628	vm_object_trunc_page(dst_offset),
	6629	VM_PROT_WRITE \| VM_PROT_READ,
	6630	FALSE,
	6631	FALSE, /* page not looked up */
	6632	&dst_prot, &dst_page, &dst_top_page,
	6633	(int *)0,
	6634	&error,
	6635	dst_map->no_zero_fill,
	6636	FALSE, &fault_info_dst);
b0d623f7	6637	switch (result) {
1c79356b A	6638	case VM_FAULT_SUCCESS:
	6639	break;
	6640	case VM_FAULT_RETRY:
	6641	goto RetryDestinationFault;
	6642	case VM_FAULT_MEMORY_SHORTAGE:
0a7de745	6643	if (vm_page_wait(interruptible)) {
1c79356b	6644	goto RetryDestinationFault;
0a7de745	6645	}
f427ee49	6646	OS_FALLTHROUGH;
1c79356b A	6647	case VM_FAULT_INTERRUPTED:
1c79356b A	6648	RETURN(MACH_SEND_INTERRUPTED);
b0d623f7 A	6649	case VM_FAULT_SUCCESS_NO_VM_PAGE:
	6650	/* success but no VM page: fail the copy */
	6651	vm_object_paging_end(dst_object);
	6652	vm_object_unlock(dst_object);
f427ee49	6653	OS_FALLTHROUGH;
1c79356b	6654	case VM_FAULT_MEMORY_ERROR:
0a7de745 A	6655	if (error) {
	6656	return error;
	6657	} else {
	6658	return KERN_MEMORY_ERROR;
	6659	}
b0d623f7 A	6660	default:
b0d623f7 A	6661	panic("vm_fault_copy: unexpected error 0x%x from "
0a7de745	6662	"vm_fault_page()\n", result);
1c79356b	6663	}
0a7de745	6664	assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
1c79356b	6665
39037602 A	6666	assert(dst_object == VM_PAGE_OBJECT(dst_page));
39037602 A	6667	old_copy_object = dst_object->copy;
1c79356b A	6668
	6669	/*
	6670	* There exists the possiblity that the source and
	6671	* destination page are the same. But we can't
	6672	* easily determine that now. If they are the
	6673	* same, the call to vm_fault_page() for the
	6674	* destination page will deadlock. To prevent this we
	6675	* wire the page so we can drop busy without having
5ba3f43e	6676	* the page daemon steal the page. We clean up the
1c79356b A	6677	* top page but keep the paging reference on the object
	6678	* holding the dest page so it doesn't go away.
	6679	*/
	6680
2d21ac55	6681	vm_page_lockspin_queues();
3e170ce0	6682	vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
1c79356b A	6683	vm_page_unlock_queues();
1c79356b A	6684	PAGE_WAKEUP_DONE(dst_page);
39037602	6685	vm_object_unlock(dst_object);
1c79356b A	6686
	6687	if (dst_top_page != VM_PAGE_NULL) {
	6688	vm_object_lock(dst_object);
	6689	VM_PAGE_FREE(dst_top_page);
	6690	vm_object_paging_end(dst_object);
	6691	vm_object_unlock(dst_object);
	6692	}
	6693
0a7de745	6694	RetrySourceFault:;
1c79356b A	6695
	6696	if (src_object == VM_OBJECT_NULL) {
	6697	/*
	6698	* No source object. We will just
	6699	* zero-fill the page in dst_object.
	6700	*/
	6701	src_page = VM_PAGE_NULL;
e3027f41	6702	result_page = VM_PAGE_NULL;
1c79356b A	6703	} else {
	6704	vm_object_lock(src_object);
	6705	src_page = vm_page_lookup(src_object,
0a7de745	6706	vm_object_trunc_page(src_offset));
e3027f41	6707	if (src_page == dst_page) {
1c79356b	6708	src_prot = dst_prot;
e3027f41 A	6709	result_page = VM_PAGE_NULL;
e3027f41 A	6710	} else {
1c79356b A	6711	src_prot = VM_PROT_READ;
	6712	vm_object_paging_begin(src_object);
	6713
d9a64523 A	6714	/* cap cluster size at maximum UPL size */
	6715	if (os_convert_overflow(amount_left, &cluster_size)) {
	6716	cluster_size = 0 - (upl_size_t)PAGE_SIZE;
b0d623f7	6717	}
d9a64523	6718	fault_info_src.cluster_size = cluster_size;
2d21ac55	6719
39236c6e	6720	result_page = VM_PAGE_NULL;
b0d623f7	6721	result = vm_fault_page(
5ba3f43e	6722	src_object,
b0d623f7 A	6723	vm_object_trunc_page(src_offset),
b0d623f7 A	6724	VM_PROT_READ, FALSE,
39236c6e	6725	FALSE, /* page not looked up */
5ba3f43e	6726	&src_prot,
b0d623f7 A	6727	&result_page, &src_top_page,
	6728	(int *)0, &error, FALSE,
	6729	FALSE, &fault_info_src);
	6730
	6731	switch (result) {
1c79356b A	6732	case VM_FAULT_SUCCESS:
	6733	break;
	6734	case VM_FAULT_RETRY:
	6735	goto RetrySourceFault;
	6736	case VM_FAULT_MEMORY_SHORTAGE:
0a7de745	6737	if (vm_page_wait(interruptible)) {
1c79356b	6738	goto RetrySourceFault;
0a7de745	6739	}
f427ee49	6740	OS_FALLTHROUGH;
1c79356b A	6741	case VM_FAULT_INTERRUPTED:
	6742	vm_fault_copy_dst_cleanup(dst_page);
	6743	RETURN(MACH_SEND_INTERRUPTED);
b0d623f7 A	6744	case VM_FAULT_SUCCESS_NO_VM_PAGE:
	6745	/* success but no VM page: fail */
	6746	vm_object_paging_end(src_object);
	6747	vm_object_unlock(src_object);
f427ee49	6748	OS_FALLTHROUGH;
1c79356b A	6749	case VM_FAULT_MEMORY_ERROR:
1c79356b A	6750	vm_fault_copy_dst_cleanup(dst_page);
0a7de745 A	6751	if (error) {
	6752	return error;
	6753	} else {
	6754	return KERN_MEMORY_ERROR;
	6755	}
b0d623f7 A	6756	default:
b0d623f7 A	6757	panic("vm_fault_copy(2): unexpected "
0a7de745 A	6758	"error 0x%x from "
0a7de745 A	6759	"vm_fault_page()\n", result);
1c79356b A	6760	}
1c79356b A	6761
39037602	6762	result_page_object = VM_PAGE_OBJECT(result_page);
1c79356b	6763	assert((src_top_page == VM_PAGE_NULL) ==
0a7de745	6764	(result_page_object == src_object));
1c79356b	6765	}
0a7de745	6766	assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
39037602	6767	vm_object_unlock(result_page_object);
1c79356b A	6768	}
1c79356b A	6769
5ba3f43e A	6770	vm_map_lock_read(dst_map);
5ba3f43e A	6771
1c79356b	6772	if (!vm_map_verify(dst_map, dst_version)) {
5ba3f43e	6773	vm_map_unlock_read(dst_map);
0a7de745	6774	if (result_page != VM_PAGE_NULL && src_page != dst_page) {
e3027f41	6775	vm_fault_copy_cleanup(result_page, src_top_page);
0a7de745	6776	}
1c79356b A	6777	vm_fault_copy_dst_cleanup(dst_page);
	6778	break;
	6779	}
39037602	6780	assert(dst_object == VM_PAGE_OBJECT(dst_page));
1c79356b	6781
39037602	6782	vm_object_lock(dst_object);
1c79356b	6783
39037602 A	6784	if (dst_object->copy != old_copy_object) {
39037602 A	6785	vm_object_unlock(dst_object);
5ba3f43e	6786	vm_map_unlock_read(dst_map);
0a7de745	6787	if (result_page != VM_PAGE_NULL && src_page != dst_page) {
e3027f41	6788	vm_fault_copy_cleanup(result_page, src_top_page);
0a7de745	6789	}
1c79356b A	6790	vm_fault_copy_dst_cleanup(dst_page);
	6791	break;
	6792	}
39037602	6793	vm_object_unlock(dst_object);
1c79356b A	6794
	6795	/*
	6796	* Copy the page, and note that it is dirty
	6797	* immediately.
	6798	*/
	6799
	6800	if (!page_aligned(src_offset) \|\|
0a7de745 A	6801	!page_aligned(dst_offset) \|\|
	6802	!page_aligned(amount_left)) {
	6803	vm_object_offset_t src_po,
	6804	dst_po;
1c79356b	6805
91447636 A	6806	src_po = src_offset - vm_object_trunc_page(src_offset);
91447636 A	6807	dst_po = dst_offset - vm_object_trunc_page(dst_offset);
1c79356b A	6808
	6809	if (dst_po > src_po) {
	6810	part_size = PAGE_SIZE - dst_po;
	6811	} else {
	6812	part_size = PAGE_SIZE - src_po;
	6813	}
0a7de745	6814	if (part_size > (amount_left)) {
1c79356b A	6815	part_size = amount_left;
	6816	}
	6817
e3027f41	6818	if (result_page == VM_PAGE_NULL) {
b0d623f7 A	6819	assert((vm_offset_t) dst_po == dst_po);
b0d623f7 A	6820	assert((vm_size_t) part_size == part_size);
1c79356b	6821	vm_page_part_zero_fill(dst_page,
0a7de745 A	6822	(vm_offset_t) dst_po,
0a7de745 A	6823	(vm_size_t) part_size);
1c79356b	6824	} else {
b0d623f7 A	6825	assert((vm_offset_t) src_po == src_po);
	6826	assert((vm_offset_t) dst_po == dst_po);
	6827	assert((vm_size_t) part_size == part_size);
	6828	vm_page_part_copy(result_page,
0a7de745 A	6829	(vm_offset_t) src_po,
	6830	dst_page,
	6831	(vm_offset_t) dst_po,
	6832	(vm_size_t)part_size);
	6833	if (!dst_page->vmp_dirty) {
1c79356b	6834	vm_object_lock(dst_object);
316670eb	6835	SET_PAGE_DIRTY(dst_page, TRUE);
39037602	6836	vm_object_unlock(dst_object);
1c79356b	6837	}
1c79356b A	6838	}
	6839	} else {
	6840	part_size = PAGE_SIZE;
	6841
0a7de745	6842	if (result_page == VM_PAGE_NULL) {
1c79356b	6843	vm_page_zero_fill(dst_page);
0a7de745	6844	} else {
39037602	6845	vm_object_lock(result_page_object);
e3027f41	6846	vm_page_copy(result_page, dst_page);
39037602	6847	vm_object_unlock(result_page_object);
316670eb	6848
0a7de745	6849	if (!dst_page->vmp_dirty) {
1c79356b	6850	vm_object_lock(dst_object);
316670eb	6851	SET_PAGE_DIRTY(dst_page, TRUE);
39037602	6852	vm_object_unlock(dst_object);
1c79356b A	6853	}
1c79356b A	6854	}
1c79356b A	6855	}
	6856
	6857	/*
	6858	* Unlock everything, and return
	6859	*/
	6860
5ba3f43e	6861	vm_map_unlock_read(dst_map);
1c79356b	6862
0a7de745	6863	if (result_page != VM_PAGE_NULL && src_page != dst_page) {
e3027f41	6864	vm_fault_copy_cleanup(result_page, src_top_page);
0a7de745	6865	}
1c79356b A	6866	vm_fault_copy_dst_cleanup(dst_page);
	6867
	6868	amount_left -= part_size;
	6869	src_offset += part_size;
	6870	dst_offset += part_size;
	6871	} while (amount_left > 0);
	6872
	6873	RETURN(KERN_SUCCESS);
0a7de745	6874	#undef RETURN
1c79356b	6875
5ba3f43e	6876	/NOTREACHED/
1c79356b A	6877	}
1c79356b A	6878
#if     VM_FAULT_CLASSIFY
/*
 *	Temporary statistics gathering support.
 */

/*
 *	Statistics array: one row per fault type, one column per shadow
 *	chain depth at which the classification was made.  Depths of
 *	VM_FAULT_LEVEL_MAX - 1 and beyond share the last column.
 */
#define VM_FAULT_TYPES_MAX      5
#define VM_FAULT_LEVEL_MAX      8

int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];

#define VM_FAULT_TYPE_ZERO_FILL 0
#define VM_FAULT_TYPE_MAP_IN    1
#define VM_FAULT_TYPE_PAGER     2
#define VM_FAULT_TYPE_COPY      3
#define VM_FAULT_TYPE_OTHER     4


/*
 *	vm_fault_classify:
 *
 *	Walk the shadow chain starting at (object, offset), decide which
 *	VM_FAULT_TYPE_* a fault of type "fault_type" would be, and bump
 *	the matching vm_fault_stats counter.
 */
void
vm_fault_classify(vm_object_t           object,
    vm_object_offset_t    offset,
    vm_prot_t             fault_type)
{
	int             type, level = 0;
	vm_page_t       m;

	while (TRUE) {
		m = vm_page_lookup(object, offset);
		if (m != VM_PAGE_NULL) {
			/* resident page: its state decides the type */
			if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
				type = VM_FAULT_TYPE_OTHER;
				break;
			}
			if (((fault_type & VM_PROT_WRITE) == 0) ||
			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
				type = VM_FAULT_TYPE_MAP_IN;
				break;
			}
			type = VM_FAULT_TYPE_COPY;
			break;
		} else {
			/* not resident at this level */
			if (object->pager_created) {
				type = VM_FAULT_TYPE_PAGER;
				break;
			}
			if (object->shadow == VM_OBJECT_NULL) {
				type = VM_FAULT_TYPE_ZERO_FILL;
				break;
			}

			/* descend to the shadow object */
			offset += object->vo_shadow_offset;
			object = object->shadow;
			level++;
			continue;
		}
	}

	/*
	 * Valid column indices are 0 .. VM_FAULT_LEVEL_MAX - 1, so clamp
	 * to the last column; clamping to VM_FAULT_LEVEL_MAX itself would
	 * index one past the end of the row.
	 */
	if (level >= VM_FAULT_LEVEL_MAX) {
		level = VM_FAULT_LEVEL_MAX - 1;
	}

	vm_fault_stats[type][level] += 1;

	return;
}

/* cleanup routine to call from debugger: zero every counter */

void
vm_fault_classify_init(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			vm_fault_stats[type][level] = 0;
		}
	}

	return;
}
#endif  /* VM_FAULT_CLASSIFY */
2d21ac55	6963
3e170ce0	6964	vm_offset_t
39037602	6965	kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
3e170ce0	6966	{
0a7de745 A	6967	vm_map_entry_t entry;
	6968	vm_object_t object;
	6969	vm_offset_t object_offset;
	6970	vm_page_t m;
	6971	int compressor_external_state, compressed_count_delta;
	6972	int compressor_flags = (C_DONT_BLOCK \| C_KEEP \| C_KDP);
	6973	int my_fault_type = VM_PROT_READ;
	6974	kern_return_t kr;
f427ee49 A	6975	int effective_page_mask, effective_page_size;
	6976
	6977	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
	6978	effective_page_mask = VM_MAP_PAGE_MASK(map);
	6979	effective_page_size = VM_MAP_PAGE_SIZE(map);
	6980	} else {
	6981	effective_page_mask = PAGE_MASK;
	6982	effective_page_size = PAGE_SIZE;
	6983	}
3e170ce0	6984
3e170ce0 A	6985	if (not_in_kdp) {
	6986	panic("kdp_lightweight_fault called from outside of debugger context");
	6987	}
	6988
	6989	assert(map != VM_MAP_NULL);
	6990
f427ee49 A	6991	assert((cur_target_addr & effective_page_mask) == 0);
f427ee49 A	6992	if ((cur_target_addr & effective_page_mask) != 0) {
3e170ce0 A	6993	return 0;
	6994	}
	6995
	6996	if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
	6997	return 0;
	6998	}
	6999
	7000	if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
	7001	return 0;
	7002	}
	7003
	7004	if (entry->is_sub_map) {
	7005	return 0;
	7006	}
	7007
	7008	object = VME_OBJECT(entry);
	7009	if (object == VM_OBJECT_NULL) {
	7010	return 0;
	7011	}
	7012
	7013	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
	7014
	7015	while (TRUE) {
	7016	if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
	7017	return 0;
	7018	}
	7019
	7020	if (object->pager_created && (object->paging_in_progress \|\|
0a7de745	7021	object->activity_in_progress)) {
3e170ce0 A	7022	return 0;
	7023	}
	7024
f427ee49	7025	m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
3e170ce0 A	7026
3e170ce0 A	7027	if (m != VM_PAGE_NULL) {
3e170ce0 A	7028	if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
	7029	return 0;
	7030	}
	7031
d9a64523	7032	if (m->vmp_laundry \|\| m->vmp_busy \|\| m->vmp_free_when_done \|\| m->vmp_absent \|\| m->vmp_error \|\| m->vmp_cleaning \|\|
0a7de745	7033	m->vmp_overwriting \|\| m->vmp_restart \|\| m->vmp_unusual) {
3e170ce0 A	7034	return 0;
	7035	}
	7036
d9a64523 A	7037	assert(!m->vmp_private);
d9a64523 A	7038	if (m->vmp_private) {
3e170ce0 A	7039	return 0;
	7040	}
	7041
d9a64523 A	7042	assert(!m->vmp_fictitious);
d9a64523 A	7043	if (m->vmp_fictitious) {
3e170ce0 A	7044	return 0;
	7045	}
	7046
d9a64523 A	7047	assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
d9a64523 A	7048	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3e170ce0 A	7049	return 0;
3e170ce0 A	7050	}
2d21ac55	7051
39037602	7052	return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
3e170ce0 A	7053	}
	7054
	7055	compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
	7056
	7057	if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
	7058	if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
f427ee49 A	7059	kr = vm_compressor_pager_get(object->pager,
f427ee49 A	7060	vm_object_trunc_page(object_offset + object->paging_offset),
0a7de745 A	7061	kdp_compressor_decompressed_page_ppnum, &my_fault_type,
0a7de745 A	7062	compressor_flags, &compressed_count_delta);
3e170ce0	7063	if (kr == KERN_SUCCESS) {
3e170ce0 A	7064	return kdp_compressor_decompressed_page_paddr;
	7065	} else {
	7066	return 0;
	7067	}
	7068	}
	7069	}
	7070
	7071	if (object->shadow == VM_OBJECT_NULL) {
	7072	return 0;
	7073	}
	7074
	7075	object_offset += object->vo_shadow_offset;
	7076	object = object->shadow;
	7077	}
39037602	7078	}
3e170ce0	7079
d9a64523 A	7080	/*
	7081	* vm_page_validate_cs_fast():
	7082	* Performs a few quick checks to determine if the page's code signature
	7083	* really needs to be fully validated. It could:
	7084	* 1. have been modified (i.e. automatically tainted),
	7085	* 2. have already been validated,
	7086	* 3. have already been found to be tainted,
	7087	* 4. no longer have a backing store.
	7088	* Returns FALSE if the page needs to be fully validated.
	7089	*/
	7090	static boolean_t
	7091	vm_page_validate_cs_fast(
f427ee49 A	7092	vm_page_t page,
	7093	vm_map_size_t fault_page_size,
	7094	vm_map_offset_t fault_phys_offset)
593a1d5f	7095	{
0a7de745	7096	vm_object_t object;
593a1d5f	7097
39037602	7098	object = VM_PAGE_OBJECT(page);
d9a64523	7099	vm_object_lock_assert_held(object);
593a1d5f	7100
f427ee49 A	7101	if (page->vmp_wpmapped &&
f427ee49 A	7102	!VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
593a1d5f A	7103	/*
	7104	* This page was mapped for "write" access sometime in the
	7105	* past and could still be modifiable in the future.
	7106	* Consider it tainted.
	7107	* [ If the page was already found to be "tainted", no
	7108	* need to re-validate. ]
	7109	*/
d9a64523	7110	vm_object_lock_assert_exclusive(object);
f427ee49 A	7111	VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
f427ee49 A	7112	VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
593a1d5f	7113	if (cs_debug) {
d9a64523	7114	printf("CODESIGNING: %s: "
0a7de745 A	7115	"page %p obj %p off 0x%llx "
	7116	"was modified\n",
	7117	__FUNCTION__,
	7118	page, object, page->vmp_offset);
593a1d5f A	7119	}
	7120	vm_cs_validated_dirtied++;
	7121	}
	7122
f427ee49 A	7123	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) \|\|
f427ee49 A	7124	VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
d9a64523	7125	return TRUE;
593a1d5f	7126	}
d9a64523	7127	vm_object_lock_assert_exclusive(object);
593a1d5f	7128
d9a64523 A	7129	#if CHECK_CS_VALIDATION_BITMAP
d9a64523 A	7130	kern_return_t kr;
593a1d5f	7131
d9a64523 A	7132	kr = vnode_pager_cs_check_validation_bitmap(
	7133	object->pager,
	7134	page->vmp_offset + object->paging_offset,
	7135	CS_BITMAP_CHECK);
	7136	if (kr == KERN_SUCCESS) {
f427ee49 A	7137	page->vmp_cs_validated = VMP_CS_ALL_TRUE;
f427ee49 A	7138	page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
d9a64523 A	7139	vm_cs_bitmap_validated++;
	7140	return TRUE;
	7141	}
	7142	#endif /* CHECK_CS_VALIDATION_BITMAP */
593a1d5f A	7143
	7144	if (!object->alive \|\| object->terminating \|\| object->pager == NULL) {
	7145	/*
	7146	* The object is terminating and we don't have its pager
	7147	* so we can't validate the data...
	7148	*/
d9a64523	7149	return TRUE;
593a1d5f	7150	}
d9a64523 A	7151
	7152	/* we need to really validate this page */
	7153	vm_object_lock_assert_exclusive(object);
	7154	return FALSE;
	7155	}
	7156
	7157	void
	7158	vm_page_validate_cs_mapped_slow(
0a7de745 A	7159	vm_page_t page,
0a7de745 A	7160	const void *kaddr)
d9a64523	7161	{
0a7de745 A	7162	vm_object_t object;
	7163	memory_object_offset_t mo_offset;
	7164	memory_object_t pager;
	7165	struct vnode *vnode;
f427ee49	7166	int validated, tainted, nx;
d9a64523 A	7167
	7168	assert(page->vmp_busy);
	7169	object = VM_PAGE_OBJECT(page);
	7170	vm_object_lock_assert_exclusive(object);
	7171
	7172	vm_cs_validates++;
	7173
593a1d5f A	7174	/*
	7175	* Since we get here to validate a page that was brought in by
	7176	* the pager, we know that this pager is all setup and ready
	7177	* by now.
	7178	*/
d9a64523	7179	assert(object->code_signed);
593a1d5f A	7180	assert(!object->internal);
	7181	assert(object->pager != NULL);
	7182	assert(object->pager_ready);
	7183
	7184	pager = object->pager;
b0d623f7	7185	assert(object->paging_in_progress);
39037602	7186	vnode = vnode_pager_lookup_vnode(pager);
d9a64523	7187	mo_offset = page->vmp_offset + object->paging_offset;
593a1d5f A	7188
593a1d5f A	7189	/* verify the SHA1 hash for this page */
f427ee49	7190	validated = 0;
39037602	7191	tainted = 0;
f427ee49 A	7192	nx = 0;
f427ee49 A	7193	cs_validate_page(vnode,
0a7de745 A	7194	pager,
	7195	mo_offset,
	7196	(const void )((const char )kaddr),
f427ee49 A	7197	&validated,
	7198	&tainted,
	7199	&nx);
39037602	7200
f427ee49 A	7201	page->vmp_cs_validated \|= validated;
	7202	page->vmp_cs_tainted \|= tainted;
	7203	page->vmp_cs_nx \|= nx;
d9a64523 A	7204
d9a64523 A	7205	#if CHECK_CS_VALIDATION_BITMAP
f427ee49 A	7206	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
f427ee49 A	7207	page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
d9a64523	7208	vnode_pager_cs_check_validation_bitmap(object->pager,
0a7de745 A	7209	mo_offset,
0a7de745 A	7210	CS_BITMAP_SET);
d9a64523 A	7211	}
	7212	#endif /* CHECK_CS_VALIDATION_BITMAP */
	7213	}
	7214
	7215	void
	7216	vm_page_validate_cs_mapped(
0a7de745	7217	vm_page_t page,
f427ee49 A	7218	vm_map_size_t fault_page_size,
f427ee49 A	7219	vm_map_offset_t fault_phys_offset,
0a7de745	7220	const void *kaddr)
d9a64523	7221	{
f427ee49	7222	if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
d9a64523	7223	vm_page_validate_cs_mapped_slow(page, kaddr);
593a1d5f A	7224	}
	7225	}
	7226
c3c9b80d A	7227	static void
	7228	vm_page_map_and_validate_cs(
	7229	vm_object_t object,
	7230	vm_page_t page)
2d21ac55	7231	{
0a7de745 A	7232	vm_object_offset_t offset;
	7233	vm_map_offset_t koffset;
	7234	vm_map_size_t ksize;
	7235	vm_offset_t kaddr;
	7236	kern_return_t kr;
	7237	boolean_t busy_page;
	7238	boolean_t need_unmap;
2d21ac55	7239
39037602	7240	vm_object_lock_assert_exclusive(object);
4a3eedf9	7241
2d21ac55	7242	assert(object->code_signed);
d9a64523	7243	offset = page->vmp_offset;
2d21ac55	7244
d9a64523	7245	busy_page = page->vmp_busy;
2d21ac55 A	7246	if (!busy_page) {
2d21ac55 A	7247	/* keep page busy while we map (and unlock) the VM object */
d9a64523	7248	page->vmp_busy = TRUE;
2d21ac55	7249	}
5ba3f43e	7250
2d21ac55 A	7251	/*
	7252	* Take a paging reference on the VM object
	7253	* to protect it from collapse or bypass,
	7254	* and keep it from disappearing too.
	7255	*/
	7256	vm_object_paging_begin(object);
	7257
	7258	/* map the page in the kernel address space */
2d21ac55	7259	ksize = PAGE_SIZE_64;
39236c6e A	7260	koffset = 0;
	7261	need_unmap = FALSE;
	7262	kr = vm_paging_map_object(page,
0a7de745 A	7263	object,
	7264	offset,
	7265	VM_PROT_READ,
	7266	FALSE, /* can't unlock object ! */
	7267	&ksize,
	7268	&koffset,
	7269	&need_unmap);
2d21ac55	7270	if (kr != KERN_SUCCESS) {
d9a64523	7271	panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
2d21ac55 A	7272	}
	7273	kaddr = CAST_DOWN(vm_offset_t, koffset);
	7274
593a1d5f	7275	/* validate the mapped page */
d9a64523	7276	vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
2d21ac55	7277
d9a64523	7278	assert(page->vmp_busy);
39037602	7279	assert(object == VM_PAGE_OBJECT(page));
2d21ac55 A	7280	vm_object_lock_assert_exclusive(object);
2d21ac55 A	7281
2d21ac55 A	7282	if (!busy_page) {
	7283	PAGE_WAKEUP_DONE(page);
	7284	}
39236c6e	7285	if (need_unmap) {
2d21ac55 A	7286	/* unmap the map from the kernel address space */
	7287	vm_paging_unmap_object(object, koffset, koffset + ksize);
	7288	koffset = 0;
	7289	ksize = 0;
	7290	kaddr = 0;
	7291	}
	7292	vm_object_paging_end(object);
	7293	}
3e170ce0	7294
c3c9b80d A	7295	void
	7296	vm_page_validate_cs(
	7297	vm_page_t page,
	7298	vm_map_size_t fault_page_size,
	7299	vm_map_offset_t fault_phys_offset)
	7300	{
	7301	vm_object_t object;
	7302
	7303	object = VM_PAGE_OBJECT(page);
	7304	vm_object_lock_assert_held(object);
	7305
	7306	if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
	7307	return;
	7308	}
	7309	vm_page_map_and_validate_cs(object, page);
	7310	}
	7311
3e170ce0 A	7312	void
3e170ce0 A	7313	vm_page_validate_cs_mapped_chunk(
0a7de745 A	7314	vm_page_t page,
	7315	const void *kaddr,
	7316	vm_offset_t chunk_offset,
	7317	vm_size_t chunk_size,
	7318	boolean_t *validated_p,
	7319	unsigned *tainted_p)
3e170ce0	7320	{
0a7de745 A	7321	vm_object_t object;
	7322	vm_object_offset_t offset, offset_in_page;
	7323	memory_object_t pager;
	7324	struct vnode *vnode;
	7325	boolean_t validated;
	7326	unsigned tainted;
3e170ce0 A	7327
	7328	*validated_p = FALSE;
	7329	*tainted_p = 0;
	7330
d9a64523	7331	assert(page->vmp_busy);
39037602 A	7332	object = VM_PAGE_OBJECT(page);
39037602 A	7333	vm_object_lock_assert_exclusive(object);
3e170ce0	7334
3e170ce0	7335	assert(object->code_signed);
d9a64523	7336	offset = page->vmp_offset;
3e170ce0 A	7337
	7338	if (!object->alive \|\| object->terminating \|\| object->pager == NULL) {
	7339	/*
	7340	* The object is terminating and we don't have its pager
	7341	* so we can't validate the data...
	7342	*/
	7343	return;
	7344	}
	7345	/*
	7346	* Since we get here to validate a page that was brought in by
	7347	* the pager, we know that this pager is all setup and ready
	7348	* by now.
	7349	*/
	7350	assert(!object->internal);
	7351	assert(object->pager != NULL);
	7352	assert(object->pager_ready);
	7353
	7354	pager = object->pager;
	7355	assert(object->paging_in_progress);
39037602	7356	vnode = vnode_pager_lookup_vnode(pager);
3e170ce0 A	7357
	7358	/* verify the signature for this chunk */
	7359	offset_in_page = chunk_offset;
	7360	assert(offset_in_page < PAGE_SIZE);
3e170ce0 A	7361
3e170ce0 A	7362	tainted = 0;
39037602	7363	validated = cs_validate_range(vnode,
0a7de745 A	7364	pager,
	7365	(object->paging_offset +
	7366	offset +
	7367	offset_in_page),
	7368	(const void )((const char )kaddr
	7369	+ offset_in_page),
	7370	chunk_size,
	7371	&tainted);
3e170ce0 A	7372	if (validated) {
	7373	*validated_p = TRUE;
	7374	}
	7375	if (tainted) {
	7376	*tainted_p = tainted;
	7377	}
	7378	}
d9a64523	7379
0a7de745 A	7380	static void
	7381	vm_rtfrecord_lock(void)
	7382	{
d9a64523 A	7383	lck_spin_lock(&vm_rtfr_slock);
	7384	}
	7385
0a7de745 A	7386	static void
	7387	vm_rtfrecord_unlock(void)
	7388	{
d9a64523 A	7389	lck_spin_unlock(&vm_rtfr_slock);
	7390	}
	7391
0a7de745 A	7392	unsigned int
	7393	vmrtfaultinfo_bufsz(void)
	7394	{
	7395	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
d9a64523 A	7396	}
	7397
	7398	#include <kern/backtrace.h>
	7399
f427ee49	7400	__attribute__((noinline))
0a7de745 A	7401	static void
	7402	vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
	7403	{
d9a64523 A	7404	uint64_t fend = mach_continuous_time();
	7405
	7406	uint64_t cfpc = 0;
	7407	uint64_t ctid = cthread->thread_id;
	7408	uint64_t cupid = get_current_unique_pid();
	7409
	7410	uintptr_t bpc = 0;
ea3f0419	7411	int btr = 0;
d9a64523 A	7412	bool u64 = false;
	7413
	7414	/* Capture a single-frame backtrace; this extracts just the program
	7415	* counter at the point of the fault into "bpc", and should perform no
	7416	* further user stack traversals, thus avoiding copyin()s and further
	7417	* faults.
	7418	*/
f427ee49	7419	unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL, false);
d9a64523 A	7420
	7421	if ((btr == 0) && (bfrs > 0)) {
	7422	cfpc = bpc;
	7423	}
	7424
	7425	assert((fstart != 0) && fend >= fstart);
	7426	vm_rtfrecord_lock();
	7427	assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
	7428
	7429	vmrtfrs.vmrtf_total++;
	7430	vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
	7431
	7432	cvmr->rtfabstime = fstart;
	7433	cvmr->rtfduration = fend - fstart;
	7434	cvmr->rtfaddr = fault_vaddr;
	7435	cvmr->rtfpc = cfpc;
	7436	cvmr->rtftype = type_of_fault;
	7437	cvmr->rtfupid = cupid;
	7438	cvmr->rtftid = ctid;
	7439
	7440	if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
	7441	vmrtfrs.vmrtfr_curi = 0;
	7442	}
	7443
	7444	vm_rtfrecord_unlock();
	7445	}
	7446
0a7de745	7447	int
f427ee49	7448	vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void vrecords, unsigned long vmrtfrv)
0a7de745	7449	{
d9a64523 A	7450	vm_rtfault_record_t *cvmrd = vrecords;
d9a64523 A	7451	size_t residue = vrecordsz;
f427ee49	7452	size_t numextracted = 0;
d9a64523 A	7453	boolean_t early_exit = FALSE;
	7454
	7455	vm_rtfrecord_lock();
	7456
	7457	for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
d9a64523 A	7458	if (residue < sizeof(vm_rtfault_record_t)) {
	7459	early_exit = TRUE;
	7460	break;
	7461	}
	7462
	7463	if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
0a7de745	7464	#if DEVELOPMENT \|\| DEBUG
d9a64523 A	7465	if (isroot == FALSE) {
	7466	continue;
	7467	}
	7468	#else
	7469	continue;
	7470	#endif /* DEVDEBUG */
	7471	}
	7472
	7473	*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
	7474	cvmrd++;
	7475	residue -= sizeof(vm_rtfault_record_t);
	7476	numextracted++;
	7477	}
	7478
	7479	vm_rtfrecord_unlock();
	7480
	7481	*vmrtfrv = numextracted;
0a7de745	7482	return early_exit;
d9a64523	7483	}
c3c9b80d A	7484
	7485	/*
	7486	* Only allow one diagnosis to be in flight at a time, to avoid
	7487	* creating too much additional memory usage.
	7488	*/
	7489	static volatile uint_t vmtc_diagnosing;
	7490	unsigned int vmtc_total;
	7491	unsigned int vmtc_undiagnosed;
	7492	unsigned int vmtc_not_eligible;
	7493	unsigned int vmtc_copyin_fail;
	7494	unsigned int vmtc_not_found;
	7495	unsigned int vmtc_one_bit_flip;
	7496	unsigned int vmtc_byte_counts[MAX_TRACK_POWER2 + 1];
	7497
	7498	#if DEVELOPMENT \|\| DEBUG
	7499	/*
	7500	* Keep around the last diagnosed corruption buffers to aid in debugging.
	7501	*/
	7502	static size_t vmtc_last_buffer_size;
	7503	static uint64_t *vmtc_last_before_buffer = NULL;
	7504	static uint64_t *vmtc_last_after_buffer = NULL;
	7505	#endif /* DEVELOPMENT \|\| DEBUG */
	7506
	7507	/*
	7508	* Set things up so we can diagnose a potential text page corruption.
	7509	*/
	7510	static uint64_t *
	7511	vmtc_text_page_diagnose_setup(
	7512	vm_map_offset_t code_addr)
	7513	{
	7514	uint64_t *buffer;
	7515	size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
	7516
	7517	(void)OSAddAtomic(1, &vmtc_total);
	7518
	7519	/*
	7520	* If another is being diagnosed, skip this one.
	7521	*/
	7522	if (!OSCompareAndSwap(0, 1, &vmtc_diagnosing)) {
	7523	(void)OSAddAtomic(1, &vmtc_undiagnosed);
	7524	return NULL;
	7525	}
	7526
	7527	/*
	7528	* Get the contents of the corrupt page.
	7529	*/
	7530	buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK);
	7531	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), buffer, size) != 0) {
	7532	/* copyin error, so undo things */
	7533	kheap_free(KHEAP_DEFAULT, buffer, size);
	7534	(void)OSAddAtomic(1, &vmtc_undiagnosed);
	7535	++vmtc_copyin_fail;
	7536	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
	7537	panic("Bad compare and swap in setup!");
	7538	}
	7539	return NULL;
	7540	}
	7541	return buffer;
	7542	}
	7543
	7544	/*
	7545	* Diagnose the text page by comparing its contents with
	7546	* the one we've previously saved.
	7547	*/
7548	static void
7549	vmtc_text_page_diagnose(
7550	vm_map_offset_t code_addr,
7551	uint64_t *old_code_buffer)
7552	{
7553	uint64_t *new_code_buffer;
7554	size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
7555	uint_t count = (uint_t)size / sizeof(uint64_t);
7556	uint_t diff_count = 0;
7557	bool bit_flip = false;
7558	uint_t b;
7559	uint64_t *new;
7560	uint64_t *old;
7561
7562	new_code_buffer = kheap_alloc(KHEAP_DEFAULT, size, Z_WAITOK);
7563	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - 1), new_code_buffer, size) != 0) {
7564	/* copyin error, so undo things */
7565	(void)OSAddAtomic(1, &vmtc_undiagnosed);
7566	++vmtc_copyin_fail;
7567	goto done;
7568	}
7569
7570	new = new_code_buffer;
7571	old = old_code_buffer;
7572	for (; count-- > 0; ++new, ++old) {
7573	if (new == old) {
7574	continue;
7575	}
7576
7577	/*
7578	* On first diff, check for a single bit flip
7579	*/
7580	if (diff_count == 0) {
7581	uint64_t x = (new ^ old);
7582	assert(x != 0);
7583	if ((x & (x - 1)) == 0) {
7584	bit_flip = true;
7585	++diff_count;
7586	continue;
7587	}
7588	}
7589
7590	/*
7591	* count up the number of different bytes.
7592	*/
7593	for (b = 0; b < sizeof(uint64_t); ++b) {
7594	char n = (char )new;
7595	char o = (char )old;
7596	if (n[b] != o[b]) {
7597	++diff_count;
7598	}
7599	}
7600
7601	/* quit counting when too many */
7602	if (diff_count > (1 << MAX_TRACK_POWER2)) {
7603	break;
7604	}
7605	}
7606
7607	if (diff_count > 1) {
7608	bit_flip = false;
7609	}
7610
7611	if (diff_count == 0) {
7612	++vmtc_not_found;
7613	} else if (bit_flip) {
7614	++vmtc_one_bit_flip;
7615	++vmtc_byte_counts[0];
7616	} else {
7617	for (b = 0; b <= MAX_TRACK_POWER2; ++b) {
7618	if (diff_count <= (1 << b)) {
7619	++vmtc_byte_counts[b];
7620	break;
7621	}
7622	}
7623	if (diff_count > (1 << MAX_TRACK_POWER2)) {
7624	++vmtc_byte_counts[MAX_TRACK_POWER2];
7625	}
7626	}
7627
7628	done:
7629	/*
7630	* Free up the code copy buffers, but save the last
7631	* set on development / debug kernels in case they
7632	* can provide evidence for debugging memory stomps.
7633	*/
7634	#if DEVELOPMENT \|\| DEBUG
7635	if (vmtc_last_before_buffer != NULL) {
7636	kheap_free(KHEAP_DEFAULT, vmtc_last_before_buffer, vmtc_last_buffer_size);
7637	}
7638	if (vmtc_last_after_buffer != NULL) {
7639	kheap_free(KHEAP_DEFAULT, vmtc_last_after_buffer, vmtc_last_buffer_size);
7640	}
7641	vmtc_last_before_buffer = old_code_buffer;
7642	vmtc_last_after_buffer = new_code_buffer;
7643	vmtc_last_buffer_size = size;
7644	#else /* DEVELOPMENT \|\| DEBUG */
7645	kheap_free(KHEAP_DEFAULT, new_code_buffer, size);
7646	kheap_free(KHEAP_DEFAULT, old_code_buffer, size);
7647	#endif /* DEVELOPMENT \|\| DEBUG */
7648
7649	/*
7650	* We're finished, so clear the diagnosing flag.
7651	*/
7652	if (!OSCompareAndSwap(1, 0, &vmtc_diagnosing)) {
7653	panic("Bad compare and swap in diagnose!");
7654	}
7655	}
7656
7657	/*
7658	* For the given map, virt address, find the object, offset, and page.
7659	* This has to lookup the map entry, verify protections, walk any shadow chains.
7660	* If found, returns with the object locked.
7661	*/
7662	static kern_return_t
7663	vmtc_revalidate_lookup(
7664	vm_map_t map,
7665	vm_map_offset_t vaddr,
7666	vm_object_t *ret_object,
7667	vm_object_offset_t *ret_offset,
7668	vm_page_t *ret_page)
7669	{
7670	vm_object_t object;
7671	vm_object_offset_t offset;
7672	vm_page_t page;
7673	kern_return_t kr = KERN_SUCCESS;
7674	uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
7675	vm_map_version_t version;
7676	boolean_t wired;
7677	struct vm_object_fault_info fault_info = {};
7678	vm_map_t real_map = NULL;
7679	vm_prot_t prot;
7680	vm_object_t shadow;
7681
7682	/*
7683	* Find the object/offset for the given location/map.
7684	* Note this returns with the object locked.
7685	*/
7686	restart:
7687	vm_map_lock_read(map);
7688	object = VM_OBJECT_NULL; /* in case we come around the restart path */
7689	kr = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
7690	object_lock_type, &version, &object, &offset, &prot, &wired,
7691	&fault_info, &real_map, NULL);
7692	vm_map_unlock_read(map);
7693	if (real_map != NULL && real_map != map) {
7694	vm_map_unlock(real_map);
7695	}
7696
7697	/*
7698	* If there's no mapping here, or if we fail because the page
7699	* wasn't mapped executable, we can ignore this.
7700	*/
7701	if (kr != KERN_SUCCESS \|\|
7702	object == NULL \|\|
7703	!(prot & VM_PROT_EXECUTE)) {
7704	kr = KERN_FAILURE;
7705	goto done;
7706	}
7707
7708	/*
7709	* Chase down any shadow chains to find the actual page.
7710	*/
7711	for (;;) {
7712	/*
7713	* See if the page is on the current object.
7714	*/
7715	page = vm_page_lookup(object, vm_object_trunc_page(offset));
7716	if (page != NULL) {
7717	/* restart the lookup */
7718	if (page->vmp_restart) {
7719	vm_object_unlock(object);
7720	goto restart;
7721	}
7722
7723	/*
7724	* If this page is busy, we need to wait for it.
7725	*/
7726	if (page->vmp_busy) {
7727	PAGE_SLEEP(object, page, TRUE);
7728	vm_object_unlock(object);
7729	goto restart;
7730	}
7731	break;
7732	}
7733
7734	/*
7735	* If the object doesn't have the page and
7736	* has no shadow, then we can quit.
7737	*/
7738	shadow = object->shadow;
7739	if (shadow == NULL) {
7740	kr = KERN_FAILURE;
7741	goto done;
7742	}
7743
7744	/*
7745	* Move to the next object
7746	*/
7747	offset += object->vo_shadow_offset;
7748	vm_object_lock(shadow);
7749	vm_object_unlock(object);
7750	object = shadow;
7751	shadow = VM_OBJECT_NULL;
7752	}
7753	*ret_object = object;
7754	*ret_offset = vm_object_trunc_page(offset);
7755	*ret_page = page;
7756
7757	done:
7758	if (kr != KERN_SUCCESS && object != NULL) {
7759	vm_object_unlock(object);
7760	}
7761	return kr;
7762	}
7763
7764	/*
7765	* Check if a page is wired, needs extra locking.
7766	*/
7767	static bool
7768	is_page_wired(vm_page_t page)
7769	{
7770	bool result;
7771	vm_page_lock_queues();
7772	result = VM_PAGE_WIRED(page);
7773	vm_page_unlock_queues();
7774	return result;
7775	}
7776
7777	/*
7778	* A fatal process error has occurred in the given task.
7779	* Recheck the code signing of the text page at the given
7780	* address to check for a text page corruption.
7781	*
7782	* Returns KERN_FAILURE if a page was found to be corrupt
7783	* by failing to match its code signature. KERN_SUCCESS
7784	* means the page is either valid or we don't have the
7785	* information to say it's corrupt.
7786	*/
7787	kern_return_t
7788	revalidate_text_page(task_t task, vm_map_offset_t code_addr)
7789	{
7790	kern_return_t kr;
7791	vm_map_t map;
7792	vm_object_t object = NULL;
7793	vm_object_offset_t offset;
7794	vm_page_t page = NULL;
7795	struct vnode *vnode;
7796	bool do_invalidate = false;
7797	uint64_t *diagnose_buffer = NULL;
7798
7799	map = task->map;
7800	if (task->map == NULL) {
7801	return KERN_SUCCESS;
7802	}
7803
7804	kr = vmtc_revalidate_lookup(map, code_addr, &object, &offset, &page);
7805	if (kr != KERN_SUCCESS) {
7806	goto done;
7807	}
7808
7809	/*
7810	* The object needs to have a pager.
7811	*/
7812	if (object->pager == NULL) {
7813	goto done;
7814	}
7815
7816	/*
7817	* Needs to be a vnode backed page to have a signature.
7818	*/
7819	vnode = vnode_pager_lookup_vnode(object->pager);
7820	if (vnode == NULL) {
7821	goto done;
7822	}
7823
7824	/*
7825	* Object checks to see if we should proceed.
7826	*/
7827	if (!object->code_signed \|\| /* no code signature to check */
7828	object->internal \|\| /* internal objects aren't signed */
7829	object->terminating \|\| /* the object and its pages are already going away */
7830	!object->pager_ready) { /* this should happen, but check shouldn't hurt */
7831	goto done;
7832	}
7833
7834	/*
7835	* Check the code signature of the page in question.
7836	*/
7837	vm_page_map_and_validate_cs(object, page);
7838
7839	/*
7840	* At this point:
7841	* vmp_cs_validated \|= validated (set if a code signature exists)
7842	* vmp_cs_tainted \|= tainted (set if code signature violation)
7843	* vmp_cs_nx \|= nx; ??
7844	*
7845	* if vmp_pmapped then have to pmap_disconnect..
7846	* other flags to check on object or page?
7847	*/
7848	if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
7849	#if DEBUG \|\| DEVELOPMENT
7850	/*
7851	* On development builds, a boot-arg can be used to cause
7852	* a panic, instead of a quiet repair.
7853	*/
7854	if (vmtc_panic_instead) {
7855	panic("Text page corruption detected: vm_page_t 0x%llx\n", (long long)(uintptr_t)page);
7856	}
7857	#endif /* DEBUG \|\| DEVELOPMENT */
7858
7859	/*
7860	* We're going to invalidate this page. Mark it as busy so we can
7861	* drop the object lock and use copyin() to save its contents.
7862	*/
7863	do_invalidate = true;
7864	assert(!page->vmp_busy);
7865	page->vmp_busy = TRUE;
7866	vm_object_unlock(object);
7867	diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr);
7868	}
7869
7870	done:
7871	if (do_invalidate) {
7872	vm_object_lock(object);
7873	assert(page->vmp_busy);
7874	assert(VM_PAGE_OBJECT(page) == object); /* Since the page was busy, this shouldn't change */
7875	assert(page->vmp_offset == offset);
7876	PAGE_WAKEUP_DONE(page); /* make no longer busy */
7877
7878	/*
7879	* Invalidate, i.e. toss, the corrupted page.
7880	*/
7881	if (!page->vmp_cleaning &&
7882	!page->vmp_laundry &&
7883	!page->vmp_fictitious &&
7884	!page->vmp_precious &&
7885	!page->vmp_absent &&
7886	!page->vmp_error &&
7887	!page->vmp_dirty &&
7888	!is_page_wired(page)) {
7889	if (page->vmp_pmapped) {
7890	int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
7891	if (refmod & VM_MEM_MODIFIED) {
7892	SET_PAGE_DIRTY(page, FALSE);
7893	}
7894	if (refmod & VM_MEM_REFERENCED) {
7895	page->vmp_reference = TRUE;
7896	}
7897	}
7898	/* If the page seems intentionally modified, don't trash it. */
7899	if (!page->vmp_dirty) {
7900	VM_PAGE_FREE(page);
7901	} else {
7902	(void)OSAddAtomic(1, &vmtc_not_eligible);
7903	}
7904	} else {
7905	(void)OSAddAtomic(1, &vmtc_not_eligible);
7906	}
7907	vm_object_unlock(object);
7908
7909	/*
7910	* Now try to diagnose the type of failure by faulting
7911	* in a new copy and diff'ing it with what we saved.
7912	*/
7913	if (diagnose_buffer) {
7914	vmtc_text_page_diagnose(code_addr, diagnose_buffer);
7915	}
7916	return KERN_FAILURE;
7917	}
7918
7919	if (object != NULL) {
7920	vm_object_unlock(object);
7921	}
7922	return KERN_SUCCESS;
7923	}
7924
#if DEBUG || DEVELOPMENT
/*
 * For implementing unit tests - ask the pmap to corrupt a text page.
 * We have to find the page, to get the physical address, then invoke
 * the pmap.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);

/*
 * Corrupt the text page backing user address "va" in the current task.
 *
 * Returns KERN_FAILURE if the task has no map or the page is in a state
 * that cannot be worked with (each reason is logged), the lookup's
 * error if the page cannot be found, and otherwise the result of
 * pmap_test_text_corruption().
 */
kern_return_t
vm_corrupt_text_addr(uintptr_t va)
{
	task_t                  task = current_task();
	vm_map_t                map;
	kern_return_t           kr = KERN_SUCCESS;
	vm_object_t             object = VM_OBJECT_NULL;
	vm_object_offset_t      offset;
	vm_page_t               page = NULL;
	pmap_paddr_t            pa;

	map = task->map;
	if (task->map == NULL) {
		printf("corrupt_text_addr: no map\n");
		return KERN_FAILURE;
	}

	/* on success the object comes back locked; it is unlocked below */
	kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page);
	if (kr != KERN_SUCCESS) {
		printf("corrupt_text_addr: page lookup failed\n");
		return kr;
	}
	/* get the physical address to use */
	pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));

	/*
	 * Check we have something we can work with.
	 * Due to racing with pageout as we enter the sysctl,
	 * it's theoretically possible to have the page disappear, just
	 * before the lookup.
	 *
	 * Radar 72857482 has been filed to bubble up the error here
	 * to the sysctl result and have the test not FAIL in that case.
	 */
	if (page->vmp_busy) {
		printf("corrupt_text_addr: vmp_busy\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_cleaning) {
		printf("corrupt_text_addr: vmp_cleaning\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_laundry) {
		/* name the flag actually tested so the log is not misleading */
		printf("corrupt_text_addr: vmp_laundry\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_fictitious) {
		printf("corrupt_text_addr: vmp_fictitious\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_precious) {
		printf("corrupt_text_addr: vmp_precious\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_absent) {
		printf("corrupt_text_addr: vmp_absent\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_error) {
		printf("corrupt_text_addr: vmp_error\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_dirty) {
		printf("corrupt_text_addr: vmp_dirty\n");
		kr = KERN_FAILURE;
	}
	if (is_page_wired(page)) {
		printf("corrupt_text_addr: wired\n");
		kr = KERN_FAILURE;
	}
	if (!page->vmp_pmapped) {
		printf("corrupt_text_addr: !vmp_pmapped\n");
		kr = KERN_FAILURE;
	}

	if (kr == KERN_SUCCESS) {
		printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
		kr = pmap_test_text_corruption(pa);
		if (kr != KERN_SUCCESS) {
			printf("corrupt_text_addr: pmap error %d\n", kr);
		}
	} else {
		/* dump everything we know to help work out why the page was rejected */
		printf("corrupt_text_addr: object %p\n", object);
		printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
		printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
		printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
		printf("corrupt_text_addr: vm_page_t %p\n", page);
		printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
		printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
	}

	if (object != VM_OBJECT_NULL) {
		vm_object_unlock(object);
	}
	return kr;
}
#endif /* DEBUG || DEVELOPMENT */