/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 *	File:	vm_fault.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *
 *	Page fault handling module.
 */
#include <mach_cluster_stats.h>
#include <mach_pagemap.h>
#include <libkern/OSAtomic.h>

#include <mach/mach_types.h>
#include <mach/kern_return.h>
#include <mach/message.h>	/* for error codes */
#include <mach/vm_param.h>
#include <mach/vm_behavior.h>
#include <mach/memory_object.h>
/* For memory_object_data_{request,unlock} */
#include <mach/sdt.h>

#include <kern/kern_types.h>
#include <kern/host_statistics.h>
#include <kern/counters.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/host.h>
#include <kern/mach_param.h>
#include <kern/macro_help.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/policy_internal.h>

#include <vm/vm_compressor.h>
#include <vm/vm_compressor_pager.h>
#include <vm/vm_fault.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_pageout.h>
#include <vm/vm_protos.h>
#include <vm/vm_external.h>
#include <vm/memory_object.h>
#include <vm/vm_purgeable_internal.h>	/* Needed by some vm_page.h macros */
#include <vm/vm_shared_region.h>

#include <sys/codesign.h>
#include <sys/reason.h>
#include <sys/signalvar.h>

#include <san/kasan.h>

#define VM_FAULT_CLASSIFY       0

#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */

int vm_protect_privileged_from_untrusted = 1;

unsigned int    vm_object_pagein_throttle = 16;
/*
 * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 * keep the UI active so that the user has a chance to kill the offending task before the system
 * completely hangs.
 *
 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 */
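
/*
 * Sketch of how that throttle is actually applied (condensed from
 * vm_page_throttled() and vm_fault_check() below; illustrative only):
 *
 *	if ((throttle_delay = vm_page_throttled(FALSE))) {
 *		delay(throttle_delay);	// 10 ms hard / 0.2 ms soft
 *		// ... then the zero-fill fault is retried ...
 *	}
 *
 * i.e. the offending thread is simply put to sleep for the returned number
 * of microseconds before it is allowed to try the fault again.
 */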

extern void throttle_lowpri_io(int);

extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);

uint64_t vm_hard_throttle_threshold;

static inline boolean_t
NEED_TO_HARD_THROTTLE_THIS_TASK(void)
{
	return vm_wants_task_throttled(current_task()) ||
	       ((vm_page_free_count < vm_page_throttle_limit ||
	       HARD_THROTTLE_LIMIT_REACHED()) &&
	       proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
}

#define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
#define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */

#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
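
/*
 * Illustrative budget under the defaults above (just the arithmetic, see
 * vm_page_throttled() for the real policy): a thread may create up to
 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC
 * = 6 * 20000 = 120000 pages in a burst before delays start being handed out,
 * and it stays unthrottled as long as its sustained creation rate remains
 * below 20000 pages per second.
 */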

#define VM_STAT_DECOMPRESSIONS()        \
MACRO_BEGIN                             \
	VM_STAT_INCR(decompressions);           \
	current_thread()->decompressions++;     \
MACRO_END

boolean_t current_thread_aborted(void);

/* Forward declarations of internal routines. */
static kern_return_t vm_fault_wire_fast(
	vm_map_t        map,
	vm_map_offset_t va,
	vm_prot_t       prot,
	vm_map_entry_t  entry,
	pmap_t          pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t         *physpage_p);

static kern_return_t vm_fault_internal(
	vm_map_t        map,
	vm_map_offset_t vaddr,
	vm_prot_t       caller_prot,
	boolean_t       change_wiring,
	int             interruptible,
	pmap_t          pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t         *physpage_p);

static void vm_fault_copy_cleanup(
	vm_page_t       page,
	vm_page_t       top_page);

static void vm_fault_copy_dst_cleanup(
	vm_page_t       page);

#if VM_FAULT_CLASSIFY
extern void vm_fault_classify(vm_object_t       object,
    vm_object_offset_t        offset,
    vm_prot_t                 fault_type);

extern void vm_fault_classify_init(void);
#endif

unsigned long vm_pmap_enter_blocked = 0;
unsigned long vm_pmap_enter_retried = 0;

unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;
unsigned long vm_cs_bitmap_validated = 0;

uint64_t vm_cs_defer_to_pmap_cs = 0;
uint64_t vm_cs_defer_to_pmap_cs_not = 0;

void vm_pre_fault(vm_map_offset_t, vm_prot_t);

extern char *kdp_compressor_decompressed_page;
extern addr64_t kdp_compressor_decompressed_page_paddr;
extern ppnum_t  kdp_compressor_decompressed_page_ppnum;

struct vmrtfr {
	int vmrtfr_maxi;
	int vmrtfr_curi;
	int64_t vmrtf_total;
	vm_rtfault_record_t *vm_rtf_records;
} vmrtfrs;

#define VMRTF_DEFAULT_BUFSIZE   (4096)
#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
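
/*
 * Sizing sketch (the 64-byte figure is purely hypothetical): the default
 * record count is VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t), so a
 * 64-byte record would yield 4096 / 64 = 64 records.  The actual count
 * depends on the real record size and can be overridden at boot with the
 * "vm_rtfault_records" boot-arg consumed by the TUNABLE below.
 */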
TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);

static void vm_rtfrecord_lock(void);
static void vm_rtfrecord_unlock(void);
static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);

extern lck_grp_t vm_page_lck_grp_bucket;
extern lck_attr_t vm_page_lck_attr;
LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);

/*
 *	Routine:	vm_fault_init
 *	Purpose:
 *		Initialize our private data structures.
 */
void
vm_fault_init(void)
{
	int i, vm_compressor_temp;
	boolean_t need_default_val = TRUE;
	/*
	 * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
	 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
	 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
	 */

	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
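
	/*
	 * Worked example of the formula above (illustrative only): on an 8 GB
	 * machine the percentage is 35 - MIN(8, 25) = 27, so the threshold is
	 * about 27% of sane_size, roughly 2.2 GB; on a 64 GB machine it bottoms
	 * out at 35 - 25 = 10%, roughly 6.4 GB.  A task whose footprint exceeds
	 * this value is a candidate for the hard throttle once swap runs out.
	 */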

	/*
	 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
	 */

	if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
		for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
			if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
				need_default_val = FALSE;
				vm_compressor_mode = vm_compressor_temp;
				break;
			}
		}
		if (need_default_val) {
			printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
		}
	}
	if (need_default_val) {
		/* If no boot arg or incorrect boot arg, try device tree. */
		PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
	}
	printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);

	PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
	    &vm_protect_privileged_from_untrusted,
	    sizeof(vm_protect_privileged_from_untrusted));
}

__startup_func
static void
vm_rtfault_record_init(void)
{
	size_t size;

	vmrtf_num_records = MAX(vmrtf_num_records, 1);
	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
	vmrtfrs.vm_rtf_records = zalloc_permanent(size,
	    ZALIGN(vm_rtfault_record_t));
	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
}
STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);

/*
 *	Routine:	vm_fault_cleanup
 *	Purpose:
 *		Clean up the result of vm_fault_page.
 *	Results:
 *		The paging reference for "object" is released.
 *		"object" is unlocked.
 *		If "top_page" is not null, "top_page" is
 *		freed and the paging reference for the object
 *		containing it is released.
 *
 *	In/out conditions:
 *		"object" must be locked.
 */
void
vm_fault_cleanup(
	vm_object_t     object,
	vm_page_t       top_page)
{
	vm_object_paging_end(object);
	vm_object_unlock(object);

	if (top_page != VM_PAGE_NULL) {
		object = VM_PAGE_OBJECT(top_page);

		vm_object_lock(object);
		VM_PAGE_FREE(top_page);
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}

#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)


boolean_t       vm_page_deactivate_behind = TRUE;
/*
 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 */
#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16      /* don't make this too big... */
                                                        /* we use it to size an array on the stack */

int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;

#define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
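
/*
 * Illustrative trace of the run counter maintained below (assumes 4 KB pages
 * and a freshly created object): faults at offsets 0x1000, 0x2000 and 0x3000
 * under VM_BEHAVIOR_DEFAULT leave object->sequential at 3 * PAGE_SIZE, i.e. a
 * forward run of 12 KB, while a fault at any non-adjacent offset resets the
 * counter to 0.  The run length saturates at +/- MAX_SEQUENTIAL_RUN (1 GB),
 * so a long streaming workload cannot overflow it.
 */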

/*
 * vm_fault_is_sequential
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.
 * Update state to indicate current access pattern.
 *
 * object must have at least the shared lock held
 */
static void
vm_fault_is_sequential(
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_behavior_t           behavior)
{
	vm_object_offset_t      last_alloc;
	int                     sequential;
	int                     orig_sequential;

	last_alloc = object->last_alloc;
	sequential = object->sequential;
	orig_sequential = sequential;

	offset = vm_object_trunc_page(offset);
	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
		/* re-faulting in the same page: no change in behavior */
		return;
	}

	switch (behavior) {
	case VM_BEHAVIOR_RANDOM:
		/*
		 * reset indicator of sequential behavior
		 */
		sequential = 0;
		break;

	case VM_BEHAVIOR_SEQUENTIAL:
		if (offset && last_alloc == offset - PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < MAX_SEQUENTIAL_RUN) {
				sequential += PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_RSEQNTL:
		if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > -MAX_SEQUENTIAL_RUN) {
				sequential -= PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;

	case VM_BEHAVIOR_DEFAULT:
	default:
		if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential < 0) {
				sequential = 0;
			}
			if (sequential < MAX_SEQUENTIAL_RUN) {
				sequential += PAGE_SIZE;
			}
		} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
			/*
			 * advance indicator of sequential behavior
			 */
			if (sequential > 0) {
				sequential = 0;
			}
			if (sequential > -MAX_SEQUENTIAL_RUN) {
				sequential -= PAGE_SIZE;
			}
		} else {
			/*
			 * reset indicator of sequential behavior
			 */
			sequential = 0;
		}
		break;
	}
	if (sequential != orig_sequential) {
		if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
			/*
			 * if someone else has already updated object->sequential
			 * don't bother trying to update it or object->last_alloc
			 */
			return;
		}
	}
	/*
	 * I'd like to do this with a OSCompareAndSwap64, but that
	 * doesn't exist for PPC...  however, it shouldn't matter
	 * that much... last_alloc is maintained so that we can determine
	 * if a sequential access pattern is taking place... if only
	 * one thread is banging on this object, no problem with the unprotected
	 * update... if 2 or more threads are banging away, we run the risk of
	 * someone seeing a mangled update... however, in the face of multiple
	 * accesses, no sequential access pattern can develop anyway, so we
	 * haven't lost any real info.
	 */
	object->last_alloc = offset;
}
474 int vm_page_deactivate_behind_count
= 0;
477 * vm_page_deactivate_behind
479 * Determine if sequential access is in progress
480 * in accordance with the behavior specified. If
481 * so, compute a potential page to deactivate and
484 * object must be locked.
486 * return TRUE if we actually deactivate a page
490 vm_fault_deactivate_behind(
492 vm_object_offset_t offset
,
493 vm_behavior_t behavior
)
496 int pages_in_run
= 0;
497 int max_pages_in_run
= 0;
499 int sequential_behavior
= VM_BEHAVIOR_SEQUENTIAL
;
500 vm_object_offset_t run_offset
= 0;
501 vm_object_offset_t pg_offset
= 0;
503 vm_page_t page_run
[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER
];
507 dbgTrace(0xBEEF0018, (unsigned int) object
, (unsigned int) vm_fault_deactivate_behind
); /* (TEST/DEBUG) */
509 if (object
== kernel_object
|| vm_page_deactivate_behind
== FALSE
|| (vm_object_trunc_page(offset
) != offset
)) {
511 * Do not deactivate pages from the kernel object: they
512 * are not intended to become pageable.
513 * or we've disabled the deactivate behind mechanism
514 * or we are dealing with an offset that is not aligned to
515 * the system's PAGE_SIZE because in that case we will
516 * handle the deactivation on the aligned offset and, thus,
517 * the full PAGE_SIZE page once. This helps us avoid the redundant
518 * deactivates and the extra faults.
522 if ((sequential_run
= object
->sequential
)) {
523 if (sequential_run
< 0) {
524 sequential_behavior
= VM_BEHAVIOR_RSEQNTL
;
525 sequential_run
= 0 - sequential_run
;
527 sequential_behavior
= VM_BEHAVIOR_SEQUENTIAL
;
531 case VM_BEHAVIOR_RANDOM
:
533 case VM_BEHAVIOR_SEQUENTIAL
:
534 if (sequential_run
>= (int)PAGE_SIZE
) {
535 run_offset
= 0 - PAGE_SIZE_64
;
536 max_pages_in_run
= 1;
539 case VM_BEHAVIOR_RSEQNTL
:
540 if (sequential_run
>= (int)PAGE_SIZE
) {
541 run_offset
= PAGE_SIZE_64
;
542 max_pages_in_run
= 1;
545 case VM_BEHAVIOR_DEFAULT
:
547 { vm_object_offset_t behind
= vm_default_behind
* PAGE_SIZE_64
;
550 * determine if the run of sequential accesss has been
551 * long enough on an object with default access behavior
552 * to consider it for deactivation
554 if ((uint64_t)sequential_run
>= behind
&& (sequential_run
% (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER
* PAGE_SIZE
)) == 0) {
556 * the comparisons between offset and behind are done
557 * in this kind of odd fashion in order to prevent wrap around
560 if (sequential_behavior
== VM_BEHAVIOR_SEQUENTIAL
) {
561 if (offset
>= behind
) {
562 run_offset
= 0 - behind
;
563 pg_offset
= PAGE_SIZE_64
;
564 max_pages_in_run
= VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER
;
567 if (offset
< -behind
) {
569 pg_offset
= 0 - PAGE_SIZE_64
;
570 max_pages_in_run
= VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER
;
576 for (n
= 0; n
< max_pages_in_run
; n
++) {
577 m
= vm_page_lookup(object
, offset
+ run_offset
+ (n
* pg_offset
));
579 if (m
&& !m
->vmp_laundry
&& !m
->vmp_busy
&& !m
->vmp_no_cache
&& (m
->vmp_q_state
!= VM_PAGE_ON_THROTTLED_Q
) && !m
->vmp_fictitious
&& !m
->vmp_absent
) {
580 page_run
[pages_in_run
++] = m
;
583 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
585 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
586 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
587 * new reference happens. If no futher references happen on the page after that remote TLB flushes
588 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
589 * by pageout_scan, which is just fine since the last reference would have happened quite far
590 * in the past (TLB caches don't hang around for very long), and of course could just as easily
591 * have happened before we did the deactivate_behind.
593 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m
), VM_MEM_REFERENCED
, PMAP_OPTIONS_NOFLUSH
, (void *)NULL
);
597 vm_page_lockspin_queues();
599 for (n
= 0; n
< pages_in_run
; n
++) {
602 vm_page_deactivate_internal(m
, FALSE
);
604 vm_page_deactivate_behind_count
++;
606 dbgTrace(0xBEEF0019, (unsigned int) object
, (unsigned int) m
); /* (TEST/DEBUG) */
609 vm_page_unlock_queues();
617 #if (DEVELOPMENT || DEBUG)
618 uint32_t vm_page_creation_throttled_hard
= 0;
619 uint32_t vm_page_creation_throttled_soft
= 0;
620 uint64_t vm_page_creation_throttle_avoided
= 0;
621 #endif /* DEVELOPMENT || DEBUG */
624 vm_page_throttled(boolean_t page_kept
)
626 clock_sec_t elapsed_sec
;
628 clock_usec_t tv_usec
;
630 thread_t thread
= current_thread();
632 if (thread
->options
& TH_OPT_VMPRIV
) {
636 if (thread
->t_page_creation_throttled
) {
637 thread
->t_page_creation_throttled
= 0;
639 if (page_kept
== FALSE
) {
643 if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
644 #if (DEVELOPMENT || DEBUG)
645 thread
->t_page_creation_throttled_hard
++;
646 OSAddAtomic(1, &vm_page_creation_throttled_hard
);
647 #endif /* DEVELOPMENT || DEBUG */
648 return HARD_THROTTLE_DELAY
;
651 if ((vm_page_free_count
< vm_page_throttle_limit
|| (VM_CONFIG_COMPRESSOR_IS_PRESENT
&& SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
652 thread
->t_page_creation_count
> (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS
* VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC
)) {
653 if (vm_page_free_wanted
== 0 && vm_page_free_wanted_privileged
== 0) {
654 #if (DEVELOPMENT || DEBUG)
655 OSAddAtomic64(1, &vm_page_creation_throttle_avoided
);
659 clock_get_system_microtime(&tv_sec
, &tv_usec
);
661 elapsed_sec
= tv_sec
- thread
->t_page_creation_time
;
663 if (elapsed_sec
<= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS
||
664 (thread
->t_page_creation_count
/ elapsed_sec
) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC
) {
665 if (elapsed_sec
>= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS
)) {
667 * we'll reset our stats to give a well behaved app
668 * that was unlucky enough to accumulate a bunch of pages
669 * over a long period of time a chance to get out of
670 * the throttled state... we reset the counter and timestamp
671 * so that if it stays under the rate limit for the next second
672 * it will be back in our good graces... if it exceeds it, it
673 * will remain in the throttled state
675 thread
->t_page_creation_time
= tv_sec
;
676 thread
->t_page_creation_count
= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC
* (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS
- 1);
678 VM_PAGEOUT_DEBUG(vm_page_throttle_count
, 1);
680 thread
->t_page_creation_throttled
= 1;
682 if (VM_CONFIG_COMPRESSOR_IS_PRESENT
&& HARD_THROTTLE_LIMIT_REACHED()) {
683 #if (DEVELOPMENT || DEBUG)
684 thread
->t_page_creation_throttled_hard
++;
685 OSAddAtomic(1, &vm_page_creation_throttled_hard
);
686 #endif /* DEVELOPMENT || DEBUG */
687 return HARD_THROTTLE_DELAY
;
689 #if (DEVELOPMENT || DEBUG)
690 thread
->t_page_creation_throttled_soft
++;
691 OSAddAtomic(1, &vm_page_creation_throttled_soft
);
692 #endif /* DEVELOPMENT || DEBUG */
693 return SOFT_THROTTLE_DELAY
;
696 thread
->t_page_creation_time
= tv_sec
;
697 thread
->t_page_creation_count
= 0;
700 thread
->t_page_creation_count
++;

/*
 * check for various conditions that would
 * prevent us from creating a ZF page...
 * cleanup is based on being called from vm_fault_page
 *
 * object must be locked
 * object == m->vmp_object
 */
static vm_fault_return_t
vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
{
	int throttle_delay;

	if (object->shadow_severed ||
	    VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
		/*
		 * Either:
		 * 1. the shadow chain was severed,
		 * 2. the purgeable object is volatile or empty and is marked
		 *    to fault on access while volatile.
		 * Just have to return an error at this point
		 */
		if (m != VM_PAGE_NULL) {
			VM_PAGE_FREE(m);
		}
		vm_fault_cleanup(object, first_m);

		thread_interrupt_level(interruptible_state);

		return VM_FAULT_MEMORY_ERROR;
	}
	if (page_throttle == TRUE) {
		if ((throttle_delay = vm_page_throttled(FALSE))) {
			/*
			 * we're throttling zero-fills...
			 * treat this as if we couldn't grab a page
			 */
			if (m != VM_PAGE_NULL) {
				VM_PAGE_FREE(m);
			}
			vm_fault_cleanup(object, first_m);

			VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);

			delay(throttle_delay);

			if (current_thread_aborted()) {
				thread_interrupt_level(interruptible_state);
				return VM_FAULT_INTERRUPTED;
			}
			thread_interrupt_level(interruptible_state);

			return VM_FAULT_MEMORY_SHORTAGE;
		}
	}
	return VM_FAULT_SUCCESS;
}

/*
 * Clear the code signing bits on the given page_t
 */
static void
vm_fault_cs_clear(vm_page_t m)
{
	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
}

/*
 * Enqueues the given page on the throttled queue.
 * The caller must hold the vm_page_queue_lock and it will be held on return.
 */
static void
vm_fault_enqueue_throttled_locked(vm_page_t m)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	assert(!VM_PAGE_WIRED(m));

	/*
	 * can't be on the pageout queue since we don't
	 * have a pager to try and clean to
	 */
	vm_page_queues_remove(m, TRUE);
	vm_page_check_pageable_safe(m);
	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
	vm_page_throttled_count++;
}

/*
 * do the work to zero fill a page and
 * inject it into the correct paging queue
 *
 * m->vmp_object must be locked
 * page queue lock must NOT be held
 */
static int
vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
{
	int my_fault = DBG_ZERO_FILL_FAULT;
	vm_object_t     object;

	object = VM_PAGE_OBJECT(m);

	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time;  this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * we also consider it undefined
	 * with respect to instruction
	 * execution.  i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance
	 */
	vm_fault_cs_clear(m);
	m->vmp_pmapped = TRUE;

	if (no_zero_fill == TRUE) {
		my_fault = DBG_NZF_PAGE_FAULT;

		if (m->vmp_absent && m->vmp_busy) {
			return my_fault;
		}
	} else {
		vm_page_zero_fill(m);

		VM_STAT_INCR(zero_fill_count);
		DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
	}
	assert(!m->vmp_laundry);
	assert(object != kernel_object);
	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
	if (!VM_DYNAMIC_PAGING_ENABLED() &&
	    (object->purgable == VM_PURGABLE_DENY ||
	    object->purgable == VM_PURGABLE_NONVOLATILE ||
	    object->purgable == VM_PURGABLE_VOLATILE)) {
		vm_page_lockspin_queues();
		if (!VM_DYNAMIC_PAGING_ENABLED()) {
			vm_fault_enqueue_throttled_locked(m);
		}
		vm_page_unlock_queues();
	}
	return my_fault;
}

/*
 *	Routine:	vm_fault_page
 *	Purpose:
 *		Find the resident page for the virtual memory
 *		specified by the given virtual memory object
 *		and offset.
 *	Additional arguments:
 *		The required permissions for the page are given
 *		in "fault_type".  Desired permissions are included
 *		in "protection".
 *		fault_info is passed along to determine pagein cluster
 *		limits... it contains the expected reference pattern,
 *		cluster size if available, etc...
 *
 *		If the desired page is known to be resident (for
 *		example, because it was previously wired down), asserting
 *		the "unwiring" parameter will speed the search.
 *
 *		If the operation can be interrupted (by thread_abort
 *		or thread_terminate), then the "interruptible"
 *		parameter should be asserted.
 *
 *	Results:
 *		The page containing the proper data is returned
 *		in "result_page".
 *
 *	In/out conditions:
 *		The source object must be locked and referenced,
 *		and must donate one paging reference.  The reference
 *		is not affected.  The paging reference and lock are
 *		consumed.
 *
 *		If the call succeeds, the object in which "result_page"
 *		resides is left locked and holding a paging reference.
 *		If this is not the original object, a busy page in the
 *		original object is returned in "top_page", to prevent other
 *		callers from pursuing this same data, along with a paging
 *		reference for the original object.  The "top_page" should
 *		be destroyed when this guarantee is no longer required.
 *		The "result_page" is also left busy.  It is not removed
 *		from the pageout queues.
 *
 *		A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 *		fault succeeded but there's no VM page (i.e. the VM object
 *		does not actually hold VM pages, but device memory or
 *		large pages).  The object is still locked and we still hold a
 *		paging_in_progress reference.
 */
unsigned int vm_fault_page_blocked_access = 0;
unsigned int vm_fault_page_forced_retry = 0;

vm_fault_return_t
vm_fault_page(
912 vm_object_t first_object
, /* Object to begin search */
913 vm_object_offset_t first_offset
, /* Offset into object */
914 vm_prot_t fault_type
, /* What access is requested */
915 boolean_t must_be_resident
,/* Must page be resident? */
916 boolean_t caller_lookup
, /* caller looked up page */
917 /* Modifies in place: */
918 vm_prot_t
*protection
, /* Protection for mapping */
919 vm_page_t
*result_page
, /* Page found, if successful */
921 vm_page_t
*top_page
, /* Page in top object, if
922 * not result_page. */
923 int *type_of_fault
, /* if non-null, fill in with type of fault
924 * COW, zero-fill, etc... returned in trace point */
925 /* More arguments: */
926 kern_return_t
*error_code
, /* code if page is in error */
927 boolean_t no_zero_fill
, /* don't zero fill absent pages */
928 boolean_t data_supply
, /* treat as data_supply if
929 * it is a write fault and a full
930 * page is provided */
931 vm_object_fault_info_t fault_info
)
935 vm_object_offset_t offset
;
937 vm_object_t next_object
;
938 vm_object_t copy_object
;
939 boolean_t look_for_page
;
940 boolean_t force_fault_retry
= FALSE
;
941 vm_prot_t access_required
= fault_type
;
942 vm_prot_t wants_copy_flag
;
943 kern_return_t wait_result
;
944 wait_interrupt_t interruptible_state
;
945 boolean_t data_already_requested
= FALSE
;
946 vm_behavior_t orig_behavior
;
947 vm_size_t orig_cluster_size
;
948 vm_fault_return_t error
;
950 uint32_t try_failed_count
;
951 int interruptible
; /* how may fault be interrupted? */
952 int external_state
= VM_EXTERNAL_STATE_UNKNOWN
;
953 memory_object_t pager
;
954 vm_fault_return_t retval
;

/*
 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 * marked as paged out in the compressor pager or the pager doesn't exist.
 * Note also that if the pager for an internal object
 * has not been created, the pager is not invoked regardless of the value
 * of MUST_ASK_PAGER().
 *
 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 * is marked as paged out in the compressor pager.
 * PAGED_OUT() is used to determine if a page has already been pushed
 * into a copy object in order to avoid a redundant page out operation.
 */
#define MUST_ASK_PAGER(o, f, s)                                 \
	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)

#define PAGED_OUT(o, f) \
	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
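
/*
 * Usage sketch (condensed from the checks further down in this routine;
 * illustrative only): the pager is consulted only when it might actually
 * hold the data, e.g.
 *
 *	look_for_page = (object->pager_created &&
 *	    (MUST_ASK_PAGER(object, offset, external_state) == TRUE) &&
 *	    !data_supply);
 *
 * while PAGED_OUT(copy_object, copy_offset) is used later to skip a
 * redundant push into the copy object when the compressor already holds
 * that page.
 */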
978 #define RELEASE_PAGE(m) \
980 PAGE_WAKEUP_DONE(m); \
981 if ( !VM_PAGE_PAGEABLE(m)) { \
982 vm_page_lockspin_queues(); \
983 if ( !VM_PAGE_PAGEABLE(m)) { \
984 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
985 vm_page_deactivate(m); \
987 vm_page_activate(m); \
989 vm_page_unlock_queues(); \
994 dbgTrace(0xBEEF0002, (unsigned int) first_object
, (unsigned int) first_offset
); /* (TEST/DEBUG) */
997 interruptible
= fault_info
->interruptible
;
998 interruptible_state
= thread_interrupt_level(interruptible
);
1001 * INVARIANTS (through entire routine):
1003 * 1) At all times, we must either have the object
1004 * lock or a busy page in some object to prevent
1005 * some other thread from trying to bring in
1008 * Note that we cannot hold any locks during the
1009 * pager access or when waiting for memory, so
1010 * we use a busy page then.
1012 * 2) To prevent another thread from racing us down the
1013 * shadow chain and entering a new page in the top
1014 * object before we do, we must keep a busy page in
1015 * the top object while following the shadow chain.
1017 * 3) We must increment paging_in_progress on any object
1018 * for which we have a busy page before dropping
1021 * 4) We leave busy pages on the pageout queues.
1022 * If the pageout daemon comes across a busy page,
1023 * it will remove the page from the pageout queues.
1026 object
= first_object
;
1027 offset
= first_offset
;
1028 first_m
= VM_PAGE_NULL
;
1029 access_required
= fault_type
;
1032 * default type of fault
1034 my_fault
= DBG_CACHE_HIT_FAULT
;
1038 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1042 #if CONFIG_SECLUDED_MEMORY
1043 if (object
->can_grab_secluded
) {
1044 grab_options
|= VM_PAGE_GRAB_SECLUDED
;
1046 #endif /* CONFIG_SECLUDED_MEMORY */
1048 if (!object
->alive
) {
1050 * object is no longer valid
1051 * clean up and return error
1053 vm_fault_cleanup(object
, first_m
);
1054 thread_interrupt_level(interruptible_state
);
1056 return VM_FAULT_MEMORY_ERROR
;
1059 if (!object
->pager_created
&& object
->phys_contiguous
) {
1061 * A physically-contiguous object without a pager:
1062 * must be a "large page" object. We do not deal
1063 * with VM pages for this object.
1065 caller_lookup
= FALSE
;
1067 goto phys_contig_object
;
1070 if (object
->blocked_access
) {
1072 * Access to this VM object has been blocked.
1073 * Replace our "paging_in_progress" reference with
1074 * a "activity_in_progress" reference and wait for
1075 * access to be unblocked.
1077 caller_lookup
= FALSE
; /* no longer valid after sleep */
1078 vm_object_activity_begin(object
);
1079 vm_object_paging_end(object
);
1080 while (object
->blocked_access
) {
1081 vm_object_sleep(object
,
1082 VM_OBJECT_EVENT_UNBLOCKED
,
1085 vm_fault_page_blocked_access
++;
1086 vm_object_paging_begin(object
);
1087 vm_object_activity_end(object
);
1091 * See whether the page at 'offset' is resident
1093 if (caller_lookup
== TRUE
) {
1095 * The caller has already looked up the page
1096 * and gave us the result in "result_page".
1097 * We can use this for the first lookup but
1098 * it loses its validity as soon as we unlock
1102 caller_lookup
= FALSE
; /* no longer valid after that */
1104 m
= vm_page_lookup(object
, vm_object_trunc_page(offset
));
1107 dbgTrace(0xBEEF0004, (unsigned int) m
, (unsigned int) object
); /* (TEST/DEBUG) */
1109 if (m
!= VM_PAGE_NULL
) {
1112 * The page is being brought in,
1113 * wait for it and then retry.
1116 dbgTrace(0xBEEF0005, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
1118 wait_result
= PAGE_SLEEP(object
, m
, interruptible
);
1120 counter(c_vm_fault_page_block_busy_kernel
++);
1122 if (wait_result
!= THREAD_AWAKENED
) {
1123 vm_fault_cleanup(object
, first_m
);
1124 thread_interrupt_level(interruptible_state
);
1126 if (wait_result
== THREAD_RESTART
) {
1127 return VM_FAULT_RETRY
;
1129 return VM_FAULT_INTERRUPTED
;
1134 if (m
->vmp_laundry
) {
1135 m
->vmp_free_when_done
= FALSE
;
1137 if (!m
->vmp_cleaning
) {
1138 vm_pageout_steal_laundry(m
, FALSE
);
1141 if (VM_PAGE_GET_PHYS_PAGE(m
) == vm_page_guard_addr
) {
1143 * Guard page: off limits !
1145 if (fault_type
== VM_PROT_NONE
) {
1147 * The fault is not requesting any
1148 * access to the guard page, so it must
1149 * be just to wire or unwire it.
1150 * Let's pretend it succeeded...
1154 assert(first_m
== VM_PAGE_NULL
);
1155 *top_page
= first_m
;
1156 if (type_of_fault
) {
1157 *type_of_fault
= DBG_GUARD_FAULT
;
1159 thread_interrupt_level(interruptible_state
);
1160 return VM_FAULT_SUCCESS
;
1163 * The fault requests access to the
1164 * guard page: let's deny that !
1166 vm_fault_cleanup(object
, first_m
);
1167 thread_interrupt_level(interruptible_state
);
1168 return VM_FAULT_MEMORY_ERROR
;
1174 * The page is in error, give up now.
1177 dbgTrace(0xBEEF0006, (unsigned int) m
, (unsigned int) error_code
); /* (TEST/DEBUG) */
1180 *error_code
= KERN_MEMORY_ERROR
;
1184 vm_fault_cleanup(object
, first_m
);
1185 thread_interrupt_level(interruptible_state
);
1187 return VM_FAULT_MEMORY_ERROR
;
1189 if (m
->vmp_restart
) {
1191 * The pager wants us to restart
1192 * at the top of the chain,
1193 * typically because it has moved the
1194 * page to another pager, then do so.
1197 dbgTrace(0xBEEF0007, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
1201 vm_fault_cleanup(object
, first_m
);
1202 thread_interrupt_level(interruptible_state
);
1204 return VM_FAULT_RETRY
;
1206 if (m
->vmp_absent
) {
1208 * The page isn't busy, but is absent,
1209 * therefore it's deemed "unavailable".
1211 * Remove the non-existent page (unless it's
1212 * in the top object) and move on down to the
1213 * next object (if there is one).
1216 dbgTrace(0xBEEF0008, (unsigned int) m
, (unsigned int) object
->shadow
); /* (TEST/DEBUG) */
1218 next_object
= object
->shadow
;
1220 if (next_object
== VM_OBJECT_NULL
) {
1222 * Absent page at bottom of shadow
1223 * chain; zero fill the page we left
1224 * busy in the first object, and free
1227 assert(!must_be_resident
);
1230 * check for any conditions that prevent
1231 * us from creating a new zero-fill page
1232 * vm_fault_check will do all of the
1233 * fault cleanup in the case of an error condition
1234 * including resetting the thread_interrupt_level
1236 error
= vm_fault_check(object
, m
, first_m
, interruptible_state
, (type_of_fault
== NULL
) ? TRUE
: FALSE
);
1238 if (error
!= VM_FAULT_SUCCESS
) {
1242 if (object
!= first_object
) {
1244 * free the absent page we just found
1249 * drop reference and lock on current object
1251 vm_object_paging_end(object
);
1252 vm_object_unlock(object
);
1255 * grab the original page we
1256 * 'soldered' in place and
1257 * retake lock on 'first_object'
1260 first_m
= VM_PAGE_NULL
;
1262 object
= first_object
;
1263 offset
= first_offset
;
1265 vm_object_lock(object
);
1268 * we're going to use the absent page we just found
1269 * so convert it to a 'busy' page
1271 m
->vmp_absent
= FALSE
;
1274 if (fault_info
->mark_zf_absent
&& no_zero_fill
== TRUE
) {
1275 m
->vmp_absent
= TRUE
;
1278 * zero-fill the page and put it on
1279 * the correct paging queue
1281 my_fault
= vm_fault_zero_page(m
, no_zero_fill
);
1285 if (must_be_resident
) {
1286 vm_object_paging_end(object
);
1287 } else if (object
!= first_object
) {
1288 vm_object_paging_end(object
);
1292 m
->vmp_absent
= FALSE
;
1295 vm_page_lockspin_queues();
1296 vm_page_queues_remove(m
, FALSE
);
1297 vm_page_unlock_queues();
1300 offset
+= object
->vo_shadow_offset
;
1301 fault_info
->lo_offset
+= object
->vo_shadow_offset
;
1302 fault_info
->hi_offset
+= object
->vo_shadow_offset
;
1303 access_required
= VM_PROT_READ
;
1305 vm_object_lock(next_object
);
1306 vm_object_unlock(object
);
1307 object
= next_object
;
1308 vm_object_paging_begin(object
);
1311 * reset to default type of fault
1313 my_fault
= DBG_CACHE_HIT_FAULT
;
1318 if ((m
->vmp_cleaning
)
1319 && ((object
!= first_object
) || (object
->copy
!= VM_OBJECT_NULL
))
1320 && (fault_type
& VM_PROT_WRITE
)) {
1322 * This is a copy-on-write fault that will
1323 * cause us to revoke access to this page, but
1324 * this page is in the process of being cleaned
1325 * in a clustered pageout. We must wait until
1326 * the cleaning operation completes before
1327 * revoking access to the original page,
1328 * otherwise we might attempt to remove a
1332 dbgTrace(0xBEEF0009, (unsigned int) m
, (unsigned int) offset
); /* (TEST/DEBUG) */
1335 * take an extra ref so that object won't die
1337 vm_object_reference_locked(object
);
1339 vm_fault_cleanup(object
, first_m
);
1341 counter(c_vm_fault_page_block_backoff_kernel
++);
1342 vm_object_lock(object
);
1343 assert(object
->ref_count
> 0);
1345 m
= vm_page_lookup(object
, vm_object_trunc_page(offset
));
1347 if (m
!= VM_PAGE_NULL
&& m
->vmp_cleaning
) {
1348 PAGE_ASSERT_WAIT(m
, interruptible
);
1350 vm_object_unlock(object
);
1351 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
1352 vm_object_deallocate(object
);
1356 vm_object_unlock(object
);
1358 vm_object_deallocate(object
);
1359 thread_interrupt_level(interruptible_state
);
1361 return VM_FAULT_RETRY
;
1364 if (type_of_fault
== NULL
&& (m
->vmp_q_state
== VM_PAGE_ON_SPECULATIVE_Q
) &&
1365 !(fault_info
!= NULL
&& fault_info
->stealth
)) {
1367 * If we were passed a non-NULL pointer for
1368 * "type_of_fault", than we came from
1369 * vm_fault... we'll let it deal with
1370 * this condition, since it
1371 * needs to see m->vmp_speculative to correctly
1372 * account the pageins, otherwise...
1373 * take it off the speculative queue, we'll
1374 * let the caller of vm_fault_page deal
1375 * with getting it onto the correct queue
1377 * If the caller specified in fault_info that
1378 * it wants a "stealth" fault, we also leave
1379 * the page in the speculative queue.
1381 vm_page_lockspin_queues();
1382 if (m
->vmp_q_state
== VM_PAGE_ON_SPECULATIVE_Q
) {
1383 vm_page_queues_remove(m
, FALSE
);
1385 vm_page_unlock_queues();
1387 assert(object
== VM_PAGE_OBJECT(m
));
1389 if (object
->code_signed
) {
1392 * We just paged in a page from a signed
1393 * memory object but we don't need to
1394 * validate it now. We'll validate it if
1395 * when it gets mapped into a user address
1396 * space for the first time or when the page
1397 * gets copied to another object as a result
1398 * of a copy-on-write.
1403 * We mark the page busy and leave it on
1404 * the pageout queues. If the pageout
1405 * deamon comes across it, then it will
1406 * remove the page from the queue, but not the object
1409 dbgTrace(0xBEEF000B, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
1411 assert(!m
->vmp_busy
);
1412 assert(!m
->vmp_absent
);
1420 * we get here when there is no page present in the object at
1421 * the offset we're interested in... we'll allocate a page
1422 * at this point if the pager associated with
1423 * this object can provide the data or we're the top object...
1424 * object is locked; m == NULL
1427 if (must_be_resident
) {
1428 if (fault_type
== VM_PROT_NONE
&&
1429 object
== kernel_object
) {
1431 * We've been called from vm_fault_unwire()
1432 * while removing a map entry that was allocated
1433 * with KMA_KOBJECT and KMA_VAONLY. This page
1434 * is not present and there's nothing more to
1435 * do here (nothing to unwire).
1437 vm_fault_cleanup(object
, first_m
);
1438 thread_interrupt_level(interruptible_state
);
1440 return VM_FAULT_MEMORY_ERROR
;
1443 goto dont_look_for_page
;
1446 /* Don't expect to fault pages into the kernel object. */
1447 assert(object
!= kernel_object
);
1449 data_supply
= FALSE
;
1451 look_for_page
= (object
->pager_created
&& (MUST_ASK_PAGER(object
, offset
, external_state
) == TRUE
) && !data_supply
);
1454 dbgTrace(0xBEEF000C, (unsigned int) look_for_page
, (unsigned int) object
); /* (TEST/DEBUG) */
1456 if (!look_for_page
&& object
== first_object
&& !object
->phys_contiguous
) {
1458 * Allocate a new page for this object/offset pair as a placeholder
1460 m
= vm_page_grab_options(grab_options
);
1462 dbgTrace(0xBEEF000D, (unsigned int) m
, (unsigned int) object
); /* (TEST/DEBUG) */
1464 if (m
== VM_PAGE_NULL
) {
1465 vm_fault_cleanup(object
, first_m
);
1466 thread_interrupt_level(interruptible_state
);
1468 return VM_FAULT_MEMORY_SHORTAGE
;
1471 if (fault_info
&& fault_info
->batch_pmap_op
== TRUE
) {
1472 vm_page_insert_internal(m
, object
,
1473 vm_object_trunc_page(offset
),
1474 VM_KERN_MEMORY_NONE
, FALSE
, TRUE
, TRUE
, FALSE
, NULL
);
1476 vm_page_insert(m
, object
, vm_object_trunc_page(offset
));
1479 if (look_for_page
) {
1484 * If the memory manager is not ready, we
1485 * cannot make requests.
1487 if (!object
->pager_ready
) {
1489 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1491 if (m
!= VM_PAGE_NULL
) {
1496 * take an extra ref so object won't die
1498 vm_object_reference_locked(object
);
1499 vm_fault_cleanup(object
, first_m
);
1500 counter(c_vm_fault_page_block_backoff_kernel
++);
1502 vm_object_lock(object
);
1503 assert(object
->ref_count
> 0);
1505 if (!object
->pager_ready
) {
1506 wait_result
= vm_object_assert_wait(object
, VM_OBJECT_EVENT_PAGER_READY
, interruptible
);
1508 vm_object_unlock(object
);
1509 if (wait_result
== THREAD_WAITING
) {
1510 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
1512 vm_object_deallocate(object
);
1516 vm_object_unlock(object
);
1517 vm_object_deallocate(object
);
1518 thread_interrupt_level(interruptible_state
);
1520 return VM_FAULT_RETRY
;
1523 if (!object
->internal
&& !object
->phys_contiguous
&& object
->paging_in_progress
> vm_object_pagein_throttle
) {
1525 * If there are too many outstanding page
1526 * requests pending on this external object, we
1527 * wait for them to be resolved now.
1530 dbgTrace(0xBEEF0010, (unsigned int) m
, (unsigned int) 0); /* (TEST/DEBUG) */
1532 if (m
!= VM_PAGE_NULL
) {
1536 * take an extra ref so object won't die
1538 vm_object_reference_locked(object
);
1540 vm_fault_cleanup(object
, first_m
);
1542 counter(c_vm_fault_page_block_backoff_kernel
++);
1544 vm_object_lock(object
);
1545 assert(object
->ref_count
> 0);
1547 if (object
->paging_in_progress
>= vm_object_pagein_throttle
) {
1548 vm_object_assert_wait(object
, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS
, interruptible
);
1550 vm_object_unlock(object
);
1551 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
1552 vm_object_deallocate(object
);
1556 vm_object_unlock(object
);
1557 vm_object_deallocate(object
);
1558 thread_interrupt_level(interruptible_state
);
1560 return VM_FAULT_RETRY
;
1563 if (object
->internal
) {
1564 int compressed_count_delta
;
1566 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT
);
1568 if (m
== VM_PAGE_NULL
) {
1570 * Allocate a new page for this object/offset pair as a placeholder
1572 m
= vm_page_grab_options(grab_options
);
1574 dbgTrace(0xBEEF000D, (unsigned int) m
, (unsigned int) object
); /* (TEST/DEBUG) */
1576 if (m
== VM_PAGE_NULL
) {
1577 vm_fault_cleanup(object
, first_m
);
1578 thread_interrupt_level(interruptible_state
);
1580 return VM_FAULT_MEMORY_SHORTAGE
;
1583 m
->vmp_absent
= TRUE
;
1584 if (fault_info
&& fault_info
->batch_pmap_op
== TRUE
) {
1585 vm_page_insert_internal(m
, object
, vm_object_trunc_page(offset
), VM_KERN_MEMORY_NONE
, FALSE
, TRUE
, TRUE
, FALSE
, NULL
);
1587 vm_page_insert(m
, object
, vm_object_trunc_page(offset
));
1590 assert(m
->vmp_busy
);
1592 m
->vmp_absent
= TRUE
;
1593 pager
= object
->pager
;
1595 assert(object
->paging_in_progress
> 0);
1596 vm_object_unlock(object
);
1598 rc
= vm_compressor_pager_get(
1600 offset
+ object
->paging_offset
,
1601 VM_PAGE_GET_PHYS_PAGE(m
),
1604 &compressed_count_delta
);
1606 if (type_of_fault
== NULL
) {
1610 * we weren't called from vm_fault, so we
1611 * need to apply page creation throttling
1612 * do it before we re-acquire any locks
1614 if (my_fault_type
== DBG_COMPRESSOR_FAULT
) {
1615 if ((throttle_delay
= vm_page_throttled(TRUE
))) {
1616 VM_DEBUG_EVENT(vmf_compressordelay
, VMF_COMPRESSORDELAY
, DBG_FUNC_NONE
, throttle_delay
, 0, 1, 0);
1617 delay(throttle_delay
);
1621 vm_object_lock(object
);
1622 assert(object
->paging_in_progress
> 0);
1624 vm_compressor_pager_count(
1626 compressed_count_delta
,
1627 FALSE
, /* shared_lock */
1632 m
->vmp_absent
= FALSE
;
1633 m
->vmp_dirty
= TRUE
;
1634 if ((object
->wimg_bits
&
1636 VM_WIMG_USE_DEFAULT
) {
1638 * If the page is not cacheable,
1639 * we can't let its contents
1640 * linger in the data cache
1641 * after the decompression.
1643 pmap_sync_page_attributes_phys(
1644 VM_PAGE_GET_PHYS_PAGE(m
));
1646 m
->vmp_written_by_kernel
= TRUE
;
1650 * If the object is purgeable, its
1651 * owner's purgeable ledgers have been
1652 * updated in vm_page_insert() but the
1653 * page was also accounted for in a
1654 * "compressed purgeable" ledger, so
1657 if (((object
->purgable
!=
1658 VM_PURGABLE_DENY
) ||
1659 object
->vo_ledger_tag
) &&
1660 (object
->vo_owner
!=
1663 * One less compressed
1664 * purgeable/tagged page.
1666 vm_object_owner_compressed_update(
1672 case KERN_MEMORY_FAILURE
:
1673 m
->vmp_unusual
= TRUE
;
1674 m
->vmp_error
= TRUE
;
1675 m
->vmp_absent
= FALSE
;
1677 case KERN_MEMORY_ERROR
:
1678 assert(m
->vmp_absent
);
1681 panic("vm_fault_page(): unexpected "
1683 "vm_compressor_pager_get()\n",
1686 PAGE_WAKEUP_DONE(m
);
1689 goto data_requested
;
1691 my_fault_type
= DBG_PAGEIN_FAULT
;
1693 if (m
!= VM_PAGE_NULL
) {
1699 dbgTrace(0xBEEF0012, (unsigned int) object
, (unsigned int) 0); /* (TEST/DEBUG) */
1703 * It's possible someone called vm_object_destroy while we weren't
1704 * holding the object lock. If that has happened, then bail out
1708 pager
= object
->pager
;
1710 if (pager
== MEMORY_OBJECT_NULL
) {
1711 vm_fault_cleanup(object
, first_m
);
1712 thread_interrupt_level(interruptible_state
);
1713 return VM_FAULT_MEMORY_ERROR
;
1717 * We have an absent page in place for the faulting offset,
1718 * so we can release the object lock.
1721 if (object
->object_is_shared_cache
) {
1722 set_thread_rwlock_boost();
1725 vm_object_unlock(object
);
1728 * If this object uses a copy_call strategy,
1729 * and we are interested in a copy of this object
1730 * (having gotten here only by following a
1731 * shadow chain), then tell the memory manager
1732 * via a flag added to the desired_access
1733 * parameter, so that it can detect a race
1734 * between our walking down the shadow chain
1735 * and its pushing pages up into a copy of
1736 * the object that it manages.
1738 if (object
->copy_strategy
== MEMORY_OBJECT_COPY_CALL
&& object
!= first_object
) {
1739 wants_copy_flag
= VM_PROT_WANTS_COPY
;
1741 wants_copy_flag
= VM_PROT_NONE
;
1744 if (object
->copy
== first_object
) {
1746 * if we issue the memory_object_data_request in
1747 * this state, we are subject to a deadlock with
1748 * the underlying filesystem if it is trying to
1749 * shrink the file resulting in a push of pages
1750 * into the copy object... that push will stall
1751 * on the placeholder page, and if the pushing thread
1752 * is holding a lock that is required on the pagein
1753 * path (such as a truncate lock), we'll deadlock...
1754 * to avoid this potential deadlock, we throw away
1755 * our placeholder page before calling memory_object_data_request
1756 * and force this thread to retry the vm_fault_page after
1757 * we have issued the I/O. the second time through this path
1758 * we will find the page already in the cache (presumably still
1759 * busy waiting for the I/O to complete) and then complete
1760 * the fault w/o having to go through memory_object_data_request again
1762 assert(first_m
!= VM_PAGE_NULL
);
1763 assert(VM_PAGE_OBJECT(first_m
) == first_object
);
1765 vm_object_lock(first_object
);
1766 VM_PAGE_FREE(first_m
);
1767 vm_object_paging_end(first_object
);
1768 vm_object_unlock(first_object
);
1770 first_m
= VM_PAGE_NULL
;
1771 force_fault_retry
= TRUE
;
1773 vm_fault_page_forced_retry
++;
1776 if (data_already_requested
== TRUE
) {
1777 orig_behavior
= fault_info
->behavior
;
1778 orig_cluster_size
= fault_info
->cluster_size
;
1780 fault_info
->behavior
= VM_BEHAVIOR_RANDOM
;
1781 fault_info
->cluster_size
= PAGE_SIZE
;
1784 * Call the memory manager to retrieve the data.
1786 rc
= memory_object_data_request(
1788 vm_object_trunc_page(offset
) + object
->paging_offset
,
1790 access_required
| wants_copy_flag
,
1791 (memory_object_fault_info_t
)fault_info
);
1793 if (data_already_requested
== TRUE
) {
1794 fault_info
->behavior
= orig_behavior
;
1795 fault_info
->cluster_size
= orig_cluster_size
;
1797 data_already_requested
= TRUE
;
1800 DTRACE_VM2(maj_fault
, int, 1, (uint64_t *), NULL
);
1802 dbgTrace(0xBEEF0013, (unsigned int) object
, (unsigned int) rc
); /* (TEST/DEBUG) */
1804 vm_object_lock(object
);
1806 if (object
->object_is_shared_cache
) {
1807 clear_thread_rwlock_boost();
1811 if (rc
!= KERN_SUCCESS
) {
1812 vm_fault_cleanup(object
, first_m
);
1813 thread_interrupt_level(interruptible_state
);
1815 return (rc
== MACH_SEND_INTERRUPTED
) ?
1816 VM_FAULT_INTERRUPTED
:
1817 VM_FAULT_MEMORY_ERROR
;
1820 clock_usec_t tv_usec
;
1822 if (my_fault_type
== DBG_PAGEIN_FAULT
) {
1823 clock_get_system_microtime(&tv_sec
, &tv_usec
);
1824 current_thread()->t_page_creation_time
= tv_sec
;
1825 current_thread()->t_page_creation_count
= 0;
1828 if ((interruptible
!= THREAD_UNINT
) && (current_thread()->sched_flags
& TH_SFLAG_ABORT
)) {
1829 vm_fault_cleanup(object
, first_m
);
1830 thread_interrupt_level(interruptible_state
);
1832 return VM_FAULT_INTERRUPTED
;
1834 if (force_fault_retry
== TRUE
) {
1835 vm_fault_cleanup(object
, first_m
);
1836 thread_interrupt_level(interruptible_state
);
1838 return VM_FAULT_RETRY
;
1840 if (m
== VM_PAGE_NULL
&& object
->phys_contiguous
) {
1842 * No page here means that the object we
1843 * initially looked up was "physically
1844 * contiguous" (i.e. device memory). However,
1845 * with Virtual VRAM, the object might not
1846 * be backed by that device memory anymore,
1847 * so we're done here only if the object is
1848 * still "phys_contiguous".
1849 * Otherwise, if the object is no longer
1850 * "phys_contiguous", we need to retry the
1851 * page fault against the object's new backing
1852 * store (different memory object).
1858 * potentially a pagein fault
1859 * if we make it through the state checks
1860 * above, than we'll count it as such
1862 my_fault
= my_fault_type
;
1865 * Retry with same object/offset, since new data may
1866 * be in a different page (i.e., m is meaningless at
1873 * We get here if the object has no pager, or an existence map
1874 * exists and indicates the page isn't present on the pager
1875 * or we're unwiring a page. If a pager exists, but there
1876 * is no existence map, then the m->vmp_absent case above handles
1877 * the ZF case when the pager can't provide the page
1880 dbgTrace(0xBEEF0014, (unsigned int) object
, (unsigned int) m
); /* (TEST/DEBUG) */
1882 if (object
== first_object
) {
1885 assert(m
== VM_PAGE_NULL
);
1888 next_object
= object
->shadow
;
1890 if (next_object
== VM_OBJECT_NULL
) {
1892 * we've hit the bottom of the shadown chain,
1893 * fill the page in the top object with zeros.
1895 assert(!must_be_resident
);
1897 if (object
!= first_object
) {
1898 vm_object_paging_end(object
);
1899 vm_object_unlock(object
);
1901 object
= first_object
;
1902 offset
= first_offset
;
1903 vm_object_lock(object
);
1906 assert(VM_PAGE_OBJECT(m
) == object
);
1907 first_m
= VM_PAGE_NULL
;
1910 * check for any conditions that prevent
1911 * us from creating a new zero-fill page
1912 * vm_fault_check will do all of the
1913 * fault cleanup in the case of an error condition
1914 * including resetting the thread_interrupt_level
1916 error
= vm_fault_check(object
, m
, first_m
, interruptible_state
, (type_of_fault
== NULL
) ? TRUE
: FALSE
);
1918 if (error
!= VM_FAULT_SUCCESS
) {
1922 if (m
== VM_PAGE_NULL
) {
1923 m
= vm_page_grab_options(grab_options
);
1925 if (m
== VM_PAGE_NULL
) {
1926 vm_fault_cleanup(object
, VM_PAGE_NULL
);
1927 thread_interrupt_level(interruptible_state
);
1929 return VM_FAULT_MEMORY_SHORTAGE
;
1931 vm_page_insert(m
, object
, vm_object_trunc_page(offset
));
1933 if (fault_info
->mark_zf_absent
&& no_zero_fill
== TRUE
) {
1934 m
->vmp_absent
= TRUE
;
1937 my_fault
= vm_fault_zero_page(m
, no_zero_fill
);
1942 * Move on to the next object. Lock the next
1943 * object before unlocking the current one.
1945 if ((object
!= first_object
) || must_be_resident
) {
1946 vm_object_paging_end(object
);
1949 offset
+= object
->vo_shadow_offset
;
1950 fault_info
->lo_offset
+= object
->vo_shadow_offset
;
1951 fault_info
->hi_offset
+= object
->vo_shadow_offset
;
1952 access_required
= VM_PROT_READ
;
1954 vm_object_lock(next_object
);
1955 vm_object_unlock(object
);
1957 object
= next_object
;
1958 vm_object_paging_begin(object
);
1963 * PAGE HAS BEEN FOUND.
1966 * busy, so that we can play with it;
1967 * not absent, so that nobody else will fill it;
1968 * possibly eligible for pageout;
1970 * The top-level page (first_m) is:
1971 * VM_PAGE_NULL if the page was found in the
1973 * busy, not absent, and ineligible for pageout.
1975 * The current object (object) is locked. A paging
1976 * reference is held for the current and top-level
1981 dbgTrace(0xBEEF0015, (unsigned int) object
, (unsigned int) m
); /* (TEST/DEBUG) */
1983 #if EXTRA_ASSERTIONS
1984 assert(m
->vmp_busy
&& !m
->vmp_absent
);
1985 assert((first_m
== VM_PAGE_NULL
) ||
1986 (first_m
->vmp_busy
&& !first_m
->vmp_absent
&&
1987 !first_m
->vmp_active
&& !first_m
->vmp_inactive
&& !first_m
->vmp_secluded
));
1988 #endif /* EXTRA_ASSERTIONS */
    /*
     * If the page is being written, but isn't
     * already owned by the top-level object,
     * we have to copy it into a new page owned
     * by the top-level object.
     */
    if (object != first_object) {
#if TRACEFAULTPAGE
        dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
#endif
        if (fault_type & VM_PROT_WRITE) {
            vm_page_t copy_m;

            /*
             * We only really need to copy if we
             * want to write it.
             */
            assert(!must_be_resident);

            /*
             * If we try to collapse first_object at this
             * point, we may deadlock when we try to get
             * the lock on an intermediate object (since we
             * have the bottom object locked).  We can't
             * unlock the bottom object, because the page
             * we found may move (by collapse) if we do.
             *
             * Instead, we first copy the page.  Then, when
             * we have no more use for the bottom object,
             * we unlock it and try to collapse.
             *
             * Note that we copy the page even if we didn't
             * need to... that's the breaks.
             */

            /*
             * Allocate a page for the copy
             */
            copy_m = vm_page_grab_options(grab_options);

            if (copy_m == VM_PAGE_NULL) {
                RELEASE_PAGE(m);

                vm_fault_cleanup(object, first_m);
                thread_interrupt_level(interruptible_state);

                return VM_FAULT_MEMORY_SHORTAGE;
            }

            vm_page_copy(m, copy_m);

            /*
             * If another map is truly sharing this
             * page with us, we have to flush all
             * uses of the original page, since we
             * can't distinguish those which want the
             * original from those which need the
             * new copy.
             *
             * XXXO If we know that only one map has
             * access to this page, then we could
             * avoid the pmap_disconnect() call.
             */
            if (m->vmp_pmapped) {
                pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
            }

            if (m->vmp_clustered) {
                VM_PAGE_COUNT_AS_PAGEIN(m);
                VM_PAGE_CONSUME_CLUSTERED(m);
            }
            assert(!m->vmp_cleaning);

            /*
             * We no longer need the old page or object.
             */
            RELEASE_PAGE(m);

            /*
             * This check helps with marking the object as having a sequential pattern
             * Normally we'll miss doing this below because this fault is about COW to
             * the first_object i.e. bring page in from disk, push to object above but
             * don't update the file object's sequential pattern.
             */
            if (object->internal == FALSE) {
                vm_fault_is_sequential(object, offset, fault_info->behavior);
            }

            vm_object_paging_end(object);
            vm_object_unlock(object);

            my_fault = DBG_COW_FAULT;
            VM_STAT_INCR(cow_faults);
            DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
            current_task()->cow_faults++;

            object = first_object;
            offset = first_offset;

            vm_object_lock(object);
            /*
             * get rid of the place holder
             * page that we soldered in earlier
             */
            VM_PAGE_FREE(first_m);
            first_m = VM_PAGE_NULL;

            /*
             * and replace it with the
             * page we just copied into
             */
            assert(copy_m->vmp_busy);
            vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
            SET_PAGE_DIRTY(copy_m, TRUE);
            m = copy_m;

            /*
             * Now that we've gotten the copy out of the
             * way, let's try to collapse the top object.
             * But we have to play ugly games with
             * paging_in_progress to do that...
             */
            vm_object_paging_end(object);
            vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
            vm_object_paging_begin(object);
        } else {
            *protection &= (~VM_PROT_WRITE);
        }
    }
2120 * Now check whether the page needs to be pushed into the
2121 * copy object. The use of asymmetric copy on write for
2122 * shared temporary objects means that we may do two copies to
2123 * satisfy the fault; one above to get the page from a
2124 * shadowed object, and one here to push it into the copy.
2126 try_failed_count
= 0;
2128 while ((copy_object
= first_object
->copy
) != VM_OBJECT_NULL
) {
2129 vm_object_offset_t copy_offset
;
2133 dbgTrace(0xBEEF0017, (unsigned int) copy_object
, (unsigned int) fault_type
); /* (TEST/DEBUG) */
2136 * If the page is being written, but hasn't been
2137 * copied to the copy-object, we have to copy it there.
2139 if ((fault_type
& VM_PROT_WRITE
) == 0) {
2140 *protection
&= ~VM_PROT_WRITE
;
2145 * If the page was guaranteed to be resident,
2146 * we must have already performed the copy.
2148 if (must_be_resident
) {
2153 * Try to get the lock on the copy_object.
2155 if (!vm_object_lock_try(copy_object
)) {
2156 vm_object_unlock(object
);
2159 mutex_pause(try_failed_count
); /* wait a bit */
2160 vm_object_lock(object
);
2164 try_failed_count
= 0;
2167 * Make another reference to the copy-object,
2168 * to keep it from disappearing during the
2171 vm_object_reference_locked(copy_object
);
2174 * Does the page exist in the copy?
2176 copy_offset
= first_offset
- copy_object
->vo_shadow_offset
;
2177 copy_offset
= vm_object_trunc_page(copy_offset
);
2179 if (copy_object
->vo_size
<= copy_offset
) {
2181 * Copy object doesn't cover this page -- do nothing.
2184 } else if ((copy_m
= vm_page_lookup(copy_object
, copy_offset
)) != VM_PAGE_NULL
) {
2186 * Page currently exists in the copy object
2188 if (copy_m
->vmp_busy
) {
2190 * If the page is being brought
2191 * in, wait for it and then retry.
2196 * take an extra ref so object won't die
2198 vm_object_reference_locked(copy_object
);
2199 vm_object_unlock(copy_object
);
2200 vm_fault_cleanup(object
, first_m
);
2201 counter(c_vm_fault_page_block_backoff_kernel
++);
2203 vm_object_lock(copy_object
);
2204 assert(copy_object
->ref_count
> 0);
2205 VM_OBJ_RES_DECR(copy_object
);
2206 vm_object_lock_assert_exclusive(copy_object
);
2207 copy_object
->ref_count
--;
2208 assert(copy_object
->ref_count
> 0);
2209 copy_m
= vm_page_lookup(copy_object
, copy_offset
);
2211 if (copy_m
!= VM_PAGE_NULL
&& copy_m
->vmp_busy
) {
2212 PAGE_ASSERT_WAIT(copy_m
, interruptible
);
2214 vm_object_unlock(copy_object
);
2215 wait_result
= thread_block(THREAD_CONTINUE_NULL
);
2216 vm_object_deallocate(copy_object
);
2220 vm_object_unlock(copy_object
);
2221 vm_object_deallocate(copy_object
);
2222 thread_interrupt_level(interruptible_state
);
2224 return VM_FAULT_RETRY
;
2227 } else if (!PAGED_OUT(copy_object
, copy_offset
)) {
2229 * If PAGED_OUT is TRUE, then the page used to exist
2230 * in the copy-object, and has already been paged out.
2231 * We don't need to repeat this. If PAGED_OUT is
2232 * FALSE, then either we don't know (!pager_created,
2233 * for example) or it hasn't been paged out.
2234 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2235 * We must copy the page to the copy object.
2237 * Allocate a page for the copy
2239 copy_m
= vm_page_alloc(copy_object
, copy_offset
);
2241 if (copy_m
== VM_PAGE_NULL
) {
2244 VM_OBJ_RES_DECR(copy_object
);
2245 vm_object_lock_assert_exclusive(copy_object
);
2246 copy_object
->ref_count
--;
2247 assert(copy_object
->ref_count
> 0);
2249 vm_object_unlock(copy_object
);
2250 vm_fault_cleanup(object
, first_m
);
2251 thread_interrupt_level(interruptible_state
);
2253 return VM_FAULT_MEMORY_SHORTAGE
;
2256 * Must copy page into copy-object.
2258 vm_page_copy(m
, copy_m
);
2261 * If the old page was in use by any users
2262 * of the copy-object, it must be removed
2263 * from all pmaps. (We can't know which
2266 if (m
->vmp_pmapped
) {
2267 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m
));
2270 if (m
->vmp_clustered
) {
2271 VM_PAGE_COUNT_AS_PAGEIN(m
);
2272 VM_PAGE_CONSUME_CLUSTERED(m
);
2275 * If there's a pager, then immediately
2276 * page out this page, using the "initialize"
2277 * option. Else, we use the copy.
2279 if ((!copy_object
->pager_ready
)
2280 || VM_COMPRESSOR_PAGER_STATE_GET(copy_object
, copy_offset
) == VM_EXTERNAL_STATE_ABSENT
2282 vm_page_lockspin_queues();
2283 assert(!m
->vmp_cleaning
);
2284 vm_page_activate(copy_m
);
2285 vm_page_unlock_queues();
2287 SET_PAGE_DIRTY(copy_m
, TRUE
);
2288 PAGE_WAKEUP_DONE(copy_m
);
2290 assert(copy_m
->vmp_busy
== TRUE
);
2291 assert(!m
->vmp_cleaning
);
2294 * dirty is protected by the object lock
2296 SET_PAGE_DIRTY(copy_m
, TRUE
);
2299 * The page is already ready for pageout:
2300 * not on pageout queues and busy.
2301 * Unlock everything except the
2302 * copy_object itself.
2304 vm_object_unlock(object
);
2307 * Write the page to the copy-object,
2308 * flushing it from the kernel.
2310 vm_pageout_initialize_page(copy_m
);
2313 * Since the pageout may have
2314 * temporarily dropped the
2315 * copy_object's lock, we
2316 * check whether we'll have
2317 * to deallocate the hard way.
2319 if ((copy_object
->shadow
!= object
) || (copy_object
->ref_count
== 1)) {
2320 vm_object_unlock(copy_object
);
2321 vm_object_deallocate(copy_object
);
2322 vm_object_lock(object
);
2327 * Pick back up the old object's
2328 * lock. [It is safe to do so,
2329 * since it must be deeper in the
2332 vm_object_lock(object
);
2336 * Because we're pushing a page upward
2337 * in the object tree, we must restart
2338 * any faults that are waiting here.
2339 * [Note that this is an expansion of
2340 * PAGE_WAKEUP that uses the THREAD_RESTART
2341 * wait result]. Can't turn off the page's
2342 * busy bit because we're not done with it.
2344 if (m
->vmp_wanted
) {
2345 m
->vmp_wanted
= FALSE
;
2346 thread_wakeup_with_result((event_t
) m
, THREAD_RESTART
);
2350 * The reference count on copy_object must be
2351 * at least 2: one for our extra reference,
2352 * and at least one from the outside world
2353 * (we checked that when we last locked
2356 vm_object_lock_assert_exclusive(copy_object
);
2357 copy_object
->ref_count
--;
2358 assert(copy_object
->ref_count
> 0);
2360 VM_OBJ_RES_DECR(copy_object
);
2361 vm_object_unlock(copy_object
);
2368 *top_page
= first_m
;
2370 if (m
!= VM_PAGE_NULL
) {
2371 assert(VM_PAGE_OBJECT(m
) == object
);
2373 retval
= VM_FAULT_SUCCESS
;
2375 if (my_fault
== DBG_PAGEIN_FAULT
) {
2376 VM_PAGE_COUNT_AS_PAGEIN(m
);
2378 if (object
->internal
) {
2379 my_fault
= DBG_PAGEIND_FAULT
;
2381 my_fault
= DBG_PAGEINV_FAULT
;
2385 * evaluate access pattern and update state
2386 * vm_fault_deactivate_behind depends on the
2387 * state being up to date
2389 vm_fault_is_sequential(object
, offset
, fault_info
->behavior
);
2390 vm_fault_deactivate_behind(object
, offset
, fault_info
->behavior
);
2391 } else if (type_of_fault
== NULL
&& my_fault
== DBG_CACHE_HIT_FAULT
) {
2393 * we weren't called from vm_fault, so handle the
2394 * accounting here for hits in the cache
2396 if (m
->vmp_clustered
) {
2397 VM_PAGE_COUNT_AS_PAGEIN(m
);
2398 VM_PAGE_CONSUME_CLUSTERED(m
);
2400 vm_fault_is_sequential(object
, offset
, fault_info
->behavior
);
2401 vm_fault_deactivate_behind(object
, offset
, fault_info
->behavior
);
2402 } else if (my_fault
== DBG_COMPRESSOR_FAULT
|| my_fault
== DBG_COMPRESSOR_SWAPIN_FAULT
) {
2403 VM_STAT_DECOMPRESSIONS();
2405 if (type_of_fault
) {
2406 *type_of_fault
= my_fault
;
2409 retval
= VM_FAULT_SUCCESS_NO_VM_PAGE
;
2410 assert(first_m
== VM_PAGE_NULL
);
2411 assert(object
== first_object
);
2414 thread_interrupt_level(interruptible_state
);
2417 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS
, 0); /* (TEST/DEBUG) */
2422 thread_interrupt_level(interruptible_state
);
2424 if (wait_result
== THREAD_INTERRUPTED
) {
2425 return VM_FAULT_INTERRUPTED
;
2427 return VM_FAULT_RETRY
;
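
/*
 * Code-signing bookkeeping for the fault path:
 * cs_enter_tainted_rejected counts pages refused at fault time because of a
 * code-signing violation; cs_enter_tainted_accepted counts tainted pages the
 * faulting process chose to accept anyway (both are updated in
 * vm_fault_cs_handle_violation() below).
 */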
extern int panic_on_cs_killed;
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);

unsigned long cs_enter_tainted_rejected = 0;
unsigned long cs_enter_tainted_accepted = 0;
/*
 * When soft faulting a page, we have to validate the page if:
 * 1. the page is being mapped in user space
 * 2. the page hasn't already been found to be "tainted"
 * 3. the page belongs to a code-signed object
 * 4. the page has not been validated yet or has been mapped for write.
 */
static bool
vm_fault_cs_need_validation(
    pmap_t pmap,
    vm_page_t page,
    vm_object_t page_obj,
    vm_map_size_t fault_page_size,
    vm_map_offset_t fault_phys_offset)
{
    if (pmap == kernel_pmap) {
        /* 1 - not user space */
        return false;
    }
    if (!page_obj->code_signed) {
        /* 3 - page does not belong to a code-signed object */
        return false;
    }
    if (fault_page_size == PAGE_SIZE) {
        /* looking at the whole page */
        assertf(fault_phys_offset == 0,
            "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
            (uint64_t)fault_page_size,
            (uint64_t)fault_phys_offset);
        if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
            /* 2 - page is all tainted */
            return false;
        }
        if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
            !page->vmp_wpmapped) {
            /* 4 - already fully validated and never mapped writable */
            return false;
        }
    } else {
        /* looking at a specific sub-page */
        if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
            /* 2 - sub-page was already marked as tainted */
            return false;
        }
        if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
            !page->vmp_wpmapped) {
            /* 4 - already validated and never mapped writable */
            return false;
        }
    }
    /* page needs to be validated */
    return true;
}
static bool
vm_fault_cs_page_immutable(
    vm_page_t m,
    vm_map_size_t fault_page_size,
    vm_map_offset_t fault_phys_offset,
    vm_prot_t prot __unused)
{
    if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
        /*&& ((prot) & VM_PROT_EXECUTE)*/) {
        return true;
    }
    return false;
}

static bool
vm_fault_cs_page_nx(
    vm_page_t m,
    vm_map_size_t fault_page_size,
    vm_map_offset_t fault_phys_offset)
{
    return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
}
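
/*
 * The three helpers above (need_validation / page_immutable / page_nx) are
 * consulted by vm_fault_cs_check_violation() below: need_validation decides
 * whether vm_page_validate_cs() must run for this fault, while the immutable
 * and NX checks feed the write- and execute-protection decisions.
 */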
/*
 * Check if the page being entered into the pmap violates code signing.
 */
static kern_return_t
vm_fault_cs_check_violation(
    bool cs_bypass,
    vm_object_t object,
    vm_page_t m,
    pmap_t pmap,
    vm_prot_t prot,
    vm_prot_t caller_prot,
    vm_map_size_t fault_page_size,
    vm_map_offset_t fault_phys_offset,
    vm_object_fault_info_t fault_info,
    bool map_is_switched,
    bool map_is_switch_protected,
    bool *cs_violation)
{
#if !PMAP_CS
#pragma unused(caller_prot)
#pragma unused(fault_info)
#endif /* !PMAP_CS */
    int cs_enforcement_enabled;

    if (!cs_bypass &&
        vm_fault_cs_need_validation(pmap, m, object,
        fault_page_size, fault_phys_offset)) {
2544 vm_object_lock_assert_exclusive(object
);
2546 if (VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
)) {
2547 vm_cs_revalidates
++;
2550 /* VM map is locked, so 1 ref will remain on VM object -
2551 * so no harm if vm_page_validate_cs drops the object lock */
2554 if (fault_info
->pmap_cs_associated
&&
2555 pmap_cs_enforced(pmap
) &&
2556 !VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
) &&
2557 !VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
) &&
2558 !VMP_CS_NX(m
, fault_page_size
, fault_phys_offset
) &&
2559 (prot
& VM_PROT_EXECUTE
) &&
2560 (caller_prot
& VM_PROT_EXECUTE
)) {
2562 * With pmap_cs, the pmap layer will validate the
2563 * code signature for any executable pmap mapping.
2564 * No need for us to validate this page too:
2565 * in pmap_cs we trust...
2567 vm_cs_defer_to_pmap_cs
++;
2569 vm_cs_defer_to_pmap_cs_not
++;
2570 vm_page_validate_cs(m
, fault_page_size
, fault_phys_offset
);
2573 vm_page_validate_cs(m
, fault_page_size
, fault_phys_offset
);
2574 #endif /* PMAP_CS */
2577 /* If the map is switched, and is switch-protected, we must protect
2578 * some pages from being write-faulted: immutable pages because by
2579 * definition they may not be written, and executable pages because that
2580 * would provide a way to inject unsigned code.
2581 * If the page is immutable, we can simply return. However, we can't
2582 * immediately determine whether a page is executable anywhere. But,
2583 * we can disconnect it everywhere and remove the executable protection
2584 * from the current map. We do that below right before we do the
2587 if (pmap
== kernel_pmap
) {
2588 /* kernel fault: cs_enforcement does not apply */
2589 cs_enforcement_enabled
= 0;
2591 cs_enforcement_enabled
= pmap_get_vm_map_cs_enforced(pmap
);
2594 if (cs_enforcement_enabled
&& map_is_switched
&&
2595 map_is_switch_protected
&&
2596 vm_fault_cs_page_immutable(m
, fault_page_size
, fault_phys_offset
, prot
) &&
2597 (prot
& VM_PROT_WRITE
)) {
2598 return KERN_CODESIGN_ERROR
;
2601 if (cs_enforcement_enabled
&&
2602 vm_fault_cs_page_nx(m
, fault_page_size
, fault_phys_offset
) &&
2603 (prot
& VM_PROT_EXECUTE
)) {
2605 printf("page marked to be NX, not letting it be mapped EXEC\n");
2607 return KERN_CODESIGN_ERROR
;
    /* A page could be tainted, or pose a risk of being tainted later.
     * Check whether the receiving process wants it, and make it feel
     * the consequences (that happens in cs_invalid_page()).
     * For CS Enforcement, two other conditions will
     * cause that page to be tainted as well:
     * - pmapping an unsigned page executable - this means unsigned code;
     * - writeable mapping of a validated page - the content of that page
     *   can be changed without the kernel noticing, therefore unsigned
     *   code can be created
     */
2621 /* code-signing is bypassed */
2622 *cs_violation
= FALSE
;
2623 } else if (VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
)) {
2625 *cs_violation
= TRUE
;
2626 } else if (!cs_enforcement_enabled
) {
2627 /* no further code-signing enforcement */
2628 *cs_violation
= FALSE
;
2629 } else if (vm_fault_cs_page_immutable(m
, fault_page_size
, fault_phys_offset
, prot
) &&
2630 ((prot
& VM_PROT_WRITE
) ||
2633 * The page should be immutable, but is in danger of being
2635 * This is the case where we want policy from the code
2636 * directory - is the page immutable or not? For now we have
2637 * to assume that code pages will be immutable, data pages not.
2638 * We'll assume a page is a code page if it has a code directory
2639 * and we fault for execution.
2640 * That is good enough since if we faulted the code page for
2641 * writing in another map before, it is wpmapped; if we fault
2642 * it for writing in this map later it will also be faulted for
2643 * executing at the same time; and if we fault for writing in
2644 * another map later, we will disconnect it from this pmap so
2645 * we'll notice the change.
2647 *cs_violation
= TRUE
;
2648 } else if (!VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
) &&
2649 (prot
& VM_PROT_EXECUTE
)
2652 * Executable pages will be validated by pmap_cs;
2653 * in pmap_cs we trust...
2654 * If pmap_cs is turned off, this is a code-signing
2657 && !(pmap_cs_enforced(pmap
))
2658 #endif /* PMAP_CS */
2660 *cs_violation
= TRUE
;
2662 *cs_violation
= FALSE
;
2664 return KERN_SUCCESS
;
2668 * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2669 * @param must_disconnect This value will be set to true if the caller must disconnect
2671 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2673 static kern_return_t
2674 vm_fault_cs_handle_violation(
2679 vm_map_offset_t vaddr
,
2680 vm_map_size_t fault_page_size
,
2681 vm_map_offset_t fault_phys_offset
,
2682 bool map_is_switched
,
2683 bool map_is_switch_protected
,
2684 bool *must_disconnect
)
2687 #pragma unused(pmap)
2688 #pragma unused(map_is_switch_protected)
2689 #endif /* !MACH_ASSERT */
2691 * We will have a tainted page. Have to handle the special case
2692 * of a switched map now. If the map is not switched, standard
2693 * procedure applies - call cs_invalid_page().
2694 * If the map is switched, the real owner is invalid already.
2695 * There is no point in invalidating the switching process since
2696 * it will not be executing from the map. So we don't call
2697 * cs_invalid_page() in that case.
2699 boolean_t reject_page
, cs_killed
;
2701 if (map_is_switched
) {
2702 assert(pmap
== vm_map_pmap(current_thread()->map
));
2703 assert(!(prot
& VM_PROT_WRITE
) || (map_is_switch_protected
== FALSE
));
2704 reject_page
= FALSE
;
2707 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2708 object
->code_signed
? "yes" : "no",
2709 VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
) ? "yes" : "no",
2710 VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
) ? "yes" : "no",
2711 m
->vmp_wpmapped
? "yes" : "no",
2714 reject_page
= cs_invalid_page((addr64_t
) vaddr
, &cs_killed
);
2718 /* reject the invalid page: abort the page fault */
2720 const char *procname
;
2722 vm_object_t file_object
, shadow
;
2723 vm_object_offset_t file_offset
;
2724 char *pathname
, *filename
;
2725 vm_size_t pathname_len
, filename_len
;
2726 boolean_t truncated_path
;
2727 #define __PATH_MAX 1024
2728 struct timespec mtime
, cs_mtime
;
2730 os_reason_t codesigning_exit_reason
= OS_REASON_NULL
;
2732 kr
= KERN_CODESIGN_ERROR
;
2733 cs_enter_tainted_rejected
++;
2735 /* get process name and pid */
2737 task
= current_task();
2738 pid
= proc_selfpid();
2739 if (task
->bsd_info
!= NULL
) {
2740 procname
= proc_name_address(task
->bsd_info
);
2743 /* get file's VM object */
2744 file_object
= object
;
2745 file_offset
= m
->vmp_offset
;
2746 for (shadow
= file_object
->shadow
,
2748 shadow
!= VM_OBJECT_NULL
;
2749 shadow
= file_object
->shadow
,
2751 vm_object_lock_shared(shadow
);
2752 if (file_object
!= object
) {
2753 vm_object_unlock(file_object
);
2755 file_offset
+= file_object
->vo_shadow_offset
;
2756 file_object
= shadow
;
2761 cs_mtime
.tv_sec
= 0;
2762 cs_mtime
.tv_nsec
= 0;
2764 /* get file's pathname and/or filename */
2769 truncated_path
= FALSE
;
2770 /* no pager -> no file -> no pathname, use "<nil>" in that case */
2771 if (file_object
->pager
!= NULL
) {
2772 pathname
= kheap_alloc(KHEAP_TEMP
, __PATH_MAX
* 2, Z_WAITOK
);
2775 pathname_len
= __PATH_MAX
;
2776 filename
= pathname
+ pathname_len
;
2777 filename_len
= __PATH_MAX
;
2779 if (vnode_pager_get_object_name(file_object
->pager
,
2784 &truncated_path
) == KERN_SUCCESS
) {
2785 /* safety first... */
2786 pathname
[__PATH_MAX
- 1] = '\0';
2787 filename
[__PATH_MAX
- 1] = '\0';
2789 vnode_pager_get_object_mtime(file_object
->pager
,
2793 kheap_free(KHEAP_TEMP
, pathname
, __PATH_MAX
* 2);
2798 truncated_path
= FALSE
;
2802 printf("CODE SIGNING: process %d[%s]: "
2803 "rejecting invalid page at address 0x%llx "
2804 "from offset 0x%llx in file \"%s%s%s\" "
2805 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2806 "(signed:%d validated:%d tainted:%d nx:%d "
2807 "wpmapped:%d dirty:%d depth:%d)\n",
2808 pid
, procname
, (addr64_t
) vaddr
,
2810 (pathname
? pathname
: "<nil>"),
2811 (truncated_path
? "/.../" : ""),
2812 (truncated_path
? filename
: ""),
2813 cs_mtime
.tv_sec
, cs_mtime
.tv_nsec
,
2814 ((cs_mtime
.tv_sec
== mtime
.tv_sec
&&
2815 cs_mtime
.tv_nsec
== mtime
.tv_nsec
)
2818 mtime
.tv_sec
, mtime
.tv_nsec
,
2819 object
->code_signed
,
2820 VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
),
2821 VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
),
2822 VMP_CS_NX(m
, fault_page_size
, fault_phys_offset
),
2828 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2829 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2830 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2831 * will deal with the segmentation fault.
2834 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC
, BSD_PROC_EXITREASON_CREATE
) | DBG_FUNC_NONE
,
2835 pid
, OS_REASON_CODESIGNING
, CODESIGNING_EXIT_REASON_INVALID_PAGE
, 0, 0);
2837 codesigning_exit_reason
= os_reason_create(OS_REASON_CODESIGNING
, CODESIGNING_EXIT_REASON_INVALID_PAGE
);
2838 if (codesigning_exit_reason
== NULL
) {
2839 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2841 mach_vm_address_t data_addr
= 0;
2842 struct codesigning_exit_reason_info
*ceri
= NULL
;
2843 uint32_t reason_buffer_size_estimate
= kcdata_estimate_required_buffer_size(1, sizeof(*ceri
));
2845 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason
, reason_buffer_size_estimate
)) {
2846 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2848 if (KERN_SUCCESS
== kcdata_get_memory_addr(&codesigning_exit_reason
->osr_kcd_descriptor
,
2849 EXIT_REASON_CODESIGNING_INFO
, sizeof(*ceri
), &data_addr
)) {
2850 ceri
= (struct codesigning_exit_reason_info
*)data_addr
;
2851 static_assert(__PATH_MAX
== sizeof(ceri
->ceri_pathname
));
2853 ceri
->ceri_virt_addr
= vaddr
;
2854 ceri
->ceri_file_offset
= file_offset
;
2856 strncpy((char *)&ceri
->ceri_pathname
, pathname
, sizeof(ceri
->ceri_pathname
));
2858 ceri
->ceri_pathname
[0] = '\0';
2861 strncpy((char *)&ceri
->ceri_filename
, filename
, sizeof(ceri
->ceri_filename
));
2863 ceri
->ceri_filename
[0] = '\0';
2865 ceri
->ceri_path_truncated
= (truncated_path
? 1 : 0);
2866 ceri
->ceri_codesig_modtime_secs
= cs_mtime
.tv_sec
;
2867 ceri
->ceri_codesig_modtime_nsecs
= cs_mtime
.tv_nsec
;
2868 ceri
->ceri_page_modtime_secs
= mtime
.tv_sec
;
2869 ceri
->ceri_page_modtime_nsecs
= mtime
.tv_nsec
;
2870 ceri
->ceri_object_codesigned
= (object
->code_signed
);
2871 ceri
->ceri_page_codesig_validated
= VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
);
2872 ceri
->ceri_page_codesig_tainted
= VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
);
2873 ceri
->ceri_page_codesig_nx
= VMP_CS_NX(m
, fault_page_size
, fault_phys_offset
);
2874 ceri
->ceri_page_wpmapped
= (m
->vmp_wpmapped
);
2875 ceri
->ceri_page_slid
= 0;
2876 ceri
->ceri_page_dirty
= (m
->vmp_dirty
);
2877 ceri
->ceri_page_shadow_depth
= shadow_depth
;
2879 #if DEBUG || DEVELOPMENT
2880 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2882 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2883 #endif /* DEBUG || DEVELOPMENT */
2884 /* Free the buffer */
2885 os_reason_alloc_buffer_noblock(codesigning_exit_reason
, 0);
2890 set_thread_exit_reason(current_thread(), codesigning_exit_reason
, FALSE
);
2892 if (panic_on_cs_killed
&&
2893 object
->object_is_shared_cache
) {
2894 char *tainted_contents
;
2895 vm_map_offset_t src_vaddr
;
2896 src_vaddr
= (vm_map_offset_t
) phystokv((pmap_paddr_t
)VM_PAGE_GET_PHYS_PAGE(m
) << PAGE_SHIFT
);
2897 tainted_contents
= kalloc(PAGE_SIZE
);
2898 bcopy((const char *)src_vaddr
, tainted_contents
, PAGE_SIZE
);
2899 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m
, VM_PAGE_GET_PHYS_PAGE(m
), (uint64_t)src_vaddr
, tainted_contents
);
2900 panic("CODE SIGNING: process %d[%s]: "
2901 "rejecting invalid page (phys#0x%x) at address 0x%llx "
2902 "from offset 0x%llx in file \"%s%s%s\" "
2903 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2904 "(signed:%d validated:%d tainted:%d nx:%d"
2905 "wpmapped:%d dirty:%d depth:%d)\n",
2907 VM_PAGE_GET_PHYS_PAGE(m
),
2910 (pathname
? pathname
: "<nil>"),
2911 (truncated_path
? "/.../" : ""),
2912 (truncated_path
? filename
: ""),
2913 cs_mtime
.tv_sec
, cs_mtime
.tv_nsec
,
2914 ((cs_mtime
.tv_sec
== mtime
.tv_sec
&&
2915 cs_mtime
.tv_nsec
== mtime
.tv_nsec
)
2918 mtime
.tv_sec
, mtime
.tv_nsec
,
2919 object
->code_signed
,
2920 VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
),
2921 VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
),
2922 VMP_CS_NX(m
, fault_page_size
, fault_phys_offset
),
2928 if (file_object
!= object
) {
2929 vm_object_unlock(file_object
);
2931 if (pathname_len
!= 0) {
2932 kheap_free(KHEAP_TEMP
, pathname
, __PATH_MAX
* 2);
2937 /* proceed with the invalid page */
2939 if (!VMP_CS_VALIDATED(m
, fault_page_size
, fault_phys_offset
) &&
2940 !object
->code_signed
) {
2942 * This page has not been (fully) validated but
2943 * does not belong to a code-signed object
2944 * so it should not be forcefully considered
2946 * We're just concerned about it here because
2947 * we've been asked to "execute" it but that
2948 * does not mean that it should cause other
2950 * This happens when a debugger sets a
2951 * breakpoint and we then execute code in
2952 * that page. Marking the page as "tainted"
2953 * would cause any inspection tool ("leaks",
2954 * "vmmap", "CrashReporter", ...) to get killed
2955 * due to code-signing violation on that page,
2956 * even though they're just reading it and not
2957 * executing from it.
2961 * Page might have been tainted before or not;
2962 * now it definitively is. If the page wasn't
2963 * tainted, we must disconnect it from all
2964 * pmaps later, to force existing mappings
2965 * through that code path for re-consideration
2966 * of the validity of that page.
2968 if (!VMP_CS_TAINTED(m
, fault_page_size
, fault_phys_offset
)) {
2969 *must_disconnect
= TRUE
;
2970 VMP_CS_SET_TAINTED(m
, fault_page_size
, fault_phys_offset
, TRUE
);
2973 cs_enter_tainted_accepted
++;
2975 if (kr
!= KERN_SUCCESS
) {
2977 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2978 "*** INVALID PAGE ***\n",
2982 if (cs_enforcement_panic
) {
2983 panic("CODESIGNING: panicking on invalid page\n");
2991 * Check that the code signature is valid for the given page being inserted into
2994 * @param must_disconnect This value will be set to true if the caller must disconnect
2996 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2998 static kern_return_t
2999 vm_fault_validate_cs(
3004 vm_map_offset_t vaddr
,
3006 vm_prot_t caller_prot
,
3007 vm_map_size_t fault_page_size
,
3008 vm_map_offset_t fault_phys_offset
,
3009 vm_object_fault_info_t fault_info
,
3010 bool *must_disconnect
)
3012 bool map_is_switched
, map_is_switch_protected
, cs_violation
;
3014 /* Validate code signature if necessary. */
3015 map_is_switched
= ((pmap
!= vm_map_pmap(current_task()->map
)) &&
3016 (pmap
== vm_map_pmap(current_thread()->map
)));
3017 map_is_switch_protected
= current_thread()->map
->switch_protect
;
3018 kr
= vm_fault_cs_check_violation(cs_bypass
, object
, m
, pmap
,
3019 prot
, caller_prot
, fault_page_size
, fault_phys_offset
, fault_info
,
3020 map_is_switched
, map_is_switch_protected
, &cs_violation
);
3021 if (kr
!= KERN_SUCCESS
) {
3025 kr
= vm_fault_cs_handle_violation(object
, m
, pmap
, prot
, vaddr
,
3026 fault_page_size
, fault_phys_offset
,
3027 map_is_switched
, map_is_switch_protected
, must_disconnect
);
3033 * Enqueue the page on the appropriate paging queue.
3036 vm_fault_enqueue_page(
3046 assert((m
->vmp_q_state
== VM_PAGE_USED_BY_COMPRESSOR
) || object
!= compressor_object
);
3047 boolean_t page_queues_locked
= FALSE
;
3048 boolean_t previously_pmapped
= m
->vmp_pmapped
;
3049 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
3051 if (! page_queues_locked) { \
3052 page_queues_locked = TRUE; \
3053 vm_page_lockspin_queues(); \
3056 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
3058 if (page_queues_locked) { \
3059 page_queues_locked = FALSE; \
3060 vm_page_unlock_queues(); \
3064 #if CONFIG_BACKGROUND_QUEUE
3065 vm_page_update_background_state(m
);
3067 if (m
->vmp_q_state
== VM_PAGE_USED_BY_COMPRESSOR
) {
3069 * Compressor pages are neither wired
3070 * nor pageable and should never change.
3072 assert(object
== compressor_object
);
3073 } else if (change_wiring
) {
3074 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3077 if (kr
== KERN_SUCCESS
) {
3078 vm_page_wire(m
, wire_tag
, TRUE
);
3081 vm_page_unwire(m
, TRUE
);
3083 /* we keep the page queues lock, if we need it later */
3085 if (object
->internal
== TRUE
) {
3087 * don't allow anonymous pages on
3088 * the speculative queues
3092 if (kr
!= KERN_SUCCESS
) {
3093 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3094 vm_page_deactivate(m
);
3095 /* we keep the page queues lock, if we need it later */
3096 } else if (((m
->vmp_q_state
== VM_PAGE_NOT_ON_Q
) ||
3097 (m
->vmp_q_state
== VM_PAGE_ON_SPECULATIVE_Q
) ||
3098 (m
->vmp_q_state
== VM_PAGE_ON_INACTIVE_CLEANED_Q
) ||
3099 ((m
->vmp_q_state
!= VM_PAGE_ON_THROTTLED_Q
) && no_cache
)) &&
3100 !VM_PAGE_WIRED(m
)) {
3101 if (vm_page_local_q
&&
3102 (*type_of_fault
== DBG_COW_FAULT
||
3103 *type_of_fault
== DBG_ZERO_FILL_FAULT
)) {
3107 assert(m
->vmp_q_state
== VM_PAGE_NOT_ON_Q
);
3109 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3110 vm_object_lock_assert_exclusive(object
);
3113 * we got a local queue to stuff this
3115 * its safe to manipulate local and
3116 * local_id at this point since we're
3117 * behind an exclusive object lock and
3118 * the page is not on any global queue.
3120 * we'll use the current cpu number to
3121 * select the queue note that we don't
3122 * need to disable preemption... we're
3123 * going to be behind the local queue's
3124 * lock to do the real work
3128 lq
= zpercpu_get_cpu(vm_page_local_q
, lid
);
3130 VPL_LOCK(&lq
->vpl_lock
);
3132 vm_page_check_pageable_safe(m
);
3133 vm_page_queue_enter(&lq
->vpl_queue
, m
, vmp_pageq
);
3134 m
->vmp_q_state
= VM_PAGE_ON_ACTIVE_LOCAL_Q
;
3135 m
->vmp_local_id
= lid
;
3138 if (object
->internal
) {
3139 lq
->vpl_internal_count
++;
3141 lq
->vpl_external_count
++;
3144 VPL_UNLOCK(&lq
->vpl_lock
);
3146 if (lq
->vpl_count
> vm_page_local_q_soft_limit
) {
3148 * we're beyond the soft limit
3149 * for the local queue
3150 * vm_page_reactivate_local will
3151 * 'try' to take the global page
3152 * queue lock... if it can't
3153 * that's ok... we'll let the
3154 * queue continue to grow up
3155 * to the hard limit... at that
3156 * point we'll wait for the
3157 * lock... once we've got the
3158 * lock, we'll transfer all of
3159 * the pages from the local
3160 * queue to the global active
3163 vm_page_reactivate_local(lid
, FALSE
, FALSE
);
3166 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3169 * test again now that we hold the
3172 if (!VM_PAGE_WIRED(m
)) {
3173 if (m
->vmp_q_state
== VM_PAGE_ON_INACTIVE_CLEANED_Q
) {
3174 vm_page_queues_remove(m
, FALSE
);
3176 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated
, 1);
3177 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated
, 1);
3180 if (!VM_PAGE_ACTIVE_OR_INACTIVE(m
) ||
3183 * If this is a no_cache mapping
3184 * and the page has never been
3185 * mapped before or was
3186 * previously a no_cache page,
3187 * then we want to leave pages
3188 * in the speculative state so
3189 * that they can be readily
3190 * recycled if free memory runs
3191 * low. Otherwise the page is
3192 * activated as normal.
3196 (!previously_pmapped
||
3198 m
->vmp_no_cache
= TRUE
;
3200 if (m
->vmp_q_state
!= VM_PAGE_ON_SPECULATIVE_Q
) {
3201 vm_page_speculate(m
, FALSE
);
3203 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m
)) {
3204 vm_page_activate(m
);
3208 /* we keep the page queues lock, if we need it later */
3212 /* we're done with the page queues lock, if we ever took it */
3213 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
/*
 * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
 * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys()
 * before being inserted into the pmap.
 */
static bool
vm_fault_enter_set_mapped(
    vm_object_t object,
    vm_page_t m,
    vm_prot_t prot,
    vm_prot_t fault_type)
{
    bool page_needs_sync = false;
    /*
     * NOTE: we may only hold the vm_object lock SHARED
     * at this point, so we need the phys_page lock to
     * properly serialize updating the pmapped and
     * xpmapped bits
     */
3235 if ((prot
& VM_PROT_EXECUTE
) && !m
->vmp_xpmapped
) {
3236 ppnum_t phys_page
= VM_PAGE_GET_PHYS_PAGE(m
);
3238 pmap_lock_phys_page(phys_page
);
3239 m
->vmp_pmapped
= TRUE
;
3241 if (!m
->vmp_xpmapped
) {
3242 m
->vmp_xpmapped
= TRUE
;
3244 pmap_unlock_phys_page(phys_page
);
3246 if (!object
->internal
) {
3247 OSAddAtomic(1, &vm_page_xpmapped_external_count
);
3250 #if defined(__arm__) || defined(__arm64__)
3251 page_needs_sync
= true;
3253 if (object
->internal
&&
3254 object
->pager
!= NULL
) {
                /*
                 * This page could have been
                 * uncompressed by the
                 * compressor pager and its
                 * contents might be only in
                 * the data cache.
                 * Since it's being mapped for
                 * "execute" for the first time,
                 * make sure the icache is in
                 * sync.
                 */
                assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
                page_needs_sync = true;
3271 pmap_unlock_phys_page(phys_page
);
3274 if (m
->vmp_pmapped
== FALSE
) {
3275 ppnum_t phys_page
= VM_PAGE_GET_PHYS_PAGE(m
);
3277 pmap_lock_phys_page(phys_page
);
3278 m
->vmp_pmapped
= TRUE
;
3279 pmap_unlock_phys_page(phys_page
);
3283 if (fault_type
& VM_PROT_WRITE
) {
3284 if (m
->vmp_wpmapped
== FALSE
) {
3285 vm_object_lock_assert_exclusive(object
);
3286 if (!object
->internal
&& object
->pager
) {
3287 task_update_logical_writes(current_task(), PAGE_SIZE
, TASK_WRITE_DEFERRED
, vnode_pager_lookup_vnode(object
->pager
));
3289 m
->vmp_wpmapped
= TRUE
;
3292 return page_needs_sync
;
3296 * Try to enter the given page into the pmap.
3297 * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3298 * a codesigning failure on a non-execute fault.
3300 static kern_return_t
3301 vm_fault_attempt_pmap_enter(
3303 vm_map_offset_t vaddr
,
3304 vm_map_size_t fault_page_size
,
3305 vm_map_offset_t fault_phys_offset
,
3308 vm_prot_t caller_prot
,
3309 vm_prot_t fault_type
,
3314 #pragma unused(caller_prot)
3315 #endif /* !PMAP_CS */
3317 if (fault_page_size
!= PAGE_SIZE
) {
3318 DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap
, (uint64_t)vaddr
, (uint64_t)((((pmap_paddr_t
)VM_PAGE_GET_PHYS_PAGE(m
)) << PAGE_SHIFT
) + fault_phys_offset
), (uint64_t)(((pmap_paddr_t
)VM_PAGE_GET_PHYS_PAGE(m
)) << PAGE_SHIFT
), (uint64_t)fault_phys_offset
, *prot
, fault_type
);
3319 assertf((!(fault_phys_offset
& FOURK_PAGE_MASK
) &&
3320 fault_phys_offset
< PAGE_SIZE
),
3321 "0x%llx\n", (uint64_t)fault_phys_offset
);
3323 assertf(fault_phys_offset
== 0,
3324 "0x%llx\n", (uint64_t)fault_phys_offset
);
3327 PMAP_ENTER_OPTIONS(pmap
, vaddr
,
3329 m
, *prot
, fault_type
, 0,
3335 * Retry without execute permission if we encountered a codesigning
3336 * failure on a non-execute fault. This allows applications which
3337 * don't actually need to execute code to still map it for read access.
3339 if ((kr
== KERN_CODESIGN_ERROR
) && pmap_cs_enforced(pmap
) &&
3340 (*prot
& VM_PROT_EXECUTE
) && !(caller_prot
& VM_PROT_EXECUTE
)) {
3341 *prot
&= ~VM_PROT_EXECUTE
;
3342 PMAP_ENTER_OPTIONS(pmap
, vaddr
,
3344 m
, *prot
, fault_type
, 0,
3354 * Enter the given page into the pmap.
3355 * The map must be locked shared.
3356 * The vm object must NOT be locked.
3358 * @param need_retry if not null, avoid making a (potentially) blocking call into
3359 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3361 static kern_return_t
3362 vm_fault_pmap_enter(
3364 vm_map_offset_t vaddr
,
3365 vm_map_size_t fault_page_size
,
3366 vm_map_offset_t fault_phys_offset
,
3369 vm_prot_t caller_prot
,
3370 vm_prot_t fault_type
,
3373 boolean_t
*need_retry
)
3376 if (need_retry
!= NULL
) {
3378 * Although we don't hold a lock on this object, we hold a lock
3379 * on the top object in the chain. To prevent a deadlock, we
3380 * can't allow the pmap layer to block.
3382 pmap_options
|= PMAP_OPTIONS_NOWAIT
;
3384 kr
= vm_fault_attempt_pmap_enter(pmap
, vaddr
,
3385 fault_page_size
, fault_phys_offset
,
3386 m
, prot
, caller_prot
, fault_type
, wired
, pmap_options
);
3387 if (kr
== KERN_RESOURCE_SHORTAGE
) {
3390 * There's nothing we can do here since we hold the
3391 * lock on the top object in the chain. The caller
3392 * will need to deal with this by dropping that lock and retrying.
3395 vm_pmap_enter_retried
++;
3402 * Enter the given page into the pmap.
3403 * The vm map must be locked shared.
3404 * The vm object must be locked exclusive, unless this is a soft fault.
3405 * For a soft fault, the object must be locked shared or exclusive.
3407 * @param need_retry if not null, avoid making a (potentially) blocking call into
3408 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3410 static kern_return_t
3411 vm_fault_pmap_enter_with_object_lock(
3414 vm_map_offset_t vaddr
,
3415 vm_map_size_t fault_page_size
,
3416 vm_map_offset_t fault_phys_offset
,
3419 vm_prot_t caller_prot
,
3420 vm_prot_t fault_type
,
3423 boolean_t
*need_retry
)
3427 * Prevent a deadlock by not
3428 * holding the object lock if we need to wait for a page in
3429 * pmap_enter() - <rdar://problem/7138958>
3431 kr
= vm_fault_attempt_pmap_enter(pmap
, vaddr
,
3432 fault_page_size
, fault_phys_offset
,
3433 m
, prot
, caller_prot
, fault_type
, wired
, pmap_options
| PMAP_OPTIONS_NOWAIT
);
3435 if (kr
== KERN_INVALID_ARGUMENT
&&
3436 pmap
== PMAP_NULL
&&
3439 * Wiring a page in a pmap-less VM map:
3440 * VMware's "vmmon" kernel extension does this
3442 * Let it proceed even though the PMAP_ENTER() failed.
3446 #endif /* __x86_64__ */
3448 if (kr
== KERN_RESOURCE_SHORTAGE
) {
3451 * this will be non-null in the case where we hold the lock
3452 * on the top-object in this chain... we can't just drop
3453 * the lock on the object we're inserting the page into
3454 * and recall the PMAP_ENTER since we can still cause
3455 * a deadlock if one of the critical paths tries to
3456 * acquire the lock on the top-object and we're blocked
3457 * in PMAP_ENTER waiting for memory... our only recourse
3458 * is to deal with it at a higher level where we can
3462 vm_pmap_enter_retried
++;
3466 * The nonblocking version of pmap_enter did not succeed.
3467 * and we don't need to drop other locks and retry
3468 * at the level above us, so
3469 * use the blocking version instead. Requires marking
3470 * the page busy and unlocking the object
3472 boolean_t was_busy
= m
->vmp_busy
;
3474 vm_object_lock_assert_exclusive(object
);
3477 vm_object_unlock(object
);
3479 PMAP_ENTER_OPTIONS(pmap
, vaddr
,
3481 m
, *prot
, fault_type
,
3485 assert(VM_PAGE_OBJECT(m
) == object
);
3487 /* Take the object lock again. */
3488 vm_object_lock(object
);
3490 /* If the page was busy, someone else will wake it up.
3491 * Otherwise, we have to do it now. */
3492 assert(m
->vmp_busy
);
3494 PAGE_WAKEUP_DONE(m
);
3496 vm_pmap_enter_blocked
++;
3504 * Prepare to enter a page into the pmap by checking CS, protection bits,
3505 * and setting mapped bits on the page_t.
3506 * Does not modify the page's paging queue.
3508 * page queue lock must NOT be held
3509 * m->vmp_object must be locked
3511 * NOTE: m->vmp_object could be locked "shared" only if we are called
3512 * from vm_fault() as part of a soft fault.
3514 static kern_return_t
3515 vm_fault_enter_prepare(
3518 vm_map_offset_t vaddr
,
3520 vm_prot_t caller_prot
,
3521 vm_map_size_t fault_page_size
,
3522 vm_map_offset_t fault_phys_offset
,
3523 boolean_t change_wiring
,
3524 vm_prot_t fault_type
,
3525 vm_object_fault_info_t fault_info
,
3527 bool *page_needs_data_sync
)
3530 bool is_tainted
= false;
3532 boolean_t cs_bypass
= fault_info
->cs_bypass
;
3534 object
= VM_PAGE_OBJECT(m
);
3536 vm_object_lock_assert_held(object
);
3539 if (pmap
== kernel_pmap
) {
3540 kasan_notify_address(vaddr
, PAGE_SIZE
);
3544 if (pmap_cs_exempt(pmap
)) {
3549 LCK_MTX_ASSERT(&vm_page_queue_lock
, LCK_MTX_ASSERT_NOTOWNED
);
3551 if (*type_of_fault
== DBG_ZERO_FILL_FAULT
) {
3552 vm_object_lock_assert_exclusive(object
);
3553 } else if ((fault_type
& VM_PROT_WRITE
) == 0 &&
3556 #if VM_OBJECT_ACCESS_TRACKING
3557 || object
->access_tracking
3558 #endif /* VM_OBJECT_ACCESS_TRACKING */
3561 * This is not a "write" fault, so we
3562 * might not have taken the object lock
3563 * exclusively and we might not be able
3564 * to update the "wpmapped" bit in
3566 * Let's just grant read access to
3567 * the page for now and we'll
3568 * soft-fault again if we need write
3572 /* This had better not be a JIT page. */
3573 if (!pmap_has_prot_policy(pmap
, fault_info
->pmap_options
& PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE
, *prot
)) {
3574 *prot
&= ~VM_PROT_WRITE
;
3579 if (m
->vmp_pmapped
== FALSE
) {
3580 if (m
->vmp_clustered
) {
3581 if (*type_of_fault
== DBG_CACHE_HIT_FAULT
) {
3583 * found it in the cache, but this
3584 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
3585 * so it must have come in as part of
3586 * a cluster... account 1 pagein against it
3588 if (object
->internal
) {
3589 *type_of_fault
= DBG_PAGEIND_FAULT
;
3591 *type_of_fault
= DBG_PAGEINV_FAULT
;
3594 VM_PAGE_COUNT_AS_PAGEIN(m
);
3596 VM_PAGE_CONSUME_CLUSTERED(m
);
3600 if (*type_of_fault
!= DBG_COW_FAULT
) {
3601 DTRACE_VM2(as_fault
, int, 1, (uint64_t *), NULL
);
3603 if (pmap
== kernel_pmap
) {
3604 DTRACE_VM2(kernel_asflt
, int, 1, (uint64_t *), NULL
);
3608 kr
= vm_fault_validate_cs(cs_bypass
, object
, m
, pmap
, vaddr
,
3609 *prot
, caller_prot
, fault_page_size
, fault_phys_offset
,
3610 fault_info
, &is_tainted
);
3611 if (kr
== KERN_SUCCESS
) {
3613 * We either have a good page, or a tainted page that has been accepted by the process.
3614 * In both cases the page will be entered into the pmap.
3616 *page_needs_data_sync
= vm_fault_enter_set_mapped(object
, m
, *prot
, fault_type
);
3617 if ((fault_type
& VM_PROT_WRITE
) && is_tainted
) {
3619 * This page is tainted but we're inserting it anyways.
3620 * Since it's writeable, we need to disconnect it from other pmaps
3621 * now so those processes can take note.
3625 * We can only get here
3626 * because of the CSE logic
3628 assert(pmap_get_vm_map_cs_enforced(pmap
));
3629 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m
));
3631 * If we are faulting for a write, we can clear
3632 * the execute bit - that will ensure the page is
3633 * checked again before being executable, which
3634 * protects against a map switch.
3635 * This only happens the first time the page
3636 * gets tainted, so we won't get stuck here
3637 * to make an already writeable page executable.
3640 assert(!pmap_has_prot_policy(pmap
, fault_info
->pmap_options
& PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE
, *prot
));
3641 *prot
&= ~VM_PROT_EXECUTE
;
3644 assert(VM_PAGE_OBJECT(m
) == object
);
3646 #if VM_OBJECT_ACCESS_TRACKING
3647 if (object
->access_tracking
) {
3648 DTRACE_VM2(access_tracking
, vm_map_offset_t
, vaddr
, int, fault_type
);
3649 if (fault_type
& VM_PROT_WRITE
) {
3650 object
->access_tracking_writes
++;
3651 vm_object_access_tracking_writes
++;
3653 object
->access_tracking_reads
++;
3654 vm_object_access_tracking_reads
++;
3657 #endif /* VM_OBJECT_ACCESS_TRACKING */
3664 * page queue lock must NOT be held
3665 * m->vmp_object must be locked
3667 * NOTE: m->vmp_object could be locked "shared" only if we are called
3668 * from vm_fault() as part of a soft fault. If so, we must be
3669 * careful not to modify the VM object in any way that is not
3670 * legal under a shared lock...
3676 vm_map_offset_t vaddr
,
3677 vm_map_size_t fault_page_size
,
3678 vm_map_offset_t fault_phys_offset
,
3680 vm_prot_t caller_prot
,
3682 boolean_t change_wiring
,
3684 vm_object_fault_info_t fault_info
,
3685 boolean_t
*need_retry
,
3690 bool page_needs_data_sync
;
3691 vm_prot_t fault_type
;
3692 int pmap_options
= fault_info
->pmap_options
;
3694 if (VM_PAGE_GET_PHYS_PAGE(m
) == vm_page_guard_addr
) {
3695 assert(m
->vmp_fictitious
);
3696 return KERN_SUCCESS
;
3699 fault_type
= change_wiring
? VM_PROT_NONE
: caller_prot
;
3701 kr
= vm_fault_enter_prepare(m
, pmap
, vaddr
, &prot
, caller_prot
,
3702 fault_page_size
, fault_phys_offset
, change_wiring
, fault_type
,
3703 fault_info
, type_of_fault
, &page_needs_data_sync
);
3704 object
= VM_PAGE_OBJECT(m
);
3706 vm_fault_enqueue_page(object
, m
, wired
, change_wiring
, wire_tag
, fault_info
->no_cache
, type_of_fault
, kr
);
3708 if (kr
== KERN_SUCCESS
) {
3709 if (page_needs_data_sync
) {
3710 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m
));
3713 kr
= vm_fault_pmap_enter_with_object_lock(object
, pmap
, vaddr
,
3714 fault_page_size
, fault_phys_offset
, m
,
3715 &prot
, caller_prot
, fault_type
, wired
, pmap_options
, need_retry
);
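
/*
 * vm_pre_fault:
 *	Pre-fault the page backing the given user address so that a later
 *	access does not have to take the fault.  Only calls into vm_fault()
 *	when the translation is not already present in the current map's pmap.
 */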
void
vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
{
    if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
        vm_fault(current_map(),         /* map */
            vaddr,                      /* vaddr */
            prot,                       /* fault_type */
            FALSE,                      /* change_wiring */
            VM_KERN_MEMORY_NONE,        /* tag - not wiring */
            THREAD_UNINT,               /* interruptible */
            NULL,                       /* caller_pmap */
            0 /* caller_pmap_addr */);
    }
}
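
/*
 * Illustrative usage only (not part of the original source): a caller that
 * wants to warm a user buffer before touching it might do something like
 *
 *	vm_pre_fault(user_buf_addr, VM_PROT_READ | VM_PROT_WRITE);
 *
 * where "user_buf_addr" is a hypothetical user-space address already mapped
 * in the current map; if the page is already resident in the pmap,
 * pmap_find_phys() returns a non-zero frame and the call is a no-op.
 */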

/*
 *	Handle page faults, including pseudo-faults
 *	used to change the wiring status of pages.
 *
 *	Explicit continuations have been removed.
 *
 *	vm_fault and vm_fault_page save mucho state
 *	in the moral equivalent of a closure.  The state
 *	structure is allocated when first entering vm_fault
 *	and deallocated when leaving vm_fault.
 */

extern uint64_t get_current_unique_pid(void);

unsigned long vm_fault_collapse_total = 0;
unsigned long vm_fault_collapse_skipped = 0;
kern_return_t
vm_fault(
    vm_map_t map,
    vm_map_offset_t vaddr,
    vm_prot_t fault_type,
    boolean_t change_wiring,
    int interruptible,
    pmap_t caller_pmap,
    vm_map_offset_t caller_pmap_addr)
{
    return vm_fault_internal(map, vaddr, fault_type, change_wiring,
               change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
               interruptible, caller_pmap, caller_pmap_addr,
               NULL);
}

kern_return_t
vm_fault_external(
    vm_map_t map,
    vm_map_offset_t vaddr,
    vm_prot_t fault_type,
    boolean_t change_wiring,
    vm_tag_t wire_tag,              /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
    int interruptible,
    pmap_t caller_pmap,
    vm_map_offset_t caller_pmap_addr)
{
    return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
               interruptible, caller_pmap, caller_pmap_addr,
               NULL);
}

static boolean_t
current_proc_is_privileged(void)
{
    return csproc_get_platform_binary(current_proc());
}

uint64_t vm_copied_on_read = 0;
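
/*
 * Both vm_fault() and vm_fault_external() are thin wrappers around
 * vm_fault_internal(); they differ only in how the wire tag is chosen:
 * vm_fault() derives it from the kernel backtrace via vm_tag_bt() when
 * wiring, while vm_fault_external() takes an explicit wire_tag from the
 * caller.
 */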
3798 * Cleanup after a vm_fault_enter.
3799 * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3800 * or the page should be in the pmap and on the correct paging queue.
3803 * map must be locked shared.
3804 * m_object must be locked.
3805 * If top_object != VM_OBJECT_NULL, it must be locked.
3806 * real_map must be locked.
3809 * map will be unlocked
3810 * m_object will be unlocked
3811 * top_object will be unlocked
3812 * If real_map != map, it will be unlocked
3819 vm_object_t m_object
,
3821 vm_map_offset_t offset
,
3822 vm_map_offset_t trace_real_vaddr
,
3823 vm_object_fault_info_t fault_info
,
3824 vm_prot_t caller_prot
,
3826 vm_map_offset_t real_vaddr
,
3828 __unused vm_map_offset_t real_vaddr
,
3829 #endif /* CONFIG_DTRACE */
3831 boolean_t need_retry
,
3833 ppnum_t
*physpage_p
,
3835 vm_object_t top_object
,
3836 boolean_t need_collapse
,
3837 vm_map_offset_t cur_offset
,
3838 vm_prot_t fault_type
,
3839 vm_object_t
*written_on_object
,
3840 memory_object_t
*written_on_pager
,
3841 vm_object_offset_t
*written_on_offset
)
3844 vm_map_lock_assert_shared(map
);
3845 vm_object_lock_assert_held(m_object
);
3846 if (top_object
!= VM_OBJECT_NULL
) {
3847 vm_object_lock_assert_held(top_object
);
3849 vm_map_lock_assert_held(real_map
);
3851 if (m_object
->internal
) {
3852 event_code
= (MACHDBG_CODE(DBG_MACH_WORKINGSET
, VM_REAL_FAULT_ADDR_INTERNAL
));
3853 } else if (m_object
->object_is_shared_cache
) {
3854 event_code
= (MACHDBG_CODE(DBG_MACH_WORKINGSET
, VM_REAL_FAULT_ADDR_SHAREDCACHE
));
3856 event_code
= (MACHDBG_CODE(DBG_MACH_WORKINGSET
, VM_REAL_FAULT_ADDR_EXTERNAL
));
3859 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
, event_code
, trace_real_vaddr
, (fault_info
->user_tag
<< 16) | (caller_prot
<< 8) | type_of_fault
, m
->vmp_offset
, get_current_unique_pid(), 0);
3860 if (need_retry
== FALSE
) {
3861 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET
, VM_REAL_FAULT_FAST
), get_current_unique_pid(), 0, 0, 0, 0);
3863 DTRACE_VM6(real_fault
, vm_map_offset_t
, real_vaddr
, vm_map_offset_t
, m
->vmp_offset
, int, event_code
, int, caller_prot
, int, type_of_fault
, int, fault_info
->user_tag
);
3864 if (kr
== KERN_SUCCESS
&&
3865 physpage_p
!= NULL
) {
3866 /* for vm_map_wire_and_extract() */
3867 *physpage_p
= VM_PAGE_GET_PHYS_PAGE(m
);
3868 if (prot
& VM_PROT_WRITE
) {
3869 vm_object_lock_assert_exclusive(m_object
);
3870 m
->vmp_dirty
= TRUE
;
3874 if (top_object
!= VM_OBJECT_NULL
) {
3876 * It's safe to drop the top object
3877 * now that we've done our
3878 * vm_fault_enter(). Any other fault
3879 * in progress for that virtual
3880 * address will either find our page
3881 * and translation or put in a new page
3884 vm_object_unlock(top_object
);
3885 top_object
= VM_OBJECT_NULL
;
3888 if (need_collapse
== TRUE
) {
3889 vm_object_collapse(object
, vm_object_trunc_page(offset
), TRUE
);
3892 if (need_retry
== FALSE
&&
3893 (type_of_fault
== DBG_PAGEIND_FAULT
|| type_of_fault
== DBG_PAGEINV_FAULT
|| type_of_fault
== DBG_CACHE_HIT_FAULT
)) {
3895 * evaluate access pattern and update state
3896 * vm_fault_deactivate_behind depends on the
3897 * state being up to date
3899 vm_fault_is_sequential(m_object
, cur_offset
, fault_info
->behavior
);
3901 vm_fault_deactivate_behind(m_object
, cur_offset
, fault_info
->behavior
);
3904 * That's it, clean up and return.
3907 vm_object_lock_assert_exclusive(m_object
);
3908 PAGE_WAKEUP_DONE(m
);
3911 if (need_retry
== FALSE
&& !m_object
->internal
&& (fault_type
& VM_PROT_WRITE
)) {
3912 vm_object_paging_begin(m_object
);
3914 assert(*written_on_object
== VM_OBJECT_NULL
);
3915 *written_on_object
= m_object
;
3916 *written_on_pager
= m_object
->pager
;
3917 *written_on_offset
= m_object
->paging_offset
+ m
->vmp_offset
;
3919 vm_object_unlock(object
);
3921 vm_map_unlock_read(map
);
3922 if (real_map
!= map
) {
3923 vm_map_unlock(real_map
);
static inline int
vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
{
    if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
        return DBG_COR_FAULT;
    }
    return type_of_fault;
}
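
/*
 * Note: a fault that would otherwise be traced as DBG_COW_FAULT but also
 * required a copy-on-read is reported as DBG_COR_FAULT, so the two cases
 * can be told apart in the fault traces.
 */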
3939 vm_map_offset_t vaddr
,
3940 vm_prot_t caller_prot
,
3941 boolean_t change_wiring
,
3942 vm_tag_t wire_tag
, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3945 vm_map_offset_t caller_pmap_addr
,
3946 ppnum_t
*physpage_p
)
    vm_map_version_t        version;        /* Map version for verification */
3949 boolean_t wired
; /* Should mapping be wired down? */
3950 vm_object_t object
; /* Top-level object */
3951 vm_object_offset_t offset
; /* Top-level offset */
3952 vm_prot_t prot
; /* Protection for mapping */
3953 vm_object_t old_copy_object
; /* Saved copy object */
3954 vm_page_t result_page
; /* Result of vm_fault_page */
3955 vm_page_t top_page
; /* Placeholder page */
3958 vm_page_t m
; /* Fast access to result_page */
3959 kern_return_t error_code
;
3960 vm_object_t cur_object
;
3961 vm_object_t m_object
= NULL
;
3962 vm_object_offset_t cur_offset
;
3964 vm_object_t new_object
;
3967 wait_interrupt_t interruptible_state
;
3968 vm_map_t real_map
= map
;
3969 vm_map_t original_map
= map
;
3970 bool object_locks_dropped
= FALSE
;
3971 vm_prot_t fault_type
;
3972 vm_prot_t original_fault_type
;
3973 struct vm_object_fault_info fault_info
= {};
3974 bool need_collapse
= FALSE
;
3975 boolean_t need_retry
= FALSE
;
3976 boolean_t
*need_retry_ptr
= NULL
;
3977 uint8_t object_lock_type
= 0;
3978 uint8_t cur_object_lock_type
;
3979 vm_object_t top_object
= VM_OBJECT_NULL
;
3980 vm_object_t written_on_object
= VM_OBJECT_NULL
;
3981 memory_object_t written_on_pager
= NULL
;
3982 vm_object_offset_t written_on_offset
= 0;
3984 int compressed_count_delta
;
3985 uint8_t grab_options
;
3987 bool need_copy_on_read
;
3988 vm_map_offset_t trace_vaddr
;
3989 vm_map_offset_t trace_real_vaddr
;
3990 vm_map_size_t fault_page_size
;
3991 vm_map_size_t fault_page_mask
;
3992 vm_map_offset_t fault_phys_offset
;
3993 vm_map_offset_t real_vaddr
;
3994 bool resilient_media_retry
= FALSE
;
3995 vm_object_t resilient_media_object
= VM_OBJECT_NULL
;
3996 vm_object_offset_t resilient_media_offset
= (vm_object_offset_t
)-1;
3997 bool page_needs_data_sync
= false;
3999 * Was the VM object contended when vm_map_lookup_locked locked it?
4000 * If so, the zero fill path will drop the lock
4001 * NB: Ideally we would always drop the lock rather than rely on
4002 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4004 bool object_is_contended
= false;
4007 trace_real_vaddr
= vaddr
;
4009 if (VM_MAP_PAGE_SIZE(original_map
) < PAGE_SIZE
) {
4010 fault_phys_offset
= (vm_map_offset_t
)-1;
4011 fault_page_size
= VM_MAP_PAGE_SIZE(original_map
);
4012 fault_page_mask
= VM_MAP_PAGE_MASK(original_map
);
4013 if (fault_page_size
< PAGE_SIZE
) {
4014 DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map
, (uint64_t)trace_real_vaddr
, caller_prot
);
4015 vaddr
= vm_map_trunc_page(vaddr
, fault_page_mask
);
4018 fault_phys_offset
= 0;
4019 fault_page_size
= PAGE_SIZE
;
4020 fault_page_mask
= PAGE_MASK
;
4021 vaddr
= vm_map_trunc_page(vaddr
, PAGE_MASK
);
4024 if (map
== kernel_map
) {
4025 trace_vaddr
= VM_KERNEL_ADDRHIDE(vaddr
);
4026 trace_real_vaddr
= VM_KERNEL_ADDRHIDE(trace_real_vaddr
);
4028 trace_vaddr
= vaddr
;
4031 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
4032 (MACHDBG_CODE(DBG_MACH_VM
, 2)) | DBG_FUNC_START
,
4033 ((uint64_t)trace_vaddr
>> 32),
4035 (map
== kernel_map
),
4039 if (get_preemption_level() != 0) {
4040 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
4041 (MACHDBG_CODE(DBG_MACH_VM
, 2)) | DBG_FUNC_END
,
4042 ((uint64_t)trace_vaddr
>> 32),
4048 return KERN_FAILURE
;
4051 thread_t cthread
= current_thread();
4052 bool rtfault
= (cthread
->sched_mode
== TH_MODE_REALTIME
);
4053 uint64_t fstart
= 0;
4056 fstart
= mach_continuous_time();
4059 interruptible_state
= thread_interrupt_level(interruptible
);
4061 fault_type
= (change_wiring
? VM_PROT_NONE
: caller_prot
);
4063 VM_STAT_INCR(faults
);
4064 current_task()->faults
++;
4065 original_fault_type
= fault_type
;
4068 if (fault_type
& VM_PROT_WRITE
) {
4072 if (need_copy
|| change_wiring
) {
4073 object_lock_type
= OBJECT_LOCK_EXCLUSIVE
;
4075 object_lock_type
= OBJECT_LOCK_SHARED
;
4078 cur_object_lock_type
= OBJECT_LOCK_SHARED
;
4080 if ((map
== kernel_map
) && (caller_prot
& VM_PROT_WRITE
)) {
4081 if (compressor_map
) {
4082 if ((vaddr
>= vm_map_min(compressor_map
)) && (vaddr
< vm_map_max(compressor_map
))) {
4083 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr
, caller_prot
, (void *) vm_map_min(compressor_map
), (void *) vm_map_max(compressor_map
));
4088 assert(written_on_object
== VM_OBJECT_NULL
);
4091 * assume we will hit a page in the cache
4092 * otherwise, explicitly override with
4093 * the real fault type once we determine it
4095 type_of_fault
= DBG_CACHE_HIT_FAULT
;
4098 * Find the backing store object and offset into
4099 * it to begin the search.
4101 fault_type
= original_fault_type
;
4103 vm_map_lock_read(map
);
4105 if (resilient_media_retry
) {
4107 * If we have to insert a fake zero-filled page to hide
4108 * a media failure to provide the real page, we need to
4109 * resolve any pending copy-on-write on this mapping.
4110 * VM_PROT_COPY tells vm_map_lookup_locked() to deal
4111 * with that even if this is not a "write" fault.
4114 object_lock_type
= OBJECT_LOCK_EXCLUSIVE
;
4117 kr
= vm_map_lookup_locked(&map
, vaddr
,
4118 (fault_type
| (need_copy
? VM_PROT_COPY
: 0)),
4119 object_lock_type
, &version
,
4120 &object
, &offset
, &prot
, &wired
,
4123 &object_is_contended
);
4125 if (kr
!= KERN_SUCCESS
) {
4126 vm_map_unlock_read(map
);
4131 pmap
= real_map
->pmap
;
4132 fault_info
.interruptible
= interruptible
;
4133 fault_info
.stealth
= FALSE
;
4134 fault_info
.io_sync
= FALSE
;
4135 fault_info
.mark_zf_absent
= FALSE
;
4136 fault_info
.batch_pmap_op
= FALSE
;
if (resilient_media_retry) {
    /*
     * We're retrying this fault after having detected a media
     * failure from a "resilient_media" mapping.
     * Check that the mapping is still pointing at the object
     * that just failed to provide a page.
     */
    assert(resilient_media_object != VM_OBJECT_NULL);
    assert(resilient_media_offset != (vm_object_offset_t)-1);
    if (object != VM_OBJECT_NULL &&
        object == resilient_media_object &&
        offset == resilient_media_offset &&
        fault_info.resilient_media) {
        /*
         * This mapping still points at the same object
         * and is still "resilient_media": proceed in
         * "recovery-from-media-failure" mode, where we'll
         * insert a zero-filled page in the top object.
         */
        // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
    } else {
        /* not recovering: reset state */
        // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
        resilient_media_retry = FALSE;
        /* release our extra reference on failed object */
        // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
        vm_object_deallocate(resilient_media_object);
        resilient_media_object = VM_OBJECT_NULL;
        resilient_media_offset = (vm_object_offset_t)-1;
    }
} else {
    assert(resilient_media_object == VM_OBJECT_NULL);
    resilient_media_offset = (vm_object_offset_t)-1;
}
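/*
 * From here on, resilient_media_retry being set means this retried
 * fault is allowed to substitute a zero-filled page in the top object
 * instead of re-asking the failing pager for the data.
 */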
/*
 * If the page is wired, we must fault for the current protection
 * value, to avoid further faults.
 */
if (wired) {
    fault_type = prot | VM_PROT_WRITE;
}
if (wired || need_copy) {
    /*
     * since we're treating this fault as a 'write'
     * we must hold the top object lock exclusively
     */
    if (object_lock_type == OBJECT_LOCK_SHARED) {
        object_lock_type = OBJECT_LOCK_EXCLUSIVE;

        if (vm_object_lock_upgrade(object) == FALSE) {
            /*
             * couldn't upgrade, so explicitly
             * take the lock exclusively
             */
            vm_object_lock(object);
        }
    }
}
#if VM_FAULT_CLASSIFY
/*
 * Temporary data gathering code
 */
vm_fault_classify(object, offset, fault_type);
#endif
/*
 * Fast fault code. The basic idea is to do as much as
 * possible while holding the map lock and object locks.
 * Busy pages are not used until the object lock has to
 * be dropped to do something (copy, zero fill, pmap enter).
 * Similarly, paging references aren't acquired until that
 * point, and object references aren't used.
 *
 * If we can figure out what to do
 * (zero fill, copy on write, pmap enter) while holding
 * the locks, then it gets done. Otherwise, we give up,
 * and use the original fault path (which doesn't hold
 * the map lock, and relies on busy pages).
 * The give up cases include:
 *  - Have to talk to pager.
 *  - Page is busy, absent or in error.
 *  - Pager has locked out desired access.
 *  - Fault needs to be restarted.
 *  - Have to push page into copy object.
 *
 * The code is an infinite loop that moves one level down
 * the shadow chain each time. cur_object and cur_offset
 * refer to the current object being examined. object and offset
 * are the original object from the map. The loop is at the
 * top level if and only if object and cur_object are the same.
 *
 * Invariants: Map lock is held throughout. Lock is held on
 * original object and cur_object (if different) when
 * continuing or exiting loop.
 */
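/*
 * Rough shape of the fast path described above (illustrative only,
 * not the exact control flow):
 *
 *	while (TRUE) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m != VM_PAGE_NULL) {
 *			// zero-fill / COW / pmap enter if possible,
 *			// otherwise fall back to the slow path
 *		} else if (cur_object->shadow == VM_OBJECT_NULL) {
 *			// zero-fill fault in the original object
 *		} else {
 *			// descend one level down the shadow chain
 *			cur_offset += cur_object->vo_shadow_offset;
 *			cur_object = cur_object->shadow;
 *		}
 *	}
 */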
#if defined(__arm64__)
/*
 * Fail if reading an execute-only page in a
 * pmap that enforces execute-only protection.
 */
if (fault_type == VM_PROT_READ &&
    (prot & VM_PROT_EXECUTE) &&
    !(prot & VM_PROT_READ) &&
    pmap_enforces_execute_only(pmap)) {
    vm_object_unlock(object);
    vm_map_unlock_read(map);
    if (real_map != map) {
        vm_map_unlock(real_map);
    }
    kr = KERN_PROTECTION_FAILURE;
}
#endif

fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
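/*
 * fault_phys_offset is the faulting address's offset within the
 * kernel-sized physical page; it is only non-zero when the map uses
 * a page size smaller than PAGE_SIZE (see the fault_page_size <
 * PAGE_SIZE cases below).
 */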
/*
 * If this page is to be inserted in a copy delay object
 * for writing, and if the object has a copy, then the
 * copy delay strategy is implemented in the slow fault page.
 */
if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
    object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
    goto handle_copy_delay;
}

cur_object = object;
cur_offset = offset;
#if CONFIG_SECLUDED_MEMORY
if (object->can_grab_secluded) {
    grab_options |= VM_PAGE_GRAB_SECLUDED;
}
#endif /* CONFIG_SECLUDED_MEMORY */
if (!cur_object->pager_created &&
    cur_object->phys_contiguous) { /* superpage */
}

if (cur_object->blocked_access) {
    /*
     * Access to this VM object has been blocked.
     * Let the slow path handle it.
     */
}

m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));

if (m != VM_PAGE_NULL) {
    m_object = cur_object;
    wait_result_t result;

    /*
     * in order to do the PAGE_ASSERT_WAIT, we must
     * have object that 'm' belongs to locked exclusively
     */
    if (object != cur_object) {
        if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
            cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;

            if (vm_object_lock_upgrade(cur_object) == FALSE) {
                /*
                 * couldn't upgrade so go do a full retry
                 * immediately since we can no longer be
                 * certain about cur_object (since we
                 * don't hold a reference on it)...
                 * first drop the top object lock
                 */
                vm_object_unlock(object);

                vm_map_unlock_read(map);
                if (real_map != map) {
                    vm_map_unlock(real_map);
                }
            }
        }
    } else if (object_lock_type == OBJECT_LOCK_SHARED) {
        object_lock_type = OBJECT_LOCK_EXCLUSIVE;

        if (vm_object_lock_upgrade(object) == FALSE) {
            /*
             * couldn't upgrade, so explicitly take the lock
             * exclusively and go relookup the page since we
             * will have dropped the object lock and
             * a different thread could have inserted
             * a page at this offset
             * no need for a full retry since we're
             * at the top level of the object chain
             */
            vm_object_lock(object);
        }
    }
    if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
        /*
         * m->vmp_busy == TRUE and the object is locked exclusively
         * if m->pageout_queue == TRUE after we acquire the
         * queues lock, we are guaranteed that it is stable on
         * the pageout queue and therefore reclaimable
         *
         * NOTE: this is only true for the internal pageout queue
         * in the compressor world
         */
        assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

        vm_page_lock_queues();

        if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
            vm_pageout_throttle_up(m);
            vm_page_unlock_queues();

            PAGE_WAKEUP_DONE(m);
            goto reclaimed_from_pageout;
        }
        vm_page_unlock_queues();
    }
    if (object != cur_object) {
        vm_object_unlock(object);
    }

    vm_map_unlock_read(map);
    if (real_map != map) {
        vm_map_unlock(real_map);
    }

    result = PAGE_ASSERT_WAIT(m, interruptible);

    vm_object_unlock(cur_object);

    if (result == THREAD_WAITING) {
        result = thread_block(THREAD_CONTINUE_NULL);

        counter(c_vm_fault_page_block_busy_kernel++);
    }
    if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
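    /*
     * The page was busy: we blocked (with no locks held) waiting for
     * its owner to release it, so any state observed earlier may be
     * stale and the whole fault has to be re-driven from the top.
     */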
reclaimed_from_pageout:
    if (m->vmp_laundry) {
        if (object != cur_object) {
            if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
                cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;

                vm_object_unlock(object);
                vm_object_unlock(cur_object);

                vm_map_unlock_read(map);
                if (real_map != map) {
                    vm_map_unlock(real_map);
                }
            }
        } else if (object_lock_type == OBJECT_LOCK_SHARED) {
            object_lock_type = OBJECT_LOCK_EXCLUSIVE;

            if (vm_object_lock_upgrade(object) == FALSE) {
                /*
                 * couldn't upgrade, so explicitly take the lock
                 * exclusively and go relookup the page since we
                 * will have dropped the object lock and
                 * a different thread could have inserted
                 * a page at this offset
                 * no need for a full retry since we're
                 * at the top level of the object chain
                 */
                vm_object_lock(object);
            }
        }
        vm_pageout_steal_laundry(m, FALSE);
    }
    if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
        /*
         * Guard page: let the slow path deal with it
         */
    }
    if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
        /*
         * Unusual case... let the slow path deal with it
         */
    }
    if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
        if (object != cur_object) {
            vm_object_unlock(object);
        }
        vm_map_unlock_read(map);
        if (real_map != map) {
            vm_map_unlock(real_map);
        }
        vm_object_unlock(cur_object);
        kr = KERN_MEMORY_ERROR;
    }

    assert(m_object == VM_PAGE_OBJECT(m));
    if (vm_fault_cs_need_validation(map->pmap, m, m_object,
        (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
upgrade_lock_and_retry:
        /*
         * We might need to validate this page
         * against its code signature, so we
         * want to hold the VM object exclusively.
         */
        if (object != cur_object) {
            if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
                vm_object_unlock(object);
                vm_object_unlock(cur_object);

                cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;

                vm_map_unlock_read(map);
                if (real_map != map) {
                    vm_map_unlock(real_map);
                }
            }
        } else if (object_lock_type == OBJECT_LOCK_SHARED) {
            object_lock_type = OBJECT_LOCK_EXCLUSIVE;

            if (vm_object_lock_upgrade(object) == FALSE) {
                /*
                 * couldn't upgrade, so explicitly take the lock
                 * exclusively and go relookup the page since we
                 * will have dropped the object lock and
                 * a different thread could have inserted
                 * a page at this offset
                 * no need for a full retry since we're
                 * at the top level of the object chain
                 */
                vm_object_lock(object);
            }
        }
    }
    /*
     * Two cases of map in faults:
     *  - At top level w/o copy object.
     *  - Read fault anywhere.
     *    --> must disallow write.
     */

    if (object == cur_object && object->copy == VM_OBJECT_NULL) {
    if (
        !fault_info.no_copy_on_read &&
        cur_object != object &&
        !cur_object->internal &&
        !cur_object->pager_trusted &&
        vm_protect_privileged_from_untrusted &&
        !((prot & VM_PROT_EXECUTE) &&
        cur_object->code_signed &&
        pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
        current_proc_is_privileged()) {
        /*
         * We're faulting on a page in "object" and
         * went down the shadow chain to "cur_object"
         * to find out that "cur_object"'s pager
         * is not "trusted", i.e. we can not trust it
         * to always return the same contents.
         * Since the target is a "privileged" process,
         * let's treat this as a copy-on-read fault, as
         * if it was a copy-on-write fault.
         * Once "object" gets a copy of this page, it
         * won't have to rely on "cur_object" to
         * provide the contents again.
         *
         * This is done by setting "need_copy" and
         * retrying the fault from the top with the
         * appropriate locking.
         *
         * Special case: if the mapping is executable
         * and the untrusted object is code-signed and
         * the process is "cs_enforced", we do not
         * copy-on-read because that would break
         * code-signing enforcement expectations (an
         * executable page must belong to a code-signed
         * object) and we can rely on code-signing
         * to re-validate the page if it gets evicted
         * and paged back in.
         */
        // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
        vm_copied_on_read++;

        vm_object_unlock(object);
        vm_object_unlock(cur_object);
        object_lock_type = OBJECT_LOCK_EXCLUSIVE;
        vm_map_unlock_read(map);
        if (real_map != map) {
            vm_map_unlock(real_map);
        }
    }
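    /*
     * Per the comment above, the intent here is to re-drive the fault
     * with copy semantics ("need_copy") so that "object" receives its
     * own copy of the page instead of mapping the untrusted one.
     */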
    if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
        if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
            prot &= ~VM_PROT_WRITE;
        } else {
            /*
             * For a protection that the pmap cares
             * about, we must hand over the full
             * set of protections (so that the pmap
             * layer can apply any desired policy).
             * This means that cs_bypass must be
             * set, as this can force us to pass
             */
            assert(fault_info.cs_bypass);
        }
    }
    if (object != cur_object) {
        /*
         * We still need to hold the top object
         * lock here to prevent a race between
         * a read fault (taking only "shared"
         * locks) and a write fault (taking
         * an "exclusive" lock on the top
         * Otherwise, as soon as we release the
         * top lock, the write fault could
         * proceed and actually complete before
         * the read fault, and the copied page's
         * translation could then be overwritten
         * by the read fault's translation for
         * the original page.
         *
         * Let's just record what the top object
         * is and we'll release it later.
         */
        top_object = object;

        /*
         * switch to the object that has the new page
         */
        object = cur_object;
        object_lock_type = cur_object_lock_type;
    }
    assert(m_object == VM_PAGE_OBJECT(m));

    /*
     * prepare for the pmap_enter...
     * object and map are both locked
     * m contains valid data
     * object == m->vmp_object
     * cur_object == NULL or it's been unlocked
     * no paging references on either object or cur_object
     */
    if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
        need_retry_ptr = &need_retry;
    } else {
        need_retry_ptr = NULL;
    }
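    /*
     * need_retry is only armed when the pmap enter may legitimately be
     * refused (a still-held top_object or a shared top-object lock);
     * in that case the fault is retried after asking the pmap layer to
     * expand its page tables (see the PMAP_OPTIONS_NOENTER call below).
     */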
    if (fault_page_size < PAGE_SIZE) {
        DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
        assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
            fault_phys_offset < PAGE_SIZE),
            "0x%llx\n", (uint64_t)fault_phys_offset);
    } else {
        assertf(fault_phys_offset == 0,
            "0x%llx\n", (uint64_t)fault_phys_offset);
    }
    kr = vm_fault_enter(m,

    kr = vm_fault_enter(m,
        vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
        &written_on_offset);

    top_object = VM_OBJECT_NULL;

    if (need_retry == TRUE) {
        /*
         * vm_fault_enter couldn't complete the PMAP_ENTER...
         * at this point we don't hold any locks so it's safe
         * to ask the pmap layer to expand the page table to
         * accommodate this mapping... once expanded, we'll
         * re-drive the fault which should result in vm_fault_enter
         * being able to successfully enter the mapping this time around
         */
        (void)pmap_enter_options(
            pmap, vaddr, 0, 0, 0, 0, 0,
            PMAP_OPTIONS_NOENTER, NULL);
    }
    /*
     * COPY ON WRITE FAULT
     */
    assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);

    /*
     * If objects match, then
     * object->copy must not be NULL (else control
     * would be in previous code block), and we
     * have a potential push into the copy object
     * which we can't cope with here.
     */
    if (cur_object == object) {
        /*
         * must take the slow path to
         * deal with the copy push
         */
    }

    /*
     * This is now a shadow based copy on write
     * fault -- it requires a copy up the shadow
     */
    assert(m_object == VM_PAGE_OBJECT(m));

    if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
        vm_fault_cs_need_validation(NULL, m, m_object,
        goto upgrade_lock_and_retry;
    }
    /*
     * Allocate a page in the original top level
     * object. Give up if allocate fails. Also
     * need to remember current page, as it's the
     * source of the copy.
     *
     * at this point we hold locks on both
     * object and cur_object... no need to take
     * paging refs or mark pages BUSY since
     * we don't drop either object lock until
     * the page has been copied and inserted
     */
    m = vm_page_grab_options(grab_options);

    if (m == VM_PAGE_NULL) {
        /*
         * no free page currently available...
         * must take the slow path
         */
    }
    /*
     * Now do the copy. Mark the source page busy...
     *
     * NOTE: This code holds the map lock across
     */
    vm_page_copy(cur_m, m);
    vm_page_insert(m, object, vm_object_trunc_page(offset));
    if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
        DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
    }
    SET_PAGE_DIRTY(m, FALSE);
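    /*
     * The new copy is marked dirty right away: it only exists in
     * "object" and has no backing store yet, so it must not be
     * treated as a clean copy of pager-backed data.
     */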
    /*
     * Now cope with the source page and object
     */
    if (object->ref_count > 1 && cur_m->vmp_pmapped) {
        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
    } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
        /*
         * We've copied the full 16K page but we're
         * about to call vm_fault_enter() only for
         * the 4K chunk we're faulting on. The other
         * three 4K chunks in that page could still
         * be pmapped in this pmap.
         * Since the VM object layer thinks that the
         * entire page has been dealt with and the
         * original page might no longer be needed,
         * it might collapse/bypass the original VM
         * object and free its pages, which would be
         * bad (and would trigger pmap_verify_free()
         * assertions) if the other 4K chunks are still
         *
         * XXX FBDP TODO4K: to be revisited
         * Technically, we need to pmap_disconnect()
         * only the target pmap's mappings for the 4K
         * chunks of this 16K VM page. If other pmaps
         * have PTEs on these chunks, that means that
         * the associated VM map must have a reference
         * on the VM object, so no need to worry about
         * pmap_protect() for each 4K chunk would be
         * better but we'd have to check which chunks
         * are actually mapped before and after this
         * A full-blown pmap_disconnect() is easier
         * for now but not efficient.
         */
        DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
    }
    if (cur_m->vmp_clustered) {
        VM_PAGE_COUNT_AS_PAGEIN(cur_m);
        VM_PAGE_CONSUME_CLUSTERED(cur_m);

        vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
    }
    need_collapse = TRUE;

    if (!cur_object->internal &&
        cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
        /*
         * The object from which we've just
         * copied a page is most probably backed
         * by a vnode. We don't want to waste too
         * much time trying to collapse the VM objects
         * and create a bottleneck when several tasks
         * map the same file.
         */
        if (cur_object->copy == object) {
            /*
             * Shared mapping or no COW yet.
             * We can never collapse a copy
             * object into its backing object.
             */
            need_collapse = FALSE;
        } else if (cur_object->copy == object->shadow &&
            object->shadow->resident_page_count == 0) {
            /*
             * Shared mapping after a COW occurred.
             */
            need_collapse = FALSE;
        }
    }
    vm_object_unlock(cur_object);

    if (need_collapse == FALSE) {
        vm_fault_collapse_skipped++;
    }
    vm_fault_collapse_total++;

    type_of_fault = DBG_COW_FAULT;
    VM_STAT_INCR(cow_faults);
    DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
    current_task()->cow_faults++;
    /*
     * No page at cur_object, cur_offset... m == NULL
     */
    if (cur_object->pager_created) {
        vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;

        if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
            uint8_t c_flags = C_DONT_BLOCK;
            bool insert_cur_object = FALSE;

            /*
             * May have to talk to a pager...
             * if so, take the slow path by
             * doing a 'break' from the while (TRUE) loop
             *
             * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
             * if the compressor is active and the page exists there
             */
            if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
            }
            if (map == kernel_map || real_map == kernel_map) {
                /*
                 * can't call into the compressor with the kernel_map
                 * lock held, since the compressor may try to operate
                 * on the kernel map in order to return an empty c_segment
                 */
            }
            if (object != cur_object) {
                if (fault_type & VM_PROT_WRITE) {
                    c_flags |= C_KEEP;
                } else {
                    insert_cur_object = TRUE;
                }
            }
            if (insert_cur_object == TRUE) {
                if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
                    cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;

                    if (vm_object_lock_upgrade(cur_object) == FALSE) {
                        /*
                         * couldn't upgrade so go do a full retry
                         * immediately since we can no longer be
                         * certain about cur_object (since we
                         * don't hold a reference on it)...
                         * first drop the top object lock
                         */
                        vm_object_unlock(object);

                        vm_map_unlock_read(map);
                        if (real_map != map) {
                            vm_map_unlock(real_map);
                        }
                    }
                }
            } else if (object_lock_type == OBJECT_LOCK_SHARED) {
                object_lock_type = OBJECT_LOCK_EXCLUSIVE;

                if (object != cur_object) {
                    /*
                     * we can't go for the upgrade on the top
                     * lock since the upgrade may block waiting
                     * for readers to drain... since we hold
                     * cur_object locked at this point, waiting
                     * for the readers to drain would represent
                     * a lock order inversion since the lock order
                     * for objects is the reference order in the
                     */
                    vm_object_unlock(object);
                    vm_object_unlock(cur_object);

                    vm_map_unlock_read(map);
                    if (real_map != map) {
                        vm_map_unlock(real_map);
                    }
                }
                if (vm_object_lock_upgrade(object) == FALSE) {
                    /*
                     * couldn't upgrade, so explicitly take the lock
                     * exclusively and go relookup the page since we
                     * will have dropped the object lock and
                     * a different thread could have inserted
                     * a page at this offset
                     * no need for a full retry since we're
                     * at the top level of the object chain
                     */
                    vm_object_lock(object);
                }
            }
            m = vm_page_grab_options(grab_options);

            if (m == VM_PAGE_NULL) {
                /*
                 * no free page currently available...
                 * must take the slow path
                 */
            }
            /*
             * The object is and remains locked
             * so no need to take a
             * "paging_in_progress" reference.
             */
            if ((object == cur_object &&
                object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
                (object != cur_object &&
                cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
                shared_lock = FALSE;
            }
            kr = vm_compressor_pager_get(
                (vm_object_trunc_page(cur_offset)
                + cur_object->paging_offset),
                VM_PAGE_GET_PHYS_PAGE(m),
                &compressed_count_delta);

            vm_compressor_pager_count(
                compressed_count_delta,

            if (kr != KERN_SUCCESS) {
                vm_page_release(m, FALSE);
            }
            /*
             * If vm_compressor_pager_get() returns
             * KERN_MEMORY_FAILURE, then the
             * compressed data is permanently lost,
             * so return this error immediately.
             */
            if (kr == KERN_MEMORY_FAILURE) {
                if (object != cur_object) {
                    vm_object_unlock(cur_object);
                }
                vm_object_unlock(object);
                vm_map_unlock_read(map);
                if (real_map != map) {
                    vm_map_unlock(real_map);
                }
            } else if (kr != KERN_SUCCESS) {
            }

            m->vmp_dirty = TRUE;
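            /*
             * The freshly decompressed page is marked dirty above:
             * its contents are not backed by a pager copy that could
             * be used to reconstruct it if it were reclaimed as a
             * clean page.
             */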
            /*
             * If the object is purgeable, its
             * owner's purgeable ledgers will be
             * updated in vm_page_insert() but the
             * page was also accounted for in a
             * "compressed purgeable" ledger, so
             */
            if (object != cur_object &&
                !insert_cur_object) {
                /*
                 * We're not going to insert
                 * the decompressed page into
                 * the object it came from.
                 *
                 * We're dealing with a
                 * copy-on-write fault on
                 * We're going to decompress
                 * the page directly into the
                 * target "object" while
                 * keeping the compressed
                 * page for "cur_object", so
                 * no ledger update in that
                 */
            } else if (((cur_object->purgable ==
                VM_PURGABLE_DENY) &&
                (!cur_object->vo_ledger_tag)) ||
                (cur_object->vo_owner ==
                /*
                 * "cur_object" is not purgeable
                 * and is not ledger-tagged, or
                 * there's no owner for it,
                 * so no owner's ledgers to
                 */
            } else {
                /*
                 * One less compressed
                 * purgeable/tagged page for
                 * cur_object's owner.
                 */
                vm_object_owner_compressed_update(
            if (insert_cur_object) {
                vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
                m_object = cur_object;
            } else {
                vm_page_insert(m, object, vm_object_trunc_page(offset));
            }

            if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
                /*
                 * If the page is not cacheable,
                 * we can't let its contents
                 * linger in the data cache
                 * after the decompression.
                 */
                pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
            }

            type_of_fault = my_fault_type;

            VM_STAT_DECOMPRESSIONS();

            if (cur_object != object) {
                if (insert_cur_object) {
                    top_object = object;
                    /*
                     * switch to the object that has the new page
                     */
                    object = cur_object;
                    object_lock_type = cur_object_lock_type;
                } else {
                    vm_object_unlock(cur_object);
                    cur_object = object;
                }
            }
        /*
         * existence map present and indicates
         * that the pager doesn't have this page
         */
    }
    if (cur_object->shadow == VM_OBJECT_NULL ||
        resilient_media_retry) {
        /*
         * Zero fill fault. Page gets
         * inserted into the original object.
         */
        if (cur_object->shadow_severed ||
            VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
            cur_object == compressor_object ||
            cur_object == kernel_object ||
            cur_object == vm_submap_object) {
            if (object != cur_object) {
                vm_object_unlock(cur_object);
            }
            vm_object_unlock(object);

            vm_map_unlock_read(map);
            if (real_map != map) {
                vm_map_unlock(real_map);
            }
            kr = KERN_MEMORY_ERROR;
        }
        if (cur_object != object) {
            vm_object_unlock(cur_object);

            cur_object = object;
        }
        if (object_lock_type == OBJECT_LOCK_SHARED) {
            object_lock_type = OBJECT_LOCK_EXCLUSIVE;

            if (vm_object_lock_upgrade(object) == FALSE) {
                /*
                 * couldn't upgrade so do a full retry on the fault
                 * since we dropped the object lock which
                 * could allow another thread to insert
                 * a page at this offset
                 */
                vm_map_unlock_read(map);
                if (real_map != map) {
                    vm_map_unlock(real_map);
                }
            }
        }
        if (!object->internal) {
            panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
        }
        m = vm_page_alloc(object, vm_object_trunc_page(offset));

        if (m == VM_PAGE_NULL) {
            /*
             * no free page currently available...
             * must take the slow path
             */
        }

        /*
         * Zeroing the page and entering it into the pmap
         * represents a significant amount of the zero fill fault handler's work.
         *
         * To improve fault scalability, we'll drop the object lock, if it appears contended,
         * now that we've inserted the page into the vm object.
         * Before dropping the lock, we need to check protection bits and set the
         * mapped bits on the page. Then we can mark the page busy, drop the lock,
         * zero it, and do the pmap enter. We'll need to reacquire the lock
         * to clear the busy bit and wake up any waiters.
         */
        vm_fault_cs_clear(m);
        m->vmp_pmapped = TRUE;
        if (map->no_zero_fill) {
            type_of_fault = DBG_NZF_PAGE_FAULT;
        } else {
            type_of_fault = DBG_ZERO_FILL_FAULT;
        }
        pmap_t destination_pmap;
        vm_map_offset_t destination_pmap_vaddr;
        vm_prot_t enter_fault_type;
        if (caller_pmap) {
            destination_pmap = caller_pmap;
            destination_pmap_vaddr = caller_pmap_addr;
        } else {
            destination_pmap = pmap;
            destination_pmap_vaddr = vaddr;
        }
        if (change_wiring) {
            enter_fault_type = VM_PROT_NONE;
        } else {
            enter_fault_type = caller_prot;
        }
        kr = vm_fault_enter_prepare(m,
            destination_pmap_vaddr,
            &page_needs_data_sync);
        if (kr != KERN_SUCCESS) {
            goto zero_fill_cleanup;
        }
        if (object_is_contended) {
            /*
             * At this point the page is in the vm object, but not on a paging queue.
             * Since it's accessible to another thread but its contents are invalid
             * (it hasn't been zeroed) mark it busy before dropping the object lock.
             */
            vm_object_unlock(object);
        }
        if (type_of_fault == DBG_ZERO_FILL_FAULT) {
            /*
             * Now zero fill page...
             * the page is probably going to
             * be written soon, so don't bother
             * to clear the modified bit
             *
             * NOTE: This code holds the map
             * lock across the zero fill.
             */
            vm_page_zero_fill(m);
            VM_STAT_INCR(zero_fill_count);
            DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
        }
        if (page_needs_data_sync) {
            pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
        }

        if (top_object != VM_OBJECT_NULL) {
            need_retry_ptr = &need_retry;
        } else {
            need_retry_ptr = NULL;
        }
        if (object_is_contended) {
            kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
                fault_page_size, fault_phys_offset,
                m, &prot, caller_prot, enter_fault_type, wired,
                fault_info.pmap_options, need_retry_ptr);
            vm_object_lock(object);
        } else {
            kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
                fault_page_size, fault_phys_offset,
                m, &prot, caller_prot, enter_fault_type, wired,
                fault_info.pmap_options, need_retry_ptr);
        }
        if (!VM_DYNAMIC_PAGING_ENABLED() &&
            (object->purgable == VM_PURGABLE_DENY ||
            object->purgable == VM_PURGABLE_NONVOLATILE ||
            object->purgable == VM_PURGABLE_VOLATILE)) {
            vm_page_lockspin_queues();
            if (!VM_DYNAMIC_PAGING_ENABLED()) {
                vm_fault_enqueue_throttled_locked(m);
            }
            vm_page_unlock_queues();
        }
        vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);

            &written_on_offset);

        top_object = VM_OBJECT_NULL;

        if (need_retry == TRUE) {
            /*
             * vm_fault_enter couldn't complete the PMAP_ENTER...
             * at this point we don't hold any locks so it's safe
             * to ask the pmap layer to expand the page table to
             * accommodate this mapping... once expanded, we'll
             * re-drive the fault which should result in vm_fault_enter
             * being able to successfully enter the mapping this time around
             */
            (void)pmap_enter_options(
                pmap, vaddr, 0, 0, 0, 0, 0,
                PMAP_OPTIONS_NOENTER, NULL);
        }
    /*
     * On to the next level in the shadow chain
     */
    cur_offset += cur_object->vo_shadow_offset;
    new_object = cur_object->shadow;
    fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);

    /*
     * take the new_object's lock with the indicated state
     */
    if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
        vm_object_lock_shared(new_object);
    } else {
        vm_object_lock(new_object);
    }

    if (cur_object != object) {
        vm_object_unlock(cur_object);
    }
    cur_object = new_object;
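    /*
     * Loop around with cur_object/cur_offset now pointing one level
     * further down the shadow chain; the top-level "object" and
     * "offset" are left untouched so a copy-up can still target them.
     */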
/*
 * Cleanup from fast fault failure. Drop any object
 * lock other than original and drop map lock.
 */
if (object != cur_object) {
    vm_object_unlock(cur_object);
}

/*
 * must own the object lock exclusively at this point
 */
if (object_lock_type == OBJECT_LOCK_SHARED) {
    object_lock_type = OBJECT_LOCK_EXCLUSIVE;

    if (vm_object_lock_upgrade(object) == FALSE) {
        /*
         * couldn't upgrade, so explicitly
         * take the lock exclusively
         * no need to retry the fault at this
         * point since "vm_fault_page" will
         * completely re-evaluate the state
         */
        vm_object_lock(object);
    }
}

vm_map_unlock_read(map);
if (real_map != map) {
    vm_map_unlock(real_map);
}
if (__improbable(object == compressor_object ||
    object == kernel_object ||
    object == vm_submap_object)) {
    /*
     * These objects are explicitly managed and populated by the
     * kernel. The virtual ranges backed by these objects should
     * either have wired pages or "holes" that are not supposed to
     * be accessed at all until they get explicitly populated.
     * We should never have to resolve a fault on a mapping backed
     * by one of these VM objects and providing a zero-filled page
     * would be wrong here, so let's fail the fault and let the
     * caller crash or recover.
     */
    vm_object_unlock(object);
    kr = KERN_MEMORY_ERROR;
}
assert(object != compressor_object);
assert(object != kernel_object);
assert(object != vm_submap_object);
if (resilient_media_retry) {
    /*
     * We could get here if we failed to get a free page
     * to zero-fill and had to take the slow path again.
     * Reset our "recovery-from-failed-media" state.
     */
    assert(resilient_media_object != VM_OBJECT_NULL);
    assert(resilient_media_offset != (vm_object_offset_t)-1);
    /* release our extra reference on failed object */
    // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
    vm_object_deallocate(resilient_media_object);
    resilient_media_object = VM_OBJECT_NULL;
    resilient_media_offset = (vm_object_offset_t)-1;
    resilient_media_retry = FALSE;
}
/*
 * Make a reference to this object to
 * prevent its disposal while we are messing with
 * it. Once we have the reference, the map is free
 * to be diddled. Since objects reference their
 * shadows (and copies), they will stay around as well.
 */
vm_object_reference_locked(object);
vm_object_paging_begin(object);

set_thread_pagein_error(cthread, 0);

result_page = VM_PAGE_NULL;
kr = vm_fault_page(object, offset, fault_type,
    (change_wiring && !wired),
    FALSE, /* page not looked up */
    &prot, &result_page, &top_page,
    &error_code, map->no_zero_fill,
    FALSE, &fault_info);
/*
 * if kr != VM_FAULT_SUCCESS, then the paging reference
 * has been dropped and the object unlocked... the ref_count
 *
 * if kr == VM_FAULT_SUCCESS, then the paging reference
 * is still held along with the ref_count on the original object
 *
 * the object is returned locked with a paging reference
 *
 * if top_page != NULL, then it's BUSY and the
 * object it belongs to has a paging reference
 * but is returned unlocked
 */
if (kr != VM_FAULT_SUCCESS &&
    kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
    if (kr == VM_FAULT_MEMORY_ERROR &&
        fault_info.resilient_media) {
        assertf(object->internal, "object %p", object);
        /*
         * This fault failed but the mapping was
         * "media resilient", so we'll retry the fault in
         * recovery mode to get a zero-filled page in the
         * Keep the reference on the failing object so
         * that we can check that the mapping is still
         * pointing to it when we retry the fault.
         */
        // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
        assert(!resilient_media_retry); /* no double retry */
        assert(resilient_media_object == VM_OBJECT_NULL);
        assert(resilient_media_offset == (vm_object_offset_t)-1);
        resilient_media_retry = TRUE;
        resilient_media_object = object;
        resilient_media_offset = offset;
        // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
        /*
         * we didn't succeed, lose the object reference
         */
        vm_object_deallocate(object);
        object = VM_OBJECT_NULL; /* no longer valid */
    }

    /*
     * See why we failed, and take corrective action.
     */
    switch (kr) {
    case VM_FAULT_MEMORY_SHORTAGE:
        if (vm_page_wait((change_wiring) ?
            THREAD_ABORTSAFE)) {
        }
    case VM_FAULT_INTERRUPTED:
    case VM_FAULT_RETRY:
    case VM_FAULT_MEMORY_ERROR:
        kr = KERN_MEMORY_ERROR;
    default:
        panic("vm_fault: unexpected error 0x%x from "
            "vm_fault_page()\n", kr);
    }
}
if (m != VM_PAGE_NULL) {
    m_object = VM_PAGE_OBJECT(m);
    assert((change_wiring && !wired) ?
        (top_page == VM_PAGE_NULL) :
        ((top_page == VM_PAGE_NULL) == (m_object == object)));
}

/*
 * What to do with the resulting page from vm_fault_page
 * if it doesn't get entered into the physical map:
 */
#define RELEASE_PAGE(m) \
	PAGE_WAKEUP_DONE(m); \
	if ( !VM_PAGE_PAGEABLE(m)) { \
		vm_page_lockspin_queues(); \
		if ( !VM_PAGE_PAGEABLE(m)) \
			vm_page_activate(m); \
		vm_page_unlock_queues(); \
object_locks_dropped = FALSE;
/*
 * We must verify that the maps have not changed
 * since our last lookup. vm_map_verify() needs the
 * map lock (shared) but we are holding object locks.
 * So we do a try_lock() first and, if that fails, we
 * drop the object locks and go in for the map lock again.
 */
if (!vm_map_try_lock_read(original_map)) {
    if (m != VM_PAGE_NULL) {
        old_copy_object = m_object->copy;
        vm_object_unlock(m_object);
    } else {
        old_copy_object = VM_OBJECT_NULL;
        vm_object_unlock(object);
    }

    object_locks_dropped = TRUE;

    vm_map_lock_read(original_map);
}

if ((map != original_map) || !vm_map_verify(map, &version)) {
    if (object_locks_dropped == FALSE) {
        if (m != VM_PAGE_NULL) {
            old_copy_object = m_object->copy;
            vm_object_unlock(m_object);
        } else {
            old_copy_object = VM_OBJECT_NULL;
            vm_object_unlock(object);
        }

        object_locks_dropped = TRUE;
    }
    /*
     * no object locks are held at this point
     */
    vm_object_t retry_object;
    vm_object_offset_t retry_offset;
    vm_prot_t retry_prot;

    /*
     * To avoid trying to write_lock the map while another
     * thread has it read_locked (in vm_map_pageable), we
     * do not try for write permission. If the page is
     * still writable, we will get write permission. If it
     * is not, or has been marked needs_copy, we enter the
     * mapping without write permission, and will merely
     * take another fault.
     */
    kr = vm_map_lookup_locked(&map, vaddr,
        fault_type & ~VM_PROT_WRITE,
        OBJECT_LOCK_EXCLUSIVE, &version,
        &retry_object, &retry_offset, &retry_prot,

    pmap = real_map->pmap;
    if (kr != KERN_SUCCESS) {
        vm_map_unlock_read(map);

        if (m != VM_PAGE_NULL) {
            assert(VM_PAGE_OBJECT(m) == m_object);

            /*
             * retake the lock so that
             * we can drop the paging reference
             * in vm_fault_cleanup and do the
             * PAGE_WAKEUP_DONE in RELEASE_PAGE
             */
            vm_object_lock(m_object);

            vm_fault_cleanup(m_object, top_page);
        } else {
            /*
             * retake the lock so that
             * we can drop the paging reference
             * in vm_fault_cleanup
             */
            vm_object_lock(object);

            vm_fault_cleanup(object, top_page);
        }
        vm_object_deallocate(object);
    }
    vm_object_unlock(retry_object);

    if ((retry_object != object) || (retry_offset != offset)) {
        vm_map_unlock_read(map);
        if (real_map != map) {
            vm_map_unlock(real_map);
        }

        if (m != VM_PAGE_NULL) {
            assert(VM_PAGE_OBJECT(m) == m_object);

            /*
             * retake the lock so that
             * we can drop the paging reference
             * in vm_fault_cleanup and do the
             * PAGE_WAKEUP_DONE in RELEASE_PAGE
             */
            vm_object_lock(m_object);

            vm_fault_cleanup(m_object, top_page);
        } else {
            /*
             * retake the lock so that
             * we can drop the paging reference
             * in vm_fault_cleanup
             */
            vm_object_lock(object);

            vm_fault_cleanup(object, top_page);
        }
        vm_object_deallocate(object);
    }
    /*
     * Check whether the protection has changed or the object
     * has been copied while we left the map unlocked.
     */
    if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
        /* If the pmap layer cares, pass the full set. */
    }

    if (object_locks_dropped == TRUE) {
        if (m != VM_PAGE_NULL) {
            vm_object_lock(m_object);

            if (m_object->copy != old_copy_object) {
                /*
                 * The copy object changed while the top-level object
                 * was unlocked, so take away write permission.
                 */
                assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
                prot &= ~VM_PROT_WRITE;
            }
        } else {
            vm_object_lock(object);
        }

        object_locks_dropped = FALSE;
    }
    if (
        !fault_info.no_copy_on_read &&
        m != VM_PAGE_NULL &&
        VM_PAGE_OBJECT(m) != object &&
        !VM_PAGE_OBJECT(m)->pager_trusted &&
        vm_protect_privileged_from_untrusted &&
        !((prot & VM_PROT_EXECUTE) &&
        VM_PAGE_OBJECT(m)->code_signed &&
        pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
        current_proc_is_privileged()) {
        /*
         * We found the page we want in an "untrusted" VM object
         * down the shadow chain. Since the target is "privileged"
         * we want to perform a copy-on-read of that page, so that the
         * mapped object gets a stable copy and does not have to
         * rely on the "untrusted" object to provide the same
         * contents if the page gets reclaimed and has to be paged
         * in again later on.
         *
         * Special case: if the mapping is executable and the untrusted
         * object is code-signed and the process is "cs_enforced", we
         * do not copy-on-read because that would break code-signing
         * enforcement expectations (an executable page must belong
         * to a code-signed object) and we can rely on code-signing
         * to re-validate the page if it gets evicted and paged back in.
         */
        // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
        vm_copied_on_read++;
        need_copy_on_read = TRUE;
    } else {
        need_copy_on_read = FALSE;
    }
    /*
     * If we want to wire down this page, but no longer have
     * adequate permissions, we must start all over.
     * If we decided to copy-on-read, we must also start all over.
     */
    if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
        need_copy_on_read) {
        vm_map_unlock_read(map);
        if (real_map != map) {
            vm_map_unlock(real_map);
        }

        if (m != VM_PAGE_NULL) {
            assert(VM_PAGE_OBJECT(m) == m_object);

            vm_fault_cleanup(m_object, top_page);
        } else {
            vm_fault_cleanup(object, top_page);
        }

        vm_object_deallocate(object);
    }
    if (m != VM_PAGE_NULL) {
        /*
         * Put this page into the physical map.
         * We had to do the unlock above because pmap_enter
         * may cause other faults. The page may be on
         * the pageout queues. If the pageout daemon comes
         * across the page, it will remove it from the queues.
         */
        if (fault_page_size < PAGE_SIZE) {
            DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
            assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
                fault_phys_offset < PAGE_SIZE),
                "0x%llx\n", (uint64_t)fault_phys_offset);
        } else {
            assertf(fault_phys_offset == 0,
                "0x%llx\n", (uint64_t)fault_phys_offset);
        }
        kr = vm_fault_enter(m,

        kr = vm_fault_enter(m,

        assert(VM_PAGE_OBJECT(m) == m_object);

        if (m_object->internal) {
            event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
        } else if (m_object->object_is_shared_cache) {
            event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
        } else {
            event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
        }

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0);
        KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0);

        DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
        if (kr != KERN_SUCCESS) {
            /* abort this page fault */
            vm_map_unlock_read(map);
            if (real_map != map) {
                vm_map_unlock(real_map);
            }
            PAGE_WAKEUP_DONE(m);
            vm_fault_cleanup(m_object, top_page);
            vm_object_deallocate(object);
        }
        if (physpage_p != NULL) {
            /* for vm_map_wire_and_extract() */
            *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
            if (prot & VM_PROT_WRITE) {
                vm_object_lock_assert_exclusive(m_object);
                m->vmp_dirty = TRUE;
            }
        }
    } else {
        vm_map_entry_t entry;
        vm_map_offset_t laddr;
        vm_map_offset_t ldelta, hdelta;

        /*
         * do a pmap block mapping from the physical address
         */
        if (real_map != map) {
            vm_map_unlock(real_map);
        }

        if (original_map != map) {
            vm_map_unlock_read(map);
            vm_map_lock_read(original_map);
        }

        hdelta = 0xFFFFF000;
        ldelta = 0xFFFFF000;
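        /*
         * ldelta/hdelta start out as generous bounds and are trimmed
         * below to the distance from the faulting address to the
         * enclosing map entry's start and end, so the block mapping
         * never spans beyond the entry.
         */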
        while (vm_map_lookup_entry(map, laddr, &entry)) {
            if (ldelta > (laddr - entry->vme_start)) {
                ldelta = laddr - entry->vme_start;
            }
            if (hdelta > (entry->vme_end - laddr)) {
                hdelta = entry->vme_end - laddr;
            }
            if (entry->is_sub_map) {
                laddr = ((laddr - entry->vme_start)
                    + VME_OFFSET(entry));
                vm_map_lock_read(VME_SUBMAP(entry));

                if (map != real_map) {
                    vm_map_unlock_read(map);
                }
                if (entry->use_pmap) {
                    vm_map_unlock_read(real_map);
                    real_map = VME_SUBMAP(entry);
                }
                map = VME_SUBMAP(entry);
            }
        }

        if (vm_map_lookup_entry(map, laddr, &entry) &&
            (VME_OBJECT(entry) != NULL) &&
            (VME_OBJECT(entry) == object)) {
            if (!object->pager_created &&
                object->phys_contiguous &&
                VME_OFFSET(entry) == 0 &&
                (entry->vme_end - entry->vme_start == object->vo_size) &&
                VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
                superpage = VM_MEM_SUPERPAGE;
            }

            if (superpage && physpage_p) {
                /* for vm_map_wire_and_extract() */
                *physpage_p = (ppnum_t)
                    ((((vm_map_offset_t)
                    object->vo_shadow_offset)
                    + (laddr - entry->vme_start))
            if (caller_pmap) {
                /*
                 * Set up a block mapped area
                 */
                assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
                kr = pmap_map_block(caller_pmap,
                    (addr64_t)(caller_pmap_addr - ldelta),
                    (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
                    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
                    (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
                    (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);

                if (kr != KERN_SUCCESS) {
                }
            } else {
                /*
                 * Set up a block mapped area
                 */
                assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
                kr = pmap_map_block(real_map->pmap,
                    (addr64_t)(vaddr - ldelta),
                    (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
                    VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
                    (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
                    (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);

                if (kr != KERN_SUCCESS) {
/*
 * TODO: could most of the done cases just use cleanup?
 */

/*
 * Unlock everything, and return
 */
vm_map_unlock_read(map);
if (real_map != map) {
    vm_map_unlock(real_map);
}

if (m != VM_PAGE_NULL) {
    assert(VM_PAGE_OBJECT(m) == m_object);

    if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
        vm_object_paging_begin(m_object);

        assert(written_on_object == VM_OBJECT_NULL);
        written_on_object = m_object;
        written_on_pager = m_object->pager;
        written_on_offset = m_object->paging_offset + m->vmp_offset;
    }
    PAGE_WAKEUP_DONE(m);

    vm_fault_cleanup(m_object, top_page);
} else {
    vm_fault_cleanup(object, top_page);
}

vm_object_deallocate(object);

thread_interrupt_level(interruptible_state);
if (resilient_media_object != VM_OBJECT_NULL) {
    assert(resilient_media_retry);
    assert(resilient_media_offset != (vm_object_offset_t)-1);
    /* release extra reference on failed object */
    // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
    vm_object_deallocate(resilient_media_object);
    resilient_media_object = VM_OBJECT_NULL;
    resilient_media_offset = (vm_object_offset_t)-1;
    resilient_media_retry = FALSE;
} else {
    assert(!resilient_media_retry);
}
/*
 * Only I/O throttle on faults which cause a pagein/swapin.
 */
if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
    throttle_lowpri_io(1);
}
if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
    if ((throttle_delay = vm_page_throttled(TRUE))) {
        if (vm_debug_events) {
            if (type_of_fault == DBG_COMPRESSOR_FAULT) {
                VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
            } else if (type_of_fault == DBG_COW_FAULT) {
                VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
            } else {
                VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
            }
        }
        delay(throttle_delay);
    }
}
if (written_on_object) {
    vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);

    vm_object_lock(written_on_object);
    vm_object_paging_end(written_on_object);
    vm_object_unlock(written_on_object);

    written_on_object = VM_OBJECT_NULL;
}

if (rtfault) {
    vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
}
KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
    (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
    ((uint64_t)trace_vaddr >> 32),
    vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),

if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
    DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
}
/*
 * vm_fault_wire:
 *
 * Wire down a range of virtual addresses in a map.
 */
kern_return_t
vm_fault_wire(
    vm_map_entry_t entry,
    vm_map_offset_t pmap_addr,
    ppnum_t *physpage_p)
{
    vm_map_offset_t end_addr = entry->vme_end;
    vm_map_size_t effective_page_size;

    assert(entry->in_transition);

    if ((VME_OBJECT(entry) != NULL) &&
        !entry->is_sub_map &&
        VME_OBJECT(entry)->phys_contiguous) {
        return KERN_SUCCESS;
    }

    /*
     * Inform the physical mapping system that the
     * range of addresses may not fault, so that
     * page tables and such can be locked down as well.
     */
    pmap_pageable(pmap, pmap_addr,
        pmap_addr + (end_addr - entry->vme_start), FALSE);

    /*
     * We simulate a fault to get the page and enter it
     * in the physical map.
     */
    effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
    for (va = entry->vme_start;
        va += effective_page_size) {
        rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
            pmap_addr + (va - entry->vme_start),
        if (rc != KERN_SUCCESS) {
            rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
                ((pmap == kernel_pmap)
                ? THREAD_UNINT
                : THREAD_ABORTSAFE),
                (va - entry->vme_start)),

            DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
        }

        if (rc != KERN_SUCCESS) {
            struct vm_map_entry tmp_entry = *entry;

            /* unwire wired pages */
            tmp_entry.vme_end = va;
            vm_fault_unwire(map,
                &tmp_entry, FALSE, pmap, pmap_addr);
        }
    }
    return KERN_SUCCESS;
}
/*
 * vm_fault_unwire:
 *
 * Unwire a range of virtual addresses in a map.
 */
vm_fault_unwire(
    vm_map_entry_t entry,
    boolean_t deallocate,
    vm_map_offset_t pmap_addr)
{
    vm_map_offset_t end_addr = entry->vme_end;
    struct vm_object_fault_info fault_info = {};
    unsigned int unwired_pages;
    vm_map_size_t effective_page_size;

    object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);

    /*
     * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
     * do anything since such memory is wired by default. So we don't have
     * anything to undo here.
     */
    if (object != VM_OBJECT_NULL && object->phys_contiguous) {
        return;
    }

    fault_info.interruptible = THREAD_UNINT;
    fault_info.behavior = entry->behavior;
    fault_info.user_tag = VME_ALIAS(entry);
    if (entry->iokit_acct ||
        (!entry->is_sub_map && !entry->use_pmap)) {
        fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
    }
    fault_info.lo_offset = VME_OFFSET(entry);
    fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
    fault_info.no_cache = entry->no_cache;
    fault_info.stealth = TRUE;
    /*
     * Since the pages are wired down, we must be able to
     * get their mappings from the physical map system.
     */
    effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
    for (va = entry->vme_start;
        va += effective_page_size) {
        if (object == VM_OBJECT_NULL) {
            pmap_change_wiring(pmap,
                pmap_addr + (va - entry->vme_start), FALSE);

            (void) vm_fault(map, va, VM_PROT_NONE,
                TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
        } else {
            vm_page_t result_page;
            vm_object_t result_object;
            vm_fault_return_t result;

            /* cap cluster size at maximum UPL size */
            upl_size_t cluster_size;
            if (os_sub_overflow(end_addr, va, &cluster_size)) {
                cluster_size = 0 - (upl_size_t)PAGE_SIZE;
            }
            fault_info.cluster_size = cluster_size;
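            /*
             * If the remaining range does not fit in a upl_size_t, the
             * cluster size is clamped to the largest page-aligned value
             * the type can carry (0 - PAGE_SIZE as an unsigned
             * upl_size_t).
             */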
            do {
                prot = VM_PROT_NONE;

                vm_object_lock(object);
                vm_object_paging_begin(object);
                result_page = VM_PAGE_NULL;
                result = vm_fault_page(
                    (VME_OFFSET(entry) +
                    (va - entry->vme_start)),
                    FALSE, /* page not looked up */
                    &prot, &result_page, &top_page,
                    NULL, map->no_zero_fill,
                    FALSE, &fault_info);
            } while (result == VM_FAULT_RETRY);
            /*
             * If this was a mapping to a file on a device that has been forcibly
             * unmounted, then we won't get a page back from vm_fault_page(). Just
             * move on to the next one in case the remaining pages are mapped from
             * different objects. During a forced unmount, the object is terminated
             * so the alive flag will be false if this happens. A forced unmount
             * will occur when an external disk is unplugged before the user does an
             * eject, so we don't want to panic in that situation.
             */
            if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
            }

            if (result == VM_FAULT_MEMORY_ERROR &&
                object == kernel_object) {
                /*
                 * This must have been allocated with
                 * KMA_KOBJECT and KMA_VAONLY and there's
                 * no physical page at this offset.
                 * We're done (no page to free).
                 */
            }

            if (result != VM_FAULT_SUCCESS) {
                panic("vm_fault_unwire: failure");
            }

            result_object = VM_PAGE_OBJECT(result_page);

            if (deallocate) {
                assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
                    vm_page_fictitious_addr);
                pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
                if (VM_PAGE_WIRED(result_page)) {
                }
                VM_PAGE_FREE(result_page);
            } else {
                if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
                    pmap_change_wiring(pmap,
                        pmap_addr + (va - entry->vme_start), FALSE);
                }

                if (VM_PAGE_WIRED(result_page)) {
                    vm_page_lockspin_queues();
                    vm_page_unwire(result_page, TRUE);
                    vm_page_unlock_queues();
                }
                if (entry->zero_wired_pages) {
                    pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
                    entry->zero_wired_pages = FALSE;
                }

                PAGE_WAKEUP_DONE(result_page);
            }
            vm_fault_cleanup(result_object, top_page);
        }
    }

    /*
     * Inform the physical mapping system that the range
     * of addresses may fault, so that page tables and
     * such may be unwired themselves.
     */
    pmap_pageable(pmap, pmap_addr,
        pmap_addr + (end_addr - entry->vme_start), TRUE);

    if (kernel_object == object) {
        /*
         * Would like to make user_tag in vm_object_fault_info
         * vm_tag_t (unsigned short) but user_tag derives its value from
         * VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
         * to an _unsigned int_ which is used by non-fault_info paths throughout the
         * code at many places.
         *
         * So, for now, an explicit truncation to unsigned short (vm_tag_t).
         */
        assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
            "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
        vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
    }
}
/*
 * vm_fault_wire_fast:
 *
 * Handle common case of a wire down page fault at the given address.
 * If successful, the page is inserted into the associated physical map.
 * The map entry is passed in to avoid the overhead of a map lookup.
 *
 * NOTE: the given address should be truncated to the
 * proper page address.
 *
 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
 * a standard error specifying why the fault is fatal is returned.
 *
 * The map in question must be referenced, and remains so.
 * Caller has a read lock on the map.
 *
 * This is a stripped version of vm_fault() for wiring pages. Anything
 * other than the common case will return KERN_FAILURE, and the caller
 * is expected to call vm_fault().
 */
static kern_return_t
vm_fault_wire_fast(
    __unused vm_map_t map,
    __unused vm_prot_t caller_prot,
    vm_map_entry_t entry,
    vm_map_offset_t pmap_addr,
    ppnum_t *physpage_p)
{
    vm_object_offset_t offset;
    thread_t thread = current_thread();
    vm_map_size_t fault_page_size;
    vm_map_offset_t fault_phys_offset;
    struct vm_object_fault_info fault_info = {};

    VM_STAT_INCR(faults);

    if (thread != THREAD_NULL && thread->task != TASK_NULL) {
        thread->task->faults++;
    }
#undef  RELEASE_PAGE
#define RELEASE_PAGE(m) {                               \
	PAGE_WAKEUP_DONE(m);                            \
	vm_page_lockspin_queues();                      \
	vm_page_unwire(m, TRUE);                        \
	vm_page_unlock_queues();                        \
}

#undef  UNLOCK_THINGS
#define UNLOCK_THINGS   {                               \
	vm_object_paging_end(object);                   \
	vm_object_unlock(object);                       \
}

#undef  UNLOCK_AND_DEALLOCATE
#define UNLOCK_AND_DEALLOCATE   {                       \
	UNLOCK_THINGS;                                  \
	vm_object_deallocate(object);                   \
}
/*
 *	Give up and have caller do things the hard way.
 */

#define GIVE_UP {                                       \
	UNLOCK_AND_DEALLOCATE;                          \
	return(KERN_FAILURE);                           \
}
	/*
	 *	If this entry is not directly to a vm_object, bail out.
	 */
	if (entry->is_sub_map) {
		assert(physpage_p == NULL);
		return KERN_FAILURE;
	}

	/*
	 *	Find the backing store object and offset into it.
	 */

	object = VME_OBJECT(entry);
	offset = (va - entry->vme_start) + VME_OFFSET(entry);
	prot = entry->protection;
	/*
	 *	Make a reference to this object to prevent its
	 *	disposal while we are messing with it.
	 */

	vm_object_lock(object);
	vm_object_reference_locked(object);
	vm_object_paging_begin(object);

	/*
	 *	INVARIANTS (through entire routine):
	 *
	 *	1)	At all times, we must either have the object
	 *		lock or a busy page in some object to prevent
	 *		some other thread from trying to bring in
	 *		the same page.
	 *
	 *	2)	Once we have a busy page, we must remove it from
	 *		the pageout queues, so that the pageout daemon
	 *		will not grab it away.
	 */
	/*
	 *	Look for page in top-level object.  If it's not there or
	 *	there's something going on, give up.
	 */
	m = vm_page_lookup(object, vm_object_trunc_page(offset));
	if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
	    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
		GIVE_UP;
	}
	if (m->vmp_fictitious &&
	    VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
		/*
		 *	Guard pages are fictitious pages and are never
		 *	entered into a pmap, so let's say it's been wired...
		 */
		kr = KERN_SUCCESS;
		goto done;
	}

	/*
	 *	Wire the page down now.  All bail outs beyond this
	 *	point must unwire the page.
	 */

	vm_page_lockspin_queues();
	vm_page_wire(m, wire_tag, TRUE);
	vm_page_unlock_queues();
	/*
	 *	Mark page busy for other threads.
	 */
	assert(!m->vmp_busy);
	m->vmp_busy = TRUE;
	assert(!m->vmp_absent);

	/*
	 *	Give up if the page is being written and there's a copy object
	 */
	if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}
	fault_info.user_tag = VME_ALIAS(entry);
	fault_info.pmap_options = 0;
	if (entry->iokit_acct ||
	    (!entry->is_sub_map && !entry->use_pmap)) {
		fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
	}

	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
	fault_phys_offset = offset - vm_object_trunc_page(offset);
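
	/*
	 * Illustrative sketch (not part of the original source): with a map
	 * whose page size may be smaller than the kernel page size, the two
	 * values computed above reduce to
	 *
	 *	fault_page_size   = min(map page size, kernel page size);
	 *	fault_phys_offset = offset - trunc_to_kernel_page(offset);
	 *
	 * e.g., assuming a 4K map page size and a 16K kernel page size, an
	 * offset of 0x6000 gives fault_page_size = 0x1000 and
	 * fault_phys_offset = 0x2000 (its position within the 16K page).
	 */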
	/*
	 *	Put this page into the physical map.
	 */
	type_of_fault = DBG_CACHE_HIT_FAULT;
	kr = vm_fault_enter(m,
	    pmap,
	    pmap_addr,
	    fault_page_size,
	    fault_phys_offset,
	    prot,
	    prot,
	    TRUE,                   /* wired */
	    FALSE,                  /* change_wiring */
	    wire_tag,
	    &fault_info,
	    NULL,                   /* need_retry */
	    &type_of_fault);
	if (kr != KERN_SUCCESS) {
		RELEASE_PAGE(m);
		GIVE_UP;
	}
done:
	/*
	 *	Unlock everything, and return
	 */

	if (physpage_p) {
		/* for vm_map_wire_and_extract() */
		if (kr == KERN_SUCCESS) {
			assert(object == VM_PAGE_OBJECT(m));
			*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
			if (prot & VM_PROT_WRITE) {
				vm_object_lock_assert_exclusive(object);
				m->vmp_dirty = TRUE;
			}
		} else {
			*physpage_p = 0;
		}
	}

	PAGE_WAKEUP_DONE(m);
	UNLOCK_AND_DEALLOCATE;

	return kr;
}
/*
 *	Routine:	vm_fault_copy_cleanup
 *	Purpose:
 *		Release a page used by vm_fault_copy.
 */

static void
vm_fault_copy_cleanup(
	vm_page_t       page,
	vm_page_t       top_page)
{
	vm_object_t     object = VM_PAGE_OBJECT(page);

	vm_object_lock(object);
	PAGE_WAKEUP_DONE(page);
	if (!VM_PAGE_PAGEABLE(page)) {
		vm_page_lockspin_queues();
		if (!VM_PAGE_PAGEABLE(page)) {
			vm_page_activate(page);
		}
		vm_page_unlock_queues();
	}
	vm_fault_cleanup(object, top_page);
}
static void
vm_fault_copy_dst_cleanup(
	vm_page_t       page)
{
	vm_object_t     object;

	if (page != VM_PAGE_NULL) {
		object = VM_PAGE_OBJECT(page);
		vm_object_lock(object);
		vm_page_lockspin_queues();
		vm_page_unwire(page, TRUE);
		vm_page_unlock_queues();
		vm_object_paging_end(object);
		vm_object_unlock(object);
	}
}
/*
 *	Routine:	vm_fault_copy
 *
 *	Purpose:
 *		Copy pages from one virtual memory object to another --
 *		neither the source nor destination pages need be resident.
 *
 *		Before actually copying a page, the version associated with
 *		the destination address map will be verified.
 *
 *	In/out conditions:
 *		The caller must hold a reference, but not a lock, to
 *		each of the source and destination objects and to the
 *		destination map.
 *
 *	Results:
 *		Returns KERN_SUCCESS if no errors were encountered in
 *		reading or writing the data.  Returns KERN_INTERRUPTED if
 *		the operation was interrupted (only possible if the
 *		"interruptible" argument is asserted).  Other return values
 *		indicate a permanent error in copying the data.
 *
 *		The actual amount of data copied will be returned in the
 *		"copy_size" argument.  In the event that the destination map
 *		verification failed, this amount may be less than the amount
 *		requested.
 */
kern_return_t
vm_fault_copy(
	vm_object_t             src_object,
	vm_object_offset_t      src_offset,
	vm_map_size_t           *copy_size,             /* INOUT */
	vm_object_t             dst_object,
	vm_object_offset_t      dst_offset,
	vm_map_t                dst_map,
	vm_map_version_t        *dst_version,
	int                     interruptible)
{
	vm_page_t               result_page;

	vm_page_t               src_page;
	vm_page_t               src_top_page;
	vm_prot_t               src_prot;

	vm_page_t               dst_page;
	vm_page_t               dst_top_page;
	vm_prot_t               dst_prot;

	vm_map_size_t           amount_left;
	vm_object_t             old_copy_object;
	vm_object_t             result_page_object = NULL;
	kern_return_t           error = 0;
	vm_fault_return_t       result;

	vm_map_size_t           part_size;
	struct vm_object_fault_info fault_info_src = {};
	struct vm_object_fault_info fault_info_dst = {};
	/*
	 *	In order not to confuse the clustered pageins, align
	 *	the different offsets on a page boundary.
	 */

#define RETURN(x)                                       \
	MACRO_BEGIN                                     \
	*copy_size -= amount_left;                      \
	MACRO_RETURN(x);                                \
	MACRO_END

	amount_left = *copy_size;

	fault_info_src.interruptible = interruptible;
	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
	fault_info_src.stealth = TRUE;

	fault_info_dst.interruptible = interruptible;
	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
	fault_info_dst.stealth = TRUE;
	do { /* while (amount_left > 0) */
		/*
		 * There may be a deadlock if both source and destination
		 * pages are the same. To avoid this deadlock, the copy must
		 * start by getting the destination page in order to apply
		 * COW semantics if any.
		 */

RetryDestinationFault:;

		dst_prot = VM_PROT_WRITE | VM_PROT_READ;

		vm_object_lock(dst_object);
		vm_object_paging_begin(dst_object);
		/* cap cluster size at maximum UPL size */
		upl_size_t cluster_size;
		if (os_convert_overflow(amount_left, &cluster_size)) {
			cluster_size = 0 - (upl_size_t)PAGE_SIZE;
		}
		fault_info_dst.cluster_size = cluster_size;
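
		/*
		 * Illustrative sketch (not part of the original source): the
		 * cap above simply detects that the 64-bit amount_left does
		 * not fit in a 32-bit upl_size_t and, if so, substitutes the
		 * largest page-aligned 32-bit value.  In stand-alone form:
		 *
		 *	uint64_t amount = ...;
		 *	uint32_t cluster;
		 *	if (amount > UINT32_MAX) {
		 *		// 0 - PAGE_SIZE wraps to the largest
		 *		// page-aligned 32-bit value
		 *		cluster = 0u - (uint32_t)PAGE_SIZE;
		 *	} else {
		 *		cluster = (uint32_t)amount;
		 *	}
		 */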
		dst_page = VM_PAGE_NULL;
		result = vm_fault_page(dst_object,
		    vm_object_trunc_page(dst_offset),
		    VM_PROT_WRITE | VM_PROT_READ,
		    FALSE,
		    FALSE, /* page not looked up */
		    &dst_prot, &dst_page, &dst_top_page,
		    (int *)0,
		    &error,
		    dst_map->no_zero_fill,
		    FALSE, &fault_info_dst);
		switch (result) {
		case VM_FAULT_SUCCESS:
			break;
		case VM_FAULT_RETRY:
			goto RetryDestinationFault;
		case VM_FAULT_MEMORY_SHORTAGE:
			if (vm_page_wait(interruptible)) {
				goto RetryDestinationFault;
			}
			/* fall thru */
		case VM_FAULT_INTERRUPTED:
			RETURN(MACH_SEND_INTERRUPTED);
		case VM_FAULT_SUCCESS_NO_VM_PAGE:
			/* success but no VM page: fail the copy */
			vm_object_paging_end(dst_object);
			vm_object_unlock(dst_object);
			/* fall thru */
		case VM_FAULT_MEMORY_ERROR:
			if (error) {
				return error;
			} else {
				return KERN_MEMORY_ERROR;
			}
		default:
			panic("vm_fault_copy: unexpected error 0x%x from "
			    "vm_fault_page()\n", result);
		}
		assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);

		assert(dst_object == VM_PAGE_OBJECT(dst_page));
		old_copy_object = dst_object->copy;

		/*
		 * There exists the possibility that the source and
		 * destination page are the same.  But we can't
		 * easily determine that now.  If they are the
		 * same, the call to vm_fault_page() for the
		 * destination page will deadlock.  To prevent this we
		 * wire the page so we can drop busy without having
		 * the page daemon steal the page.  We clean up the
		 * top page but keep the paging reference on the object
		 * holding the dest page so it doesn't go away.
		 */

		vm_page_lockspin_queues();
		vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
		vm_page_unlock_queues();
		PAGE_WAKEUP_DONE(dst_page);
		vm_object_unlock(dst_object);

		if (dst_top_page != VM_PAGE_NULL) {
			vm_object_lock(dst_object);
			VM_PAGE_FREE(dst_top_page);
			vm_object_paging_end(dst_object);
			vm_object_unlock(dst_object);
		}
RetrySourceFault:;

		if (src_object == VM_OBJECT_NULL) {
			/*
			 * No source object.  We will just
			 * zero-fill the page in dst_object.
			 */
			src_page = VM_PAGE_NULL;
			result_page = VM_PAGE_NULL;
		} else {
			vm_object_lock(src_object);
			src_page = vm_page_lookup(src_object,
			    vm_object_trunc_page(src_offset));
			if (src_page == dst_page) {
				src_prot = dst_prot;
				result_page = VM_PAGE_NULL;
			} else {
				src_prot = VM_PROT_READ;
				vm_object_paging_begin(src_object);
				/* cap cluster size at maximum UPL size */
				if (os_convert_overflow(amount_left, &cluster_size)) {
					cluster_size = 0 - (upl_size_t)PAGE_SIZE;
				}
				fault_info_src.cluster_size = cluster_size;
				result_page = VM_PAGE_NULL;
				result = vm_fault_page(
					src_object,
					vm_object_trunc_page(src_offset),
					VM_PROT_READ, FALSE,
					FALSE, /* page not looked up */
					&src_prot,
					&result_page, &src_top_page,
					(int *)0, &error, FALSE,
					FALSE, &fault_info_src);
				switch (result) {
				case VM_FAULT_SUCCESS:
					break;
				case VM_FAULT_RETRY:
					goto RetrySourceFault;
				case VM_FAULT_MEMORY_SHORTAGE:
					if (vm_page_wait(interruptible)) {
						goto RetrySourceFault;
					}
					/* fall thru */
				case VM_FAULT_INTERRUPTED:
					vm_fault_copy_dst_cleanup(dst_page);
					RETURN(MACH_SEND_INTERRUPTED);
				case VM_FAULT_SUCCESS_NO_VM_PAGE:
					/* success but no VM page: fail */
					vm_object_paging_end(src_object);
					vm_object_unlock(src_object);
					/* fall thru */
				case VM_FAULT_MEMORY_ERROR:
					vm_fault_copy_dst_cleanup(dst_page);
					if (error) {
						return error;
					} else {
						return KERN_MEMORY_ERROR;
					}
				default:
					panic("vm_fault_copy(2): unexpected "
					    "error 0x%x from "
					    "vm_fault_page()\n", result);
				}
				result_page_object = VM_PAGE_OBJECT(result_page);
				assert((src_top_page == VM_PAGE_NULL) ==
				    (result_page_object == src_object));
			}
			assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
			vm_object_unlock(result_page_object);
		}
		vm_map_lock_read(dst_map);

		if (!vm_map_verify(dst_map, dst_version)) {
			vm_map_unlock_read(dst_map);
			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
				vm_fault_copy_cleanup(result_page, src_top_page);
			}
			vm_fault_copy_dst_cleanup(dst_page);
			break;
		}
		assert(dst_object == VM_PAGE_OBJECT(dst_page));
		vm_object_lock(dst_object);

		if (dst_object->copy != old_copy_object) {
			vm_object_unlock(dst_object);
			vm_map_unlock_read(dst_map);
			if (result_page != VM_PAGE_NULL && src_page != dst_page) {
				vm_fault_copy_cleanup(result_page, src_top_page);
			}
			vm_fault_copy_dst_cleanup(dst_page);
			break;
		}
		vm_object_unlock(dst_object);
		/*
		 *	Copy the page, and note that it is dirty
		 *	immediately.
		 */

		if (!page_aligned(src_offset) ||
		    !page_aligned(dst_offset) ||
		    !page_aligned(amount_left)) {
			vm_object_offset_t      src_po,
			    dst_po;

			src_po = src_offset - vm_object_trunc_page(src_offset);
			dst_po = dst_offset - vm_object_trunc_page(dst_offset);

			if (dst_po > src_po) {
				part_size = PAGE_SIZE - dst_po;
			} else {
				part_size = PAGE_SIZE - src_po;
			}
			if (part_size > (amount_left)) {
				part_size = amount_left;
			}

			if (result_page == VM_PAGE_NULL) {
				assert((vm_offset_t) dst_po == dst_po);
				assert((vm_size_t) part_size == part_size);
				vm_page_part_zero_fill(dst_page,
				    (vm_offset_t) dst_po,
				    (vm_size_t) part_size);
			} else {
				assert((vm_offset_t) src_po == src_po);
				assert((vm_offset_t) dst_po == dst_po);
				assert((vm_size_t) part_size == part_size);
				vm_page_part_copy(result_page,
				    (vm_offset_t) src_po,
				    dst_page,
				    (vm_offset_t) dst_po,
				    (vm_size_t) part_size);

				if (!dst_page->vmp_dirty) {
					vm_object_lock(dst_object);
					SET_PAGE_DIRTY(dst_page, TRUE);
					vm_object_unlock(dst_object);
				}
			}
		} else {
			part_size = PAGE_SIZE;

			if (result_page == VM_PAGE_NULL) {
				vm_page_zero_fill(dst_page);
			} else {
				vm_object_lock(result_page_object);
				vm_page_copy(result_page, dst_page);
				vm_object_unlock(result_page_object);

				if (!dst_page->vmp_dirty) {
					vm_object_lock(dst_object);
					SET_PAGE_DIRTY(dst_page, TRUE);
					vm_object_unlock(dst_object);
				}
			}
		}
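
		/*
		 * Illustrative sketch (not part of the original source): for
		 * the unaligned case above, the bytes copied this iteration
		 * are limited by whichever page offset is larger and by the
		 * bytes remaining, i.e.
		 *
		 *	src_po = src_offset % PAGE_SIZE;
		 *	dst_po = dst_offset % PAGE_SIZE;
		 *	part   = PAGE_SIZE - ((dst_po > src_po) ? dst_po : src_po);
		 *	if (part > amount_left) {
		 *		part = amount_left;
		 *	}
		 *
		 * e.g., assuming PAGE_SIZE = 0x1000, src_po = 0x200 and
		 * dst_po = 0xe00 give part = 0x200: the copy stops at the
		 * destination page boundary.
		 */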
		/*
		 *	Unlock everything, and return
		 */

		vm_map_unlock_read(dst_map);

		if (result_page != VM_PAGE_NULL && src_page != dst_page) {
			vm_fault_copy_cleanup(result_page, src_top_page);
		}
		vm_fault_copy_dst_cleanup(dst_page);

		amount_left -= part_size;
		src_offset += part_size;
		dst_offset += part_size;
	} while (amount_left > 0);

	RETURN(KERN_SUCCESS);
#undef  RETURN
	/*NOTREACHED*/
}
#if VM_FAULT_CLASSIFY
/*
 *	Temporary statistics gathering support.
 */

/*
 *	Statistics arrays:
 */
#define VM_FAULT_TYPES_MAX      5
#define VM_FAULT_LEVEL_MAX      8

int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];

#define VM_FAULT_TYPE_ZERO_FILL 0
#define VM_FAULT_TYPE_MAP_IN    1
#define VM_FAULT_TYPE_PAGER     2
#define VM_FAULT_TYPE_COPY      3
#define VM_FAULT_TYPE_OTHER     4
void
vm_fault_classify(vm_object_t object,
    vm_object_offset_t offset,
    vm_prot_t fault_type)
{
	int             type, level = 0;
	vm_page_t       m;

	while (TRUE) {
		m = vm_page_lookup(object, offset);
		if (m != VM_PAGE_NULL) {
			if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
				type = VM_FAULT_TYPE_OTHER;
				break;
			}
			if (((fault_type & VM_PROT_WRITE) == 0) ||
			    ((level == 0) && object->copy == VM_OBJECT_NULL)) {
				type = VM_FAULT_TYPE_MAP_IN;
				break;
			}
			type = VM_FAULT_TYPE_COPY;
			break;
		} else {
			if (object->pager_created) {
				type = VM_FAULT_TYPE_PAGER;
				break;
			}
			if (object->shadow == VM_OBJECT_NULL) {
				type = VM_FAULT_TYPE_ZERO_FILL;
				break;
			}

			offset += object->vo_shadow_offset;
			object = object->shadow;
			level++;
			continue;
		}
	}

	if (level > VM_FAULT_LEVEL_MAX) {
		level = VM_FAULT_LEVEL_MAX;
	}

	vm_fault_stats[type][level] += 1;
}

/* cleanup routine to call from debugger */
void
vm_fault_classify_init(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			vm_fault_stats[type][level] = 0;
		}
	}

	return;
}
#endif  /* VM_FAULT_CLASSIFY */
vm_offset_t
kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
{
	vm_map_entry_t  entry;
	vm_object_t     object;
	vm_offset_t     object_offset;
	vm_page_t       m;
	int             compressor_external_state, compressed_count_delta;
	int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
	int             my_fault_type = VM_PROT_READ;
	kern_return_t   kr;
	int             effective_page_mask, effective_page_size;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(map);
		effective_page_size = VM_MAP_PAGE_SIZE(map);
	} else {
		effective_page_mask = PAGE_MASK;
		effective_page_size = PAGE_SIZE;
	}
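
	/*
	 * Illustrative sketch (not part of the original source): the mask
	 * chosen above is (page size - 1), so a page-aligned address has no
	 * bits set under it.  In stand-alone form the alignment test used
	 * below is simply:
	 *
	 *	int page_aligned_addr(uint64_t addr, uint64_t page_size)
	 *	{
	 *		// page_size is assumed to be a power of two
	 *		return (addr & (page_size - 1)) == 0;
	 *	}
	 */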
	if (not_in_kdp) {
		panic("kdp_lightweight_fault called from outside of debugger context");
	}

	assert(map != VM_MAP_NULL);

	assert((cur_target_addr & effective_page_mask) == 0);
	if ((cur_target_addr & effective_page_mask) != 0) {
		return 0;
	}

	if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
		return 0;
	}

	if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
		return 0;
	}

	if (entry->is_sub_map) {
		return 0;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		return 0;
	}

	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);

	while (TRUE) {
		if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
			return 0;
		}

		if (object->pager_created && (object->paging_in_progress ||
		    object->activity_in_progress)) {
			return 0;
		}

		m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));

		if (m != VM_PAGE_NULL) {
			if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
				return 0;
			}

			if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
			    m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
				return 0;
			}

			assert(!m->vmp_private);
			if (m->vmp_private) {
				return 0;
			}

			assert(!m->vmp_fictitious);
			if (m->vmp_fictitious) {
				return 0;
			}

			assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
			if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
				return 0;
			}

			return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
		}
		compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;

		if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
			if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
				kr = vm_compressor_pager_get(object->pager,
				    vm_object_trunc_page(object_offset + object->paging_offset),
				    kdp_compressor_decompressed_page_ppnum, &my_fault_type,
				    compressor_flags, &compressed_count_delta);
				if (kr == KERN_SUCCESS) {
					return kdp_compressor_decompressed_page_paddr;
				} else {
					return 0;
				}
			}
		}

		if (object->shadow == VM_OBJECT_NULL) {
			return 0;
		}

		object_offset += object->vo_shadow_offset;
		object = object->shadow;
	}
}
/*
 * vm_page_validate_cs_fast():
 * Performs a few quick checks to determine if the page's code signature
 * really needs to be fully validated.  It could:
 *	1. have been modified (i.e. automatically tainted),
 *	2. have already been validated,
 *	3. have already been found to be tainted,
 *	4. no longer have a backing store.
 * Returns FALSE if the page needs to be fully validated.
 */
static boolean_t
vm_page_validate_cs_fast(
	vm_page_t       page,
	vm_map_size_t   fault_page_size,
	vm_map_offset_t fault_phys_offset)
{
	vm_object_t     object;
	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_held(object);

	if (page->vmp_wpmapped &&
	    !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
		/*
		 * This page was mapped for "write" access sometime in the
		 * past and could still be modifiable in the future.
		 * Consider it tainted.
		 * [ If the page was already found to be "tainted", no
		 * need to re-validate. ]
		 */
		vm_object_lock_assert_exclusive(object);
		VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
		VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
		if (cs_debug) {
			printf("CODESIGNING: %s: "
			    "page %p obj %p off 0x%llx "
			    "was modified\n",
			    __FUNCTION__,
			    page, object, page->vmp_offset);
		}
		vm_cs_validated_dirtied++;
	}

	if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
	    VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
		return TRUE;
	}
	vm_object_lock_assert_exclusive(object);
#if CHECK_CS_VALIDATION_BITMAP
	kern_return_t kr;

	kr = vnode_pager_cs_check_validation_bitmap(
		object->pager,
		page->vmp_offset + object->paging_offset,
		CS_BITMAP_CHECK);
	if (kr == KERN_SUCCESS) {
		page->vmp_cs_validated = VMP_CS_ALL_TRUE;
		page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
		vm_cs_bitmap_validated++;
		return TRUE;
	}
#endif /* CHECK_CS_VALIDATION_BITMAP */

	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is terminating and we don't have its pager
		 * so we can't validate the data...
		 */
		return TRUE;
	}

	/* we need to really validate this page */
	vm_object_lock_assert_exclusive(object);
	return FALSE;
}
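
/*
 * Illustrative sketch (not part of the original source): the fast path
 * above reduces to a small decision over per-page state bits.  A
 * simplified, stand-alone version of that decision (ignoring the bitmap
 * shortcut and sub-page granularity) might read:
 */
static __attribute__((unused)) int
example_cs_needs_full_validation(int wpmapped, int validated, int tainted,
    int object_alive_with_pager)
{
	/* a page that was ever writable-mapped is treated as tainted */
	if (wpmapped) {
		tainted = 1;
		validated = 1;
	}
	/* already classified, or no backing store left: nothing to do */
	if (validated || tainted || !object_alive_with_pager) {
		return 0;
	}
	/* otherwise the caller must run the full validation */
	return 1;
}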
static void
vm_page_validate_cs_mapped_slow(
	vm_page_t       page,
	const void      *kaddr)
{
	vm_object_t             object;
	memory_object_offset_t  mo_offset;
	memory_object_t         pager;
	struct vnode            *vnode;
	int                     validated, tainted, nx;

	assert(page->vmp_busy);
	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_exclusive(object);

	/*
	 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is all setup and ready
	 * by now.
	 */
	assert(object->code_signed);
	assert(!object->internal);
	assert(object->pager != NULL);
	assert(object->pager_ready);

	pager = object->pager;
	assert(object->paging_in_progress);
	vnode = vnode_pager_lookup_vnode(pager);
	mo_offset = page->vmp_offset + object->paging_offset;

	/* verify the SHA1 hash for this page */
	validated = 0;
	tainted = 0;
	nx = 0;
	cs_validate_page(vnode,
	    pager,
	    mo_offset,
	    (const void *)((const char *)kaddr),
	    &validated,
	    &tainted,
	    &nx);

	page->vmp_cs_validated |= validated;
	page->vmp_cs_tainted |= tainted;
	page->vmp_cs_nx |= nx;

#if CHECK_CS_VALIDATION_BITMAP
	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
	    page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
		vnode_pager_cs_check_validation_bitmap(object->pager,
		    mo_offset,
		    CS_BITMAP_SET);
	}
#endif /* CHECK_CS_VALIDATION_BITMAP */
}
void
vm_page_validate_cs_mapped(
	vm_page_t       page,
	vm_map_size_t   fault_page_size,
	vm_map_offset_t fault_phys_offset,
	const void      *kaddr)
{
	if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
		vm_page_validate_cs_mapped_slow(page, kaddr);
	}
}
void
vm_page_validate_cs(
	vm_page_t               page,
	vm_map_size_t           fault_page_size,
	vm_map_offset_t         fault_phys_offset)
{
	vm_object_t             object;
	vm_object_offset_t      offset;
	vm_map_offset_t         koffset;
	vm_map_size_t           ksize;
	vm_offset_t             kaddr;
	kern_return_t           kr;
	boolean_t               busy_page;
	boolean_t               need_unmap;

	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_held(object);

	if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
		return;
	}
	vm_object_lock_assert_exclusive(object);

	assert(object->code_signed);
	offset = page->vmp_offset;

	busy_page = page->vmp_busy;
	if (!busy_page) {
		/* keep page busy while we map (and unlock) the VM object */
		page->vmp_busy = TRUE;
	}

	/*
	 * Take a paging reference on the VM object
	 * to protect it from collapse or bypass,
	 * and keep it from disappearing too.
	 */
	vm_object_paging_begin(object);

	/* map the page in the kernel address space */
	ksize = PAGE_SIZE_64;
	koffset = 0;
	need_unmap = FALSE;
	kr = vm_paging_map_object(page,
	    object,
	    offset,
	    VM_PROT_READ,
	    FALSE, /* can't unlock object ! */
	    &ksize,
	    &koffset,
	    &need_unmap);
	if (kr != KERN_SUCCESS) {
		panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
	}
	kaddr = CAST_DOWN(vm_offset_t, koffset);

	/* validate the mapped page */
	vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);

	assert(page->vmp_busy);
	assert(object == VM_PAGE_OBJECT(page));
	vm_object_lock_assert_exclusive(object);

	if (!busy_page) {
		PAGE_WAKEUP_DONE(page);
	}
	if (need_unmap) {
		/* unmap the map from the kernel address space */
		vm_paging_unmap_object(object, koffset, koffset + ksize);
		koffset = 0;
		ksize = 0;
		kaddr = 0;
	}
	vm_object_paging_end(object);
}
void
vm_page_validate_cs_mapped_chunk(
	vm_page_t       page,
	const void      *kaddr,
	vm_offset_t     chunk_offset,
	vm_size_t       chunk_size,
	boolean_t       *validated_p,
	unsigned        *tainted_p)
{
	vm_object_t             object;
	vm_object_offset_t      offset, offset_in_page;
	memory_object_t         pager;
	struct vnode            *vnode;
	boolean_t               validated;
	unsigned                tainted;

	*validated_p = FALSE;
	*tainted_p = 0;

	assert(page->vmp_busy);
	object = VM_PAGE_OBJECT(page);
	vm_object_lock_assert_exclusive(object);

	assert(object->code_signed);
	offset = page->vmp_offset;

	if (!object->alive || object->terminating || object->pager == NULL) {
		/*
		 * The object is terminating and we don't have its pager
		 * so we can't validate the data...
		 */
		return;
	}
	/*
	 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is all setup and ready
	 * by now.
	 */
	assert(!object->internal);
	assert(object->pager != NULL);
	assert(object->pager_ready);

	pager = object->pager;
	assert(object->paging_in_progress);
	vnode = vnode_pager_lookup_vnode(pager);

	/* verify the signature for this chunk */
	offset_in_page = chunk_offset;
	assert(offset_in_page < PAGE_SIZE);

	tainted = 0;
	validated = cs_validate_range(vnode,
	    pager,
	    (object->paging_offset +
	    offset +
	    offset_in_page),
	    (const void *)((const char *)kaddr
	    + offset_in_page),
	    chunk_size,
	    &tainted);
	if (validated) {
		*validated_p = TRUE;
	}
	if (tainted) {
		*tainted_p = tainted;
	}
}
static void
vm_rtfrecord_lock(void)
{
	lck_spin_lock(&vm_rtfr_slock);
}

static void
vm_rtfrecord_unlock(void)
{
	lck_spin_unlock(&vm_rtfr_slock);
}

uint64_t
vmrtfaultinfo_bufsz(void)
{
	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
}
#include <kern/backtrace.h>

__attribute__((noinline))
static void
vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
{
	uint64_t fend = mach_continuous_time();

	uint64_t cfpc = 0;
	uint64_t ctid = cthread->thread_id;
	uint64_t cupid = get_current_unique_pid();

	uintptr_t bpc = 0;
	int btr = 0;
	bool u64 = false;

	/* Capture a single-frame backtrace; this extracts just the program
	 * counter at the point of the fault into "bpc", and should perform no
	 * further user stack traversals, thus avoiding copyin()s and further
	 * faults.
	 */
	unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL, false);

	if ((btr == 0) && (bfrs > 0)) {
		cfpc = bpc;
	}

	assert((fstart != 0) && fend >= fstart);
	vm_rtfrecord_lock();
	assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
	vmrtfrs.vmrtf_total++;
	vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];

	cvmr->rtfabstime = fstart;
	cvmr->rtfduration = fend - fstart;
	cvmr->rtfaddr = fault_vaddr;
	cvmr->rtfpc = cfpc;
	cvmr->rtftype = type_of_fault;
	cvmr->rtfupid = cupid;
	cvmr->rtftid = ctid;

	if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
		vmrtfrs.vmrtfr_curi = 0;
	}

	vm_rtfrecord_unlock();
}
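
/*
 * Illustrative sketch (not part of the original source): the record store
 * above is a fixed-size ring buffer whose write index wraps to zero after
 * it passes the last slot, so the oldest entries are overwritten first.
 * A minimal stand-alone version of that pattern, with an assumed record
 * type and capacity:
 */
#define EXAMPLE_RTF_SLOTS 64                    /* assumed capacity */

struct example_rtf_record {
	unsigned long long start;               /* fault start timestamp */
	unsigned long long duration;            /* fault duration */
};

static struct example_rtf_record example_rtf_ring[EXAMPLE_RTF_SLOTS];
static unsigned int example_rtf_cur;            /* next slot to overwrite */

static __attribute__((unused)) void
example_rtf_record_store(unsigned long long start, unsigned long long end)
{
	struct example_rtf_record *slot = &example_rtf_ring[example_rtf_cur++];

	slot->start = start;
	slot->duration = end - start;

	/* wrap: oldest entries are overwritten once the buffer is full */
	if (example_rtf_cur >= EXAMPLE_RTF_SLOTS) {
		example_rtf_cur = 0;
	}
}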
int
vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
{
	vm_rtfault_record_t *cvmrd = vrecords;
	size_t residue = vrecordsz;
	size_t numextracted = 0;
	boolean_t early_exit = FALSE;

	vm_rtfrecord_lock();

	for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
		if (residue < sizeof(vm_rtfault_record_t)) {
			early_exit = TRUE;
			break;
		}

		if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
#if DEVELOPMENT || DEBUG
			if (isroot == FALSE) {
				continue;
			}
#else
			continue;
#endif /* DEVDEBUG */
		}

		*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
		cvmrd++;
		residue -= sizeof(vm_rtfault_record_t);
		numextracted++;
	}

	vm_rtfrecord_unlock();

	*vmrtfrv = numextracted;
	return early_exit;
}