1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <libkern/OSAtomic.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/kern_return.h>
71 #include <mach/message.h> /* for error codes */
72 #include <mach/vm_param.h>
73 #include <mach/vm_behavior.h>
74 #include <mach/memory_object.h>
75 /* For memory_object_data_{request,unlock} */
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/host_statistics.h>
80 #include <kern/counters.h>
81 #include <kern/task.h>
82 #include <kern/thread.h>
83 #include <kern/sched_prim.h>
84 #include <kern/host.h>
85 #include <kern/mach_param.h>
86 #include <kern/macro_help.h>
87 #include <kern/zalloc.h>
88 #include <kern/misc_protos.h>
89 #include <kern/policy_internal.h>
90
91 #include <vm/vm_compressor.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_fault.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_kern.h>
98 #include <vm/pmap.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/vm_protos.h>
101 #include <vm/vm_external.h>
102 #include <vm/memory_object.h>
103 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
104 #include <vm/vm_shared_region.h>
105
106 #include <sys/codesign.h>
107 #include <sys/reason.h>
108 #include <sys/signalvar.h>
109
110 #include <san/kasan.h>
111
112 #define VM_FAULT_CLASSIFY 0
113
114 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
115
116 int vm_protect_privileged_from_untrusted = 1;
117
118 unsigned int vm_object_pagein_throttle = 16;
119
120 /*
121 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control; it
122 * kicks in when swap space runs out. 64-bit programs have massive address spaces and can leak enormous amounts
123 * of memory if they're buggy and can run the system completely out of swap space. If this happens, we
124 * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
125 * keep the UI active so that the user has a chance to kill the offending task before the system
126 * completely hangs.
127 *
128 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
129 * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
130 * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
131 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
132 */
133
134 extern void throttle_lowpri_io(int);
135
136 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
137
138 uint64_t vm_hard_throttle_threshold;
139
140
141 OS_ALWAYS_INLINE
142 boolean_t
143 NEED_TO_HARD_THROTTLE_THIS_TASK(void)
144 {
145 return vm_wants_task_throttled(current_task()) ||
146 ((vm_page_free_count < vm_page_throttle_limit ||
147 HARD_THROTTLE_LIMIT_REACHED()) &&
148 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
149 }
150
151 #define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */
152 #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */
153
154 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6
155 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000
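/*
 * Illustrative usage (a sketch that mirrors the pattern used by
 * vm_fault_check() further down in this file, not an additional code
 * path): a zero-fill path asks vm_page_throttled() for a delay and,
 * if one is returned, sleeps for HARD_THROTTLE_DELAY or
 * SOFT_THROTTLE_DELAY microseconds before the fault is retried.
 *
 *	int throttle_delay;
 *
 *	if ((throttle_delay = vm_page_throttled(FALSE))) {
 *		delay(throttle_delay);
 *		return VM_FAULT_MEMORY_SHORTAGE;
 *	}
 */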
156
157
158 #define VM_STAT_DECOMPRESSIONS() \
159 MACRO_BEGIN \
160 VM_STAT_INCR(decompressions); \
161 current_thread()->decompressions++; \
162 MACRO_END
163
164 boolean_t current_thread_aborted(void);
165
166 /* Forward declarations of internal routines. */
167 static kern_return_t vm_fault_wire_fast(
168 vm_map_t map,
169 vm_map_offset_t va,
170 vm_prot_t prot,
171 vm_tag_t wire_tag,
172 vm_map_entry_t entry,
173 pmap_t pmap,
174 vm_map_offset_t pmap_addr,
175 ppnum_t *physpage_p);
176
177 static kern_return_t vm_fault_internal(
178 vm_map_t map,
179 vm_map_offset_t vaddr,
180 vm_prot_t caller_prot,
181 boolean_t change_wiring,
182 vm_tag_t wire_tag,
183 int interruptible,
184 pmap_t pmap,
185 vm_map_offset_t pmap_addr,
186 ppnum_t *physpage_p);
187
188 static void vm_fault_copy_cleanup(
189 vm_page_t page,
190 vm_page_t top_page);
191
192 static void vm_fault_copy_dst_cleanup(
193 vm_page_t page);
194
195 #if VM_FAULT_CLASSIFY
196 extern void vm_fault_classify(vm_object_t object,
197 vm_object_offset_t offset,
198 vm_prot_t fault_type);
199
200 extern void vm_fault_classify_init(void);
201 #endif
202
203 unsigned long vm_pmap_enter_blocked = 0;
204 unsigned long vm_pmap_enter_retried = 0;
205
206 unsigned long vm_cs_validates = 0;
207 unsigned long vm_cs_revalidates = 0;
208 unsigned long vm_cs_query_modified = 0;
209 unsigned long vm_cs_validated_dirtied = 0;
210 unsigned long vm_cs_bitmap_validated = 0;
211 #if PMAP_CS
212 uint64_t vm_cs_defer_to_pmap_cs = 0;
213 uint64_t vm_cs_defer_to_pmap_cs_not = 0;
214 #endif /* PMAP_CS */
215
216 void vm_pre_fault(vm_map_offset_t, vm_prot_t);
217
218 extern char *kdp_compressor_decompressed_page;
219 extern addr64_t kdp_compressor_decompressed_page_paddr;
220 extern ppnum_t kdp_compressor_decompressed_page_ppnum;
221
222 struct vmrtfr {
223 int vmrtfr_maxi;
224 int vmrtfr_curi;
225 int64_t vmrtf_total;
226 vm_rtfault_record_t *vm_rtf_records;
227 } vmrtfrs;
228 #define VMRTF_DEFAULT_BUFSIZE (4096)
229 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
230 TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
231
232 static void vm_rtfrecord_lock(void);
233 static void vm_rtfrecord_unlock(void);
234 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
235
236 extern lck_grp_t vm_page_lck_grp_bucket;
237 extern lck_attr_t vm_page_lck_attr;
238 LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
239
240 /*
241 * Routine: vm_fault_init
242 * Purpose:
243 * Initialize our private data structures.
244 */
245 __startup_func
246 void
247 vm_fault_init(void)
248 {
249 int i, vm_compressor_temp;
250 boolean_t need_default_val = TRUE;
251 /*
252 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
253 * computed as a percentage of available memory, and the percentage used is scaled inversely with
254 * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
255 * and reduce the value down to 10% for very large memory configurations. This helps give us a
256 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
257 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
258 */
259
260 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
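	/*
	 * Worked examples of the formula above (sane_size ~= installed RAM):
	 *    8 GB:  35 - MIN(8, 25)  = 27%  =>  threshold ~= 2.16 GB
	 *   16 GB:  35 - MIN(16, 25) = 19%  =>  threshold ~= 3.04 GB
	 *   64 GB:  35 - MIN(64, 25) = 10%  =>  threshold  =  6.4 GB
	 * so the percentage never drops below 10% on large configurations.
	 */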
261
262 /*
263 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
264 */
265
266 if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
267 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
268 if (((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
269 need_default_val = FALSE;
270 vm_compressor_mode = vm_compressor_temp;
271 break;
272 }
273 }
274 if (need_default_val) {
275 printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
276 }
277 }
278 if (need_default_val) {
279 /* If no boot arg or incorrect boot arg, try device tree. */
280 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
281 }
282 printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
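	/*
	 * For example (assuming the usual single-bit encoding of the
	 * compressor modes): "vm_compressor=4" has one bit set, passes the
	 * (value & (1 << i)) == value test above and is accepted, while
	 * "vm_compressor=5" has two bits set, is rejected, and the mode
	 * falls back to the device-tree / default value.
	 */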
283
284 PE_parse_boot_argn("vm_protect_privileged_from_untrusted",
285 &vm_protect_privileged_from_untrusted,
286 sizeof(vm_protect_privileged_from_untrusted));
287 }
288
289 __startup_func
290 static void
291 vm_rtfault_record_init(void)
292 {
293 size_t size;
294
295 vmrtf_num_records = MAX(vmrtf_num_records, 1);
296 size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
297 vmrtfrs.vm_rtf_records = zalloc_permanent(size,
298 ZALIGN(vm_rtfault_record_t));
299 vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
300 }
301 STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
302
303 /*
304 * Routine: vm_fault_cleanup
305 * Purpose:
306 * Clean up the result of vm_fault_page.
307 * Results:
308 * The paging reference for "object" is released.
309 * "object" is unlocked.
310 * If "top_page" is not null, "top_page" is
311 * freed and the paging reference for the object
312 * containing it is released.
313 *
314 * In/out conditions:
315 * "object" must be locked.
316 */
317 void
318 vm_fault_cleanup(
319 vm_object_t object,
320 vm_page_t top_page)
321 {
322 vm_object_paging_end(object);
323 vm_object_unlock(object);
324
325 if (top_page != VM_PAGE_NULL) {
326 object = VM_PAGE_OBJECT(top_page);
327
328 vm_object_lock(object);
329 VM_PAGE_FREE(top_page);
330 vm_object_paging_end(object);
331 vm_object_unlock(object);
332 }
333 }
334
335 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
336
337
338 boolean_t vm_page_deactivate_behind = TRUE;
339 /*
340 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
341 */
342 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
343 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
344 /* we use it to size an array on the stack */
345
346 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
347
348 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
349
350 /*
351 * vm_fault_is_sequential
352 *
353 * Determine if sequential access is in progress
354 * in accordance with the behavior specified.
355 * Update state to indicate current access pattern.
356 *
357 * object must have at least the shared lock held
358 */
359 static
360 void
361 vm_fault_is_sequential(
362 vm_object_t object,
363 vm_object_offset_t offset,
364 vm_behavior_t behavior)
365 {
366 vm_object_offset_t last_alloc;
367 int sequential;
368 int orig_sequential;
369
370 last_alloc = object->last_alloc;
371 sequential = object->sequential;
372 orig_sequential = sequential;
373
374 offset = vm_object_trunc_page(offset);
375 if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
376 /* re-faulting in the same page: no change in behavior */
377 return;
378 }
379
380 switch (behavior) {
381 case VM_BEHAVIOR_RANDOM:
382 /*
383 * reset indicator of sequential behavior
384 */
385 sequential = 0;
386 break;
387
388 case VM_BEHAVIOR_SEQUENTIAL:
389 if (offset && last_alloc == offset - PAGE_SIZE_64) {
390 /*
391 * advance indicator of sequential behavior
392 */
393 if (sequential < MAX_SEQUENTIAL_RUN) {
394 sequential += PAGE_SIZE;
395 }
396 } else {
397 /*
398 * reset indicator of sequential behavior
399 */
400 sequential = 0;
401 }
402 break;
403
404 case VM_BEHAVIOR_RSEQNTL:
405 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
406 /*
407 * advance indicator of sequential behavior
408 */
409 if (sequential > -MAX_SEQUENTIAL_RUN) {
410 sequential -= PAGE_SIZE;
411 }
412 } else {
413 /*
414 * reset indicator of sequential behavior
415 */
416 sequential = 0;
417 }
418 break;
419
420 case VM_BEHAVIOR_DEFAULT:
421 default:
422 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
423 /*
424 * advance indicator of sequential behavior
425 */
426 if (sequential < 0) {
427 sequential = 0;
428 }
429 if (sequential < MAX_SEQUENTIAL_RUN) {
430 sequential += PAGE_SIZE;
431 }
432 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
433 /*
434 * advance indicator of sequential behavior
435 */
436 if (sequential > 0) {
437 sequential = 0;
438 }
439 if (sequential > -MAX_SEQUENTIAL_RUN) {
440 sequential -= PAGE_SIZE;
441 }
442 } else {
443 /*
444 * reset indicator of sequential behavior
445 */
446 sequential = 0;
447 }
448 break;
449 }
450 if (sequential != orig_sequential) {
451 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
452 /*
453 * if someone else has already updated object->sequential
454 * don't bother trying to update it or object->last_alloc
455 */
456 return;
457 }
458 }
459 /*
460 * I'd like to do this with an OSCompareAndSwap64, but that
461 * doesn't exist for PPC... however, it shouldn't matter
462 * that much... last_alloc is maintained so that we can determine
463 * if a sequential access pattern is taking place... if only
464 * one thread is banging on this object, no problem with the unprotected
465 * update... if 2 or more threads are banging away, we run the risk of
466 * someone seeing a mangled update... however, in the face of multiple
467 * accesses, no sequential access pattern can develop anyway, so we
468 * haven't lost any real info.
469 */
470 object->last_alloc = offset;
471 }
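/*
 * Worked example (VM_BEHAVIOR_DEFAULT, 4K pages, freshly created object
 * with sequential == 0 and last_alloc == 0): faults at offsets 0x1000,
 * 0x2000 and 0x3000 each find last_alloc == offset - PAGE_SIZE_64, so
 * object->sequential grows by PAGE_SIZE per fault (0x1000, 0x2000,
 * 0x3000).  A later fault at an unrelated offset matches neither the
 * forward nor the reverse test and resets object->sequential to 0.
 */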
472
473
474 int vm_page_deactivate_behind_count = 0;
475
476 /*
477 * vm_fault_deactivate_behind
478 *
479 * Determine if sequential access is in progress
480 * in accordance with the behavior specified. If
481 * so, compute a potential page to deactivate and
482 * deactivate it.
483 *
484 * object must be locked.
485 *
486 * return TRUE if we actually deactivate a page
487 */
488 static
489 boolean_t
490 vm_fault_deactivate_behind(
491 vm_object_t object,
492 vm_object_offset_t offset,
493 vm_behavior_t behavior)
494 {
495 int n;
496 int pages_in_run = 0;
497 int max_pages_in_run = 0;
498 int sequential_run;
499 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
500 vm_object_offset_t run_offset = 0;
501 vm_object_offset_t pg_offset = 0;
502 vm_page_t m;
503 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
504
505 pages_in_run = 0;
506 #if TRACEFAULTPAGE
507 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
508 #endif
509 if (object == kernel_object || vm_page_deactivate_behind == FALSE || (vm_object_trunc_page(offset) != offset)) {
510 /*
511 * Do not deactivate pages if any of the following holds:
512 * - the object is the kernel object (its pages are not
513 *   intended to become pageable),
514 * - the deactivate-behind mechanism has been disabled, or
515 * - the offset is not aligned to the system's PAGE_SIZE; in
516 *   that case we handle the deactivation once, on the aligned
517 *   offset for the full PAGE_SIZE page, which avoids redundant
518 *   deactivations and extra faults.
519 */
520 return FALSE;
521 }
522 if ((sequential_run = object->sequential)) {
523 if (sequential_run < 0) {
524 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
525 sequential_run = 0 - sequential_run;
526 } else {
527 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
528 }
529 }
530 switch (behavior) {
531 case VM_BEHAVIOR_RANDOM:
532 break;
533 case VM_BEHAVIOR_SEQUENTIAL:
534 if (sequential_run >= (int)PAGE_SIZE) {
535 run_offset = 0 - PAGE_SIZE_64;
536 max_pages_in_run = 1;
537 }
538 break;
539 case VM_BEHAVIOR_RSEQNTL:
540 if (sequential_run >= (int)PAGE_SIZE) {
541 run_offset = PAGE_SIZE_64;
542 max_pages_in_run = 1;
543 }
544 break;
545 case VM_BEHAVIOR_DEFAULT:
546 default:
547 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
548
549 /*
550 * determine if the run of sequential access has been
551 * long enough on an object with default access behavior
552 * to consider it for deactivation
553 */
554 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
555 /*
556 * the comparisons between offset and behind are done
557 * in this kind of odd fashion in order to prevent wrap around
558 * at the end points
559 */
560 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
561 if (offset >= behind) {
562 run_offset = 0 - behind;
563 pg_offset = PAGE_SIZE_64;
564 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
565 }
566 } else {
567 if (offset < -behind) {
568 run_offset = behind;
569 pg_offset = 0 - PAGE_SIZE_64;
570 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
571 }
572 }
573 }
574 break;}
575 }
576 for (n = 0; n < max_pages_in_run; n++) {
577 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
578
579 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
580 page_run[pages_in_run++] = m;
581
582 /*
583 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
584 *
585 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
586 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
587 * new reference happens. If no further references happen on the page after that remote TLB flush,
588 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
589 * by pageout_scan, which is just fine since the last reference would have happened quite far
590 * in the past (TLB caches don't hang around for very long), and of course could just as easily
591 * have happened before we did the deactivate_behind.
592 */
593 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
594 }
595 }
596 if (pages_in_run) {
597 vm_page_lockspin_queues();
598
599 for (n = 0; n < pages_in_run; n++) {
600 m = page_run[n];
601
602 vm_page_deactivate_internal(m, FALSE);
603
604 vm_page_deactivate_behind_count++;
605 #if TRACEFAULTPAGE
606 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
607 #endif
608 }
609 vm_page_unlock_queues();
610
611 return TRUE;
612 }
613 return FALSE;
614 }
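/*
 * Worked example of the VM_BEHAVIOR_DEFAULT case above: with
 * vm_default_behind == 128 pages, nothing is deactivated until a forward
 * sequential run reaches 128 pages (512KB with 4K pages).  From then on,
 * each time the run grows by another VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER
 * (16) pages, the 16-page cluster trailing the fault offset by 128 pages
 * is deactivated, so a long streaming read continually pushes pages more
 * than roughly 128 pages behind the fault onto the inactive queue.
 */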
615
616
617 #if (DEVELOPMENT || DEBUG)
618 uint32_t vm_page_creation_throttled_hard = 0;
619 uint32_t vm_page_creation_throttled_soft = 0;
620 uint64_t vm_page_creation_throttle_avoided = 0;
621 #endif /* DEVELOPMENT || DEBUG */
622
623 static int
624 vm_page_throttled(boolean_t page_kept)
625 {
626 clock_sec_t elapsed_sec;
627 clock_sec_t tv_sec;
628 clock_usec_t tv_usec;
629
630 thread_t thread = current_thread();
631
632 if (thread->options & TH_OPT_VMPRIV) {
633 return 0;
634 }
635
636 if (thread->t_page_creation_throttled) {
637 thread->t_page_creation_throttled = 0;
638
639 if (page_kept == FALSE) {
640 goto no_throttle;
641 }
642 }
643 if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
644 #if (DEVELOPMENT || DEBUG)
645 thread->t_page_creation_throttled_hard++;
646 OSAddAtomic(1, &vm_page_creation_throttled_hard);
647 #endif /* DEVELOPMENT || DEBUG */
648 return HARD_THROTTLE_DELAY;
649 }
650
651 if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
652 thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
653 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
654 #if (DEVELOPMENT || DEBUG)
655 OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
656 #endif
657 goto no_throttle;
658 }
659 clock_get_system_microtime(&tv_sec, &tv_usec);
660
661 elapsed_sec = tv_sec - thread->t_page_creation_time;
662
663 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
664 (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
665 if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
666 /*
667 * we'll reset our stats to give a well behaved app
668 * that was unlucky enough to accumulate a bunch of pages
669 * over a long period of time a chance to get out of
670 * the throttled state... we reset the counter and timestamp
671 * so that if it stays under the rate limit for the next second
672 * it will be back in our good graces... if it exceeds it, it
673 * will remain in the throttled state
674 */
675 thread->t_page_creation_time = tv_sec;
676 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
677 }
678 VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
679
680 thread->t_page_creation_throttled = 1;
681
682 if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
683 #if (DEVELOPMENT || DEBUG)
684 thread->t_page_creation_throttled_hard++;
685 OSAddAtomic(1, &vm_page_creation_throttled_hard);
686 #endif /* DEVELOPMENT || DEBUG */
687 return HARD_THROTTLE_DELAY;
688 } else {
689 #if (DEVELOPMENT || DEBUG)
690 thread->t_page_creation_throttled_soft++;
691 OSAddAtomic(1, &vm_page_creation_throttled_soft);
692 #endif /* DEVELOPMENT || DEBUG */
693 return SOFT_THROTTLE_DELAY;
694 }
695 }
696 thread->t_page_creation_time = tv_sec;
697 thread->t_page_creation_count = 0;
698 }
699 no_throttle:
700 thread->t_page_creation_count++;
701
702 return 0;
703 }
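/*
 * Concrete numbers for the rate check above: a thread is considered for
 * throttling only when free pages are scarce (or the swapper needs
 * relief), it has created more than 6 s x 20,000 pages/s = 120,000 pages,
 * and it is still averaging 20,000+ new pages per second.  Once it stays
 * under that rate (or after roughly 3 periods, when its counters are
 * reset), it drops back out of the throttled state.
 */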
704
705
706 /*
707 * check for various conditions that would
708 * prevent us from creating a ZF page...
709 * cleanup is based on being called from vm_fault_page
710 *
711 * object must be locked
712 * object == m->vmp_object
713 */
714 static vm_fault_return_t
715 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
716 {
717 int throttle_delay;
718
719 if (object->shadow_severed ||
720 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
721 /*
722 * Either:
723 * 1. the shadow chain was severed,
724 * 2. the purgeable object is volatile or empty and is marked
725 * to fault on access while volatile.
726 * Just have to return an error at this point
727 */
728 if (m != VM_PAGE_NULL) {
729 VM_PAGE_FREE(m);
730 }
731 vm_fault_cleanup(object, first_m);
732
733 thread_interrupt_level(interruptible_state);
734
735 return VM_FAULT_MEMORY_ERROR;
736 }
737 if (page_throttle == TRUE) {
738 if ((throttle_delay = vm_page_throttled(FALSE))) {
739 /*
740 * we're throttling zero-fills...
741 * treat this as if we couldn't grab a page
742 */
743 if (m != VM_PAGE_NULL) {
744 VM_PAGE_FREE(m);
745 }
746 vm_fault_cleanup(object, first_m);
747
748 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
749
750 delay(throttle_delay);
751
752 if (current_thread_aborted()) {
753 thread_interrupt_level(interruptible_state);
754 return VM_FAULT_INTERRUPTED;
755 }
756 thread_interrupt_level(interruptible_state);
757
758 return VM_FAULT_MEMORY_SHORTAGE;
759 }
760 }
761 return VM_FAULT_SUCCESS;
762 }
763
764 /*
765 * Clear the code signing bits on the given page_t
766 */
767 static void
768 vm_fault_cs_clear(vm_page_t m)
769 {
770 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
771 m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
772 m->vmp_cs_nx = VMP_CS_ALL_FALSE;
773 }
774
775 /*
776 * Enqueues the given page on the throttled queue.
777 * The caller must hold the vm_page_queue_lock and it will be held on return.
778 */
779 static void
780 vm_fault_enqueue_throttled_locked(vm_page_t m)
781 {
782 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
783 assert(!VM_PAGE_WIRED(m));
784
785 /*
786 * can't be on the pageout queue since we don't
787 * have a pager to try and clean to
788 */
789 vm_page_queues_remove(m, TRUE);
790 vm_page_check_pageable_safe(m);
791 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
792 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
793 vm_page_throttled_count++;
794 }
795
796 /*
797 * do the work to zero fill a page and
798 * inject it into the correct paging queue
799 *
800 * m->vmp_object must be locked
801 * page queue lock must NOT be held
802 */
803 static int
804 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
805 {
806 int my_fault = DBG_ZERO_FILL_FAULT;
807 vm_object_t object;
808
809 object = VM_PAGE_OBJECT(m);
810
811 /*
812 * This is a zero-fill page fault...
813 *
814 * Checking the page lock is a waste of
815 * time; this page was absent, so
816 * it can't be page locked by a pager.
817 *
818 * we also consider it undefined
819 * with respect to instruction
820 * execution. i.e. it is the responsibility
821 * of higher layers to call for an instruction
822 * sync after changing the contents and before
823 * sending a program into this area. We
824 * choose this approach for performance
825 */
826 vm_fault_cs_clear(m);
827 m->vmp_pmapped = TRUE;
828
829 if (no_zero_fill == TRUE) {
830 my_fault = DBG_NZF_PAGE_FAULT;
831
832 if (m->vmp_absent && m->vmp_busy) {
833 return my_fault;
834 }
835 } else {
836 vm_page_zero_fill(m);
837
838 VM_STAT_INCR(zero_fill_count);
839 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
840 }
841 assert(!m->vmp_laundry);
842 assert(object != kernel_object);
843 //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
844 if (!VM_DYNAMIC_PAGING_ENABLED() &&
845 (object->purgable == VM_PURGABLE_DENY ||
846 object->purgable == VM_PURGABLE_NONVOLATILE ||
847 object->purgable == VM_PURGABLE_VOLATILE)) {
848 vm_page_lockspin_queues();
849 if (!VM_DYNAMIC_PAGING_ENABLED()) {
850 vm_fault_enqueue_throttled_locked(m);
851 }
852 vm_page_unlock_queues();
853 }
854 return my_fault;
855 }
856
857
858 /*
859 * Routine: vm_fault_page
860 * Purpose:
861 * Find the resident page for the virtual memory
862 * specified by the given virtual memory object
863 * and offset.
864 * Additional arguments:
865 * The required permissions for the page are given
866 * in "fault_type". Desired permissions are included
867 * in "protection".
868 * fault_info is passed along to determine pagein cluster
869 * limits... it contains the expected reference pattern,
870 * cluster size if available, etc...
871 *
872 * If the desired page is known to be resident (for
873 * example, because it was previously wired down), asserting
874 * the "unwiring" parameter will speed the search.
875 *
876 * If the operation can be interrupted (by thread_abort
877 * or thread_terminate), then the "interruptible"
878 * parameter should be asserted.
879 *
880 * Results:
881 * The page containing the proper data is returned
882 * in "result_page".
883 *
884 * In/out conditions:
885 * The source object must be locked and referenced,
886 * and must donate one paging reference. The reference
887 * is not affected. The paging reference and lock are
888 * consumed.
889 *
890 * If the call succeeds, the object in which "result_page"
891 * resides is left locked and holding a paging reference.
892 * If this is not the original object, a busy page in the
893 * original object is returned in "top_page", to prevent other
894 * callers from pursuing this same data, along with a paging
895 * reference for the original object. The "top_page" should
896 * be destroyed when this guarantee is no longer required.
897 * The "result_page" is also left busy. It is not removed
898 * from the pageout queues.
899 * Special Case:
900 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
901 * fault succeeded but there's no VM page (i.e. the VM object
902 * does not actually hold VM pages, but device memory or
903 * large pages). The object is still locked and we still hold a
904 * paging_in_progress reference.
905 */
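/*
 * Illustrative caller pattern (a sketch of the contract described above,
 * not lifted from a specific call site; "object", "offset", "prot", "m",
 * "top_page", "error_code" and "fault_info" stand for the caller's own
 * state):
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);		(donates the paging reference)
 *	prot = VM_PROT_READ;
 *	result = vm_fault_page(object, offset, VM_PROT_READ,
 *	    FALSE, FALSE, &prot, &m, &top_page, NULL,
 *	    &error_code, FALSE, FALSE, &fault_info);
 *	if (result == VM_FAULT_SUCCESS) {
 *		... consume the busy page "m"; callers typically also
 *		    move it to a paging queue, cf. RELEASE_PAGE() below ...
 *		PAGE_WAKEUP_DONE(m);
 *		vm_fault_cleanup(VM_PAGE_OBJECT(m), top_page);
 *	}
 *
 * On the error returns (VM_FAULT_RETRY, VM_FAULT_INTERRUPTED,
 * VM_FAULT_MEMORY_SHORTAGE, VM_FAULT_MEMORY_ERROR) the object lock and
 * paging reference have already been released via vm_fault_cleanup().
 */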
906 unsigned int vm_fault_page_blocked_access = 0;
907 unsigned int vm_fault_page_forced_retry = 0;
908
909 vm_fault_return_t
910 vm_fault_page(
911 /* Arguments: */
912 vm_object_t first_object, /* Object to begin search */
913 vm_object_offset_t first_offset, /* Offset into object */
914 vm_prot_t fault_type, /* What access is requested */
915 boolean_t must_be_resident,/* Must page be resident? */
916 boolean_t caller_lookup, /* caller looked up page */
917 /* Modifies in place: */
918 vm_prot_t *protection, /* Protection for mapping */
919 vm_page_t *result_page, /* Page found, if successful */
920 /* Returns: */
921 vm_page_t *top_page, /* Page in top object, if
922 * not result_page. */
923 int *type_of_fault, /* if non-null, fill in with type of fault
924 * COW, zero-fill, etc... returned in trace point */
925 /* More arguments: */
926 kern_return_t *error_code, /* code if page is in error */
927 boolean_t no_zero_fill, /* don't zero fill absent pages */
928 boolean_t data_supply, /* treat as data_supply if
929 * it is a write fault and a full
930 * page is provided */
931 vm_object_fault_info_t fault_info)
932 {
933 vm_page_t m;
934 vm_object_t object;
935 vm_object_offset_t offset;
936 vm_page_t first_m;
937 vm_object_t next_object;
938 vm_object_t copy_object;
939 boolean_t look_for_page;
940 boolean_t force_fault_retry = FALSE;
941 vm_prot_t access_required = fault_type;
942 vm_prot_t wants_copy_flag;
943 kern_return_t wait_result;
944 wait_interrupt_t interruptible_state;
945 boolean_t data_already_requested = FALSE;
946 vm_behavior_t orig_behavior;
947 vm_size_t orig_cluster_size;
948 vm_fault_return_t error;
949 int my_fault;
950 uint32_t try_failed_count;
951 int interruptible; /* how may the fault be interrupted? */
952 int external_state = VM_EXTERNAL_STATE_UNKNOWN;
953 memory_object_t pager;
954 vm_fault_return_t retval;
955 int grab_options;
956
957 /*
958 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
959 * marked as paged out in the compressor pager or the pager doesn't exist.
960 * Note also that if the pager for an internal object
961 * has not been created, the pager is not invoked regardless of the value
962 * of MUST_ASK_PAGER().
963 *
964 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
965 * is marked as paged out in the compressor pager.
966 * PAGED_OUT() is used to determine if a page has already been pushed
967 * into a copy object in order to avoid a redundant page out operation.
968 */
969 #define MUST_ASK_PAGER(o, f, s) \
970 ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
971
972 #define PAGED_OUT(o, f) \
973 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
974
975 /*
976 * Recovery actions
977 */
978 #define RELEASE_PAGE(m) \
979 MACRO_BEGIN \
980 PAGE_WAKEUP_DONE(m); \
981 if ( !VM_PAGE_PAGEABLE(m)) { \
982 vm_page_lockspin_queues(); \
983 if ( !VM_PAGE_PAGEABLE(m)) { \
984 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
985 vm_page_deactivate(m); \
986 else \
987 vm_page_activate(m); \
988 } \
989 vm_page_unlock_queues(); \
990 } \
991 MACRO_END
992
993 #if TRACEFAULTPAGE
994 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
995 #endif
996
997 interruptible = fault_info->interruptible;
998 interruptible_state = thread_interrupt_level(interruptible);
999
1000 /*
1001 * INVARIANTS (through entire routine):
1002 *
1003 * 1) At all times, we must either have the object
1004 * lock or a busy page in some object to prevent
1005 * some other thread from trying to bring in
1006 * the same page.
1007 *
1008 * Note that we cannot hold any locks during the
1009 * pager access or when waiting for memory, so
1010 * we use a busy page then.
1011 *
1012 * 2) To prevent another thread from racing us down the
1013 * shadow chain and entering a new page in the top
1014 * object before we do, we must keep a busy page in
1015 * the top object while following the shadow chain.
1016 *
1017 * 3) We must increment paging_in_progress on any object
1018 * for which we have a busy page before dropping
1019 * the object lock
1020 *
1021 * 4) We leave busy pages on the pageout queues.
1022 * If the pageout daemon comes across a busy page,
1023 * it will remove the page from the pageout queues.
1024 */
1025
1026 object = first_object;
1027 offset = first_offset;
1028 first_m = VM_PAGE_NULL;
1029 access_required = fault_type;
1030
1031 /*
1032 * default type of fault
1033 */
1034 my_fault = DBG_CACHE_HIT_FAULT;
1035
1036 while (TRUE) {
1037 #if TRACEFAULTPAGE
1038 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1039 #endif
1040
1041 grab_options = 0;
1042 #if CONFIG_SECLUDED_MEMORY
1043 if (object->can_grab_secluded) {
1044 grab_options |= VM_PAGE_GRAB_SECLUDED;
1045 }
1046 #endif /* CONFIG_SECLUDED_MEMORY */
1047
1048 if (!object->alive) {
1049 /*
1050 * object is no longer valid
1051 * clean up and return error
1052 */
1053 vm_fault_cleanup(object, first_m);
1054 thread_interrupt_level(interruptible_state);
1055
1056 return VM_FAULT_MEMORY_ERROR;
1057 }
1058
1059 if (!object->pager_created && object->phys_contiguous) {
1060 /*
1061 * A physically-contiguous object without a pager:
1062 * must be a "large page" object. We do not deal
1063 * with VM pages for this object.
1064 */
1065 caller_lookup = FALSE;
1066 m = VM_PAGE_NULL;
1067 goto phys_contig_object;
1068 }
1069
1070 if (object->blocked_access) {
1071 /*
1072 * Access to this VM object has been blocked.
1073 * Replace our "paging_in_progress" reference with
1074 * a "activity_in_progress" reference and wait for
1075 * access to be unblocked.
1076 */
1077 caller_lookup = FALSE; /* no longer valid after sleep */
1078 vm_object_activity_begin(object);
1079 vm_object_paging_end(object);
1080 while (object->blocked_access) {
1081 vm_object_sleep(object,
1082 VM_OBJECT_EVENT_UNBLOCKED,
1083 THREAD_UNINT);
1084 }
1085 vm_fault_page_blocked_access++;
1086 vm_object_paging_begin(object);
1087 vm_object_activity_end(object);
1088 }
1089
1090 /*
1091 * See whether the page at 'offset' is resident
1092 */
1093 if (caller_lookup == TRUE) {
1094 /*
1095 * The caller has already looked up the page
1096 * and gave us the result in "result_page".
1097 * We can use this for the first lookup but
1098 * it loses its validity as soon as we unlock
1099 * the object.
1100 */
1101 m = *result_page;
1102 caller_lookup = FALSE; /* no longer valid after that */
1103 } else {
1104 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1105 }
1106 #if TRACEFAULTPAGE
1107 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1108 #endif
1109 if (m != VM_PAGE_NULL) {
1110 if (m->vmp_busy) {
1111 /*
1112 * The page is being brought in,
1113 * wait for it and then retry.
1114 */
1115 #if TRACEFAULTPAGE
1116 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1117 #endif
1118 wait_result = PAGE_SLEEP(object, m, interruptible);
1119
1120 counter(c_vm_fault_page_block_busy_kernel++);
1121
1122 if (wait_result != THREAD_AWAKENED) {
1123 vm_fault_cleanup(object, first_m);
1124 thread_interrupt_level(interruptible_state);
1125
1126 if (wait_result == THREAD_RESTART) {
1127 return VM_FAULT_RETRY;
1128 } else {
1129 return VM_FAULT_INTERRUPTED;
1130 }
1131 }
1132 continue;
1133 }
1134 if (m->vmp_laundry) {
1135 m->vmp_free_when_done = FALSE;
1136
1137 if (!m->vmp_cleaning) {
1138 vm_pageout_steal_laundry(m, FALSE);
1139 }
1140 }
1141 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1142 /*
1143 * Guard page: off limits !
1144 */
1145 if (fault_type == VM_PROT_NONE) {
1146 /*
1147 * The fault is not requesting any
1148 * access to the guard page, so it must
1149 * be just to wire or unwire it.
1150 * Let's pretend it succeeded...
1151 */
1152 m->vmp_busy = TRUE;
1153 *result_page = m;
1154 assert(first_m == VM_PAGE_NULL);
1155 *top_page = first_m;
1156 if (type_of_fault) {
1157 *type_of_fault = DBG_GUARD_FAULT;
1158 }
1159 thread_interrupt_level(interruptible_state);
1160 return VM_FAULT_SUCCESS;
1161 } else {
1162 /*
1163 * The fault requests access to the
1164 * guard page: let's deny that !
1165 */
1166 vm_fault_cleanup(object, first_m);
1167 thread_interrupt_level(interruptible_state);
1168 return VM_FAULT_MEMORY_ERROR;
1169 }
1170 }
1171
1172 if (m->vmp_error) {
1173 /*
1174 * The page is in error, give up now.
1175 */
1176 #if TRACEFAULTPAGE
1177 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1178 #endif
1179 if (error_code) {
1180 *error_code = KERN_MEMORY_ERROR;
1181 }
1182 VM_PAGE_FREE(m);
1183
1184 vm_fault_cleanup(object, first_m);
1185 thread_interrupt_level(interruptible_state);
1186
1187 return VM_FAULT_MEMORY_ERROR;
1188 }
1189 if (m->vmp_restart) {
1190 /*
1191 * The pager wants us to restart
1192 * at the top of the chain,
1193 * typically because it has moved the
1194 * page to another pager, then do so.
1195 */
1196 #if TRACEFAULTPAGE
1197 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1198 #endif
1199 VM_PAGE_FREE(m);
1200
1201 vm_fault_cleanup(object, first_m);
1202 thread_interrupt_level(interruptible_state);
1203
1204 return VM_FAULT_RETRY;
1205 }
1206 if (m->vmp_absent) {
1207 /*
1208 * The page isn't busy, but is absent,
1209 * therefore it's deemed "unavailable".
1210 *
1211 * Remove the non-existent page (unless it's
1212 * in the top object) and move on down to the
1213 * next object (if there is one).
1214 */
1215 #if TRACEFAULTPAGE
1216 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1217 #endif
1218 next_object = object->shadow;
1219
1220 if (next_object == VM_OBJECT_NULL) {
1221 /*
1222 * Absent page at bottom of shadow
1223 * chain; zero fill the page we left
1224 * busy in the first object, and free
1225 * the absent page.
1226 */
1227 assert(!must_be_resident);
1228
1229 /*
1230 * check for any conditions that prevent
1231 * us from creating a new zero-fill page
1232 * vm_fault_check will do all of the
1233 * fault cleanup in the case of an error condition
1234 * including resetting the thread_interrupt_level
1235 */
1236 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1237
1238 if (error != VM_FAULT_SUCCESS) {
1239 return error;
1240 }
1241
1242 if (object != first_object) {
1243 /*
1244 * free the absent page we just found
1245 */
1246 VM_PAGE_FREE(m);
1247
1248 /*
1249 * drop reference and lock on current object
1250 */
1251 vm_object_paging_end(object);
1252 vm_object_unlock(object);
1253
1254 /*
1255 * grab the original page we
1256 * 'soldered' in place and
1257 * retake lock on 'first_object'
1258 */
1259 m = first_m;
1260 first_m = VM_PAGE_NULL;
1261
1262 object = first_object;
1263 offset = first_offset;
1264
1265 vm_object_lock(object);
1266 } else {
1267 /*
1268 * we're going to use the absent page we just found
1269 * so convert it to a 'busy' page
1270 */
1271 m->vmp_absent = FALSE;
1272 m->vmp_busy = TRUE;
1273 }
1274 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1275 m->vmp_absent = TRUE;
1276 }
1277 /*
1278 * zero-fill the page and put it on
1279 * the correct paging queue
1280 */
1281 my_fault = vm_fault_zero_page(m, no_zero_fill);
1282
1283 break;
1284 } else {
1285 if (must_be_resident) {
1286 vm_object_paging_end(object);
1287 } else if (object != first_object) {
1288 vm_object_paging_end(object);
1289 VM_PAGE_FREE(m);
1290 } else {
1291 first_m = m;
1292 m->vmp_absent = FALSE;
1293 m->vmp_busy = TRUE;
1294
1295 vm_page_lockspin_queues();
1296 vm_page_queues_remove(m, FALSE);
1297 vm_page_unlock_queues();
1298 }
1299
1300 offset += object->vo_shadow_offset;
1301 fault_info->lo_offset += object->vo_shadow_offset;
1302 fault_info->hi_offset += object->vo_shadow_offset;
1303 access_required = VM_PROT_READ;
1304
1305 vm_object_lock(next_object);
1306 vm_object_unlock(object);
1307 object = next_object;
1308 vm_object_paging_begin(object);
1309
1310 /*
1311 * reset to default type of fault
1312 */
1313 my_fault = DBG_CACHE_HIT_FAULT;
1314
1315 continue;
1316 }
1317 }
1318 if ((m->vmp_cleaning)
1319 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1320 && (fault_type & VM_PROT_WRITE)) {
1321 /*
1322 * This is a copy-on-write fault that will
1323 * cause us to revoke access to this page, but
1324 * this page is in the process of being cleaned
1325 * in a clustered pageout. We must wait until
1326 * the cleaning operation completes before
1327 * revoking access to the original page,
1328 * otherwise we might attempt to remove a
1329 * wired mapping.
1330 */
1331 #if TRACEFAULTPAGE
1332 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1333 #endif
1334 /*
1335 * take an extra ref so that object won't die
1336 */
1337 vm_object_reference_locked(object);
1338
1339 vm_fault_cleanup(object, first_m);
1340
1341 counter(c_vm_fault_page_block_backoff_kernel++);
1342 vm_object_lock(object);
1343 assert(object->ref_count > 0);
1344
1345 m = vm_page_lookup(object, vm_object_trunc_page(offset));
1346
1347 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1348 PAGE_ASSERT_WAIT(m, interruptible);
1349
1350 vm_object_unlock(object);
1351 wait_result = thread_block(THREAD_CONTINUE_NULL);
1352 vm_object_deallocate(object);
1353
1354 goto backoff;
1355 } else {
1356 vm_object_unlock(object);
1357
1358 vm_object_deallocate(object);
1359 thread_interrupt_level(interruptible_state);
1360
1361 return VM_FAULT_RETRY;
1362 }
1363 }
1364 if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1365 !(fault_info != NULL && fault_info->stealth)) {
1366 /*
1367 * If we were passed a non-NULL pointer for
1368 * "type_of_fault", than we came from
1369 * vm_fault... we'll let it deal with
1370 * this condition, since it
1371 * needs to see m->vmp_speculative to correctly
1372 * account the pageins, otherwise...
1373 * take it off the speculative queue, we'll
1374 * let the caller of vm_fault_page deal
1375 * with getting it onto the correct queue
1376 *
1377 * If the caller specified in fault_info that
1378 * it wants a "stealth" fault, we also leave
1379 * the page in the speculative queue.
1380 */
1381 vm_page_lockspin_queues();
1382 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1383 vm_page_queues_remove(m, FALSE);
1384 }
1385 vm_page_unlock_queues();
1386 }
1387 assert(object == VM_PAGE_OBJECT(m));
1388
1389 if (object->code_signed) {
1390 /*
1391 * CODE SIGNING:
1392 * We just paged in a page from a signed
1393 * memory object but we don't need to
1394 * validate it now. We'll validate it
1395 * when it gets mapped into a user address
1396 * space for the first time or when the page
1397 * gets copied to another object as a result
1398 * of a copy-on-write.
1399 */
1400 }
1401
1402 /*
1403 * We mark the page busy and leave it on
1404 * the pageout queues. If the pageout
1405 * daemon comes across it, then it will
1406 * remove the page from the queue, but not the object
1407 */
1408 #if TRACEFAULTPAGE
1409 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1410 #endif
1411 assert(!m->vmp_busy);
1412 assert(!m->vmp_absent);
1413
1414 m->vmp_busy = TRUE;
1415 break;
1416 }
1417
1418
1419 /*
1420 * we get here when there is no page present in the object at
1421 * the offset we're interested in... we'll allocate a page
1422 * at this point if the pager associated with
1423 * this object can provide the data or we're the top object...
1424 * object is locked; m == NULL
1425 */
1426
1427 if (must_be_resident) {
1428 if (fault_type == VM_PROT_NONE &&
1429 object == kernel_object) {
1430 /*
1431 * We've been called from vm_fault_unwire()
1432 * while removing a map entry that was allocated
1433 * with KMA_KOBJECT and KMA_VAONLY. This page
1434 * is not present and there's nothing more to
1435 * do here (nothing to unwire).
1436 */
1437 vm_fault_cleanup(object, first_m);
1438 thread_interrupt_level(interruptible_state);
1439
1440 return VM_FAULT_MEMORY_ERROR;
1441 }
1442
1443 goto dont_look_for_page;
1444 }
1445
1446 /* Don't expect to fault pages into the kernel object. */
1447 assert(object != kernel_object);
1448
1449 data_supply = FALSE;
1450
1451 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1452
1453 #if TRACEFAULTPAGE
1454 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1455 #endif
1456 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1457 /*
1458 * Allocate a new page for this object/offset pair as a placeholder
1459 */
1460 m = vm_page_grab_options(grab_options);
1461 #if TRACEFAULTPAGE
1462 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1463 #endif
1464 if (m == VM_PAGE_NULL) {
1465 vm_fault_cleanup(object, first_m);
1466 thread_interrupt_level(interruptible_state);
1467
1468 return VM_FAULT_MEMORY_SHORTAGE;
1469 }
1470
1471 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1472 vm_page_insert_internal(m, object,
1473 vm_object_trunc_page(offset),
1474 VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1475 } else {
1476 vm_page_insert(m, object, vm_object_trunc_page(offset));
1477 }
1478 }
1479 if (look_for_page) {
1480 kern_return_t rc;
1481 int my_fault_type;
1482
1483 /*
1484 * If the memory manager is not ready, we
1485 * cannot make requests.
1486 */
1487 if (!object->pager_ready) {
1488 #if TRACEFAULTPAGE
1489 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1490 #endif
1491 if (m != VM_PAGE_NULL) {
1492 VM_PAGE_FREE(m);
1493 }
1494
1495 /*
1496 * take an extra ref so object won't die
1497 */
1498 vm_object_reference_locked(object);
1499 vm_fault_cleanup(object, first_m);
1500 counter(c_vm_fault_page_block_backoff_kernel++);
1501
1502 vm_object_lock(object);
1503 assert(object->ref_count > 0);
1504
1505 if (!object->pager_ready) {
1506 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1507
1508 vm_object_unlock(object);
1509 if (wait_result == THREAD_WAITING) {
1510 wait_result = thread_block(THREAD_CONTINUE_NULL);
1511 }
1512 vm_object_deallocate(object);
1513
1514 goto backoff;
1515 } else {
1516 vm_object_unlock(object);
1517 vm_object_deallocate(object);
1518 thread_interrupt_level(interruptible_state);
1519
1520 return VM_FAULT_RETRY;
1521 }
1522 }
1523 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1524 /*
1525 * If there are too many outstanding page
1526 * requests pending on this external object, we
1527 * wait for them to be resolved now.
1528 */
1529 #if TRACEFAULTPAGE
1530 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1531 #endif
1532 if (m != VM_PAGE_NULL) {
1533 VM_PAGE_FREE(m);
1534 }
1535 /*
1536 * take an extra ref so object won't die
1537 */
1538 vm_object_reference_locked(object);
1539
1540 vm_fault_cleanup(object, first_m);
1541
1542 counter(c_vm_fault_page_block_backoff_kernel++);
1543
1544 vm_object_lock(object);
1545 assert(object->ref_count > 0);
1546
1547 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1548 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1549
1550 vm_object_unlock(object);
1551 wait_result = thread_block(THREAD_CONTINUE_NULL);
1552 vm_object_deallocate(object);
1553
1554 goto backoff;
1555 } else {
1556 vm_object_unlock(object);
1557 vm_object_deallocate(object);
1558 thread_interrupt_level(interruptible_state);
1559
1560 return VM_FAULT_RETRY;
1561 }
1562 }
1563 if (object->internal) {
1564 int compressed_count_delta;
1565
1566 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1567
1568 if (m == VM_PAGE_NULL) {
1569 /*
1570 * Allocate a new page for this object/offset pair as a placeholder
1571 */
1572 m = vm_page_grab_options(grab_options);
1573 #if TRACEFAULTPAGE
1574 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1575 #endif
1576 if (m == VM_PAGE_NULL) {
1577 vm_fault_cleanup(object, first_m);
1578 thread_interrupt_level(interruptible_state);
1579
1580 return VM_FAULT_MEMORY_SHORTAGE;
1581 }
1582
1583 m->vmp_absent = TRUE;
1584 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1585 vm_page_insert_internal(m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1586 } else {
1587 vm_page_insert(m, object, vm_object_trunc_page(offset));
1588 }
1589 }
1590 assert(m->vmp_busy);
1591
1592 m->vmp_absent = TRUE;
1593 pager = object->pager;
1594
1595 assert(object->paging_in_progress > 0);
1596 vm_object_unlock(object);
1597
1598 rc = vm_compressor_pager_get(
1599 pager,
1600 offset + object->paging_offset,
1601 VM_PAGE_GET_PHYS_PAGE(m),
1602 &my_fault_type,
1603 0,
1604 &compressed_count_delta);
1605
1606 if (type_of_fault == NULL) {
1607 int throttle_delay;
1608
1609 /*
1610 * we weren't called from vm_fault, so we
1611 * need to apply page creation throttling
1612 * do it before we re-acquire any locks
1613 */
1614 if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1615 if ((throttle_delay = vm_page_throttled(TRUE))) {
1616 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1617 delay(throttle_delay);
1618 }
1619 }
1620 }
1621 vm_object_lock(object);
1622 assert(object->paging_in_progress > 0);
1623
1624 vm_compressor_pager_count(
1625 pager,
1626 compressed_count_delta,
1627 FALSE, /* shared_lock */
1628 object);
1629
1630 switch (rc) {
1631 case KERN_SUCCESS:
1632 m->vmp_absent = FALSE;
1633 m->vmp_dirty = TRUE;
1634 if ((object->wimg_bits &
1635 VM_WIMG_MASK) !=
1636 VM_WIMG_USE_DEFAULT) {
1637 /*
1638 * If the page is not cacheable,
1639 * we can't let its contents
1640 * linger in the data cache
1641 * after the decompression.
1642 */
1643 pmap_sync_page_attributes_phys(
1644 VM_PAGE_GET_PHYS_PAGE(m));
1645 } else {
1646 m->vmp_written_by_kernel = TRUE;
1647 }
1648
1649 /*
1650 * If the object is purgeable, its
1651 * owner's purgeable ledgers have been
1652 * updated in vm_page_insert() but the
1653 * page was also accounted for in a
1654 * "compressed purgeable" ledger, so
1655 * update that now.
1656 */
1657 if (((object->purgable !=
1658 VM_PURGABLE_DENY) ||
1659 object->vo_ledger_tag) &&
1660 (object->vo_owner !=
1661 NULL)) {
1662 /*
1663 * One less compressed
1664 * purgeable/tagged page.
1665 */
1666 vm_object_owner_compressed_update(
1667 object,
1668 -1);
1669 }
1670
1671 break;
1672 case KERN_MEMORY_FAILURE:
1673 m->vmp_unusual = TRUE;
1674 m->vmp_error = TRUE;
1675 m->vmp_absent = FALSE;
1676 break;
1677 case KERN_MEMORY_ERROR:
1678 assert(m->vmp_absent);
1679 break;
1680 default:
1681 panic("vm_fault_page(): unexpected "
1682 "error %d from "
1683 "vm_compressor_pager_get()\n",
1684 rc);
1685 }
1686 PAGE_WAKEUP_DONE(m);
1687
1688 rc = KERN_SUCCESS;
1689 goto data_requested;
1690 }
1691 my_fault_type = DBG_PAGEIN_FAULT;
1692
1693 if (m != VM_PAGE_NULL) {
1694 VM_PAGE_FREE(m);
1695 m = VM_PAGE_NULL;
1696 }
1697
1698 #if TRACEFAULTPAGE
1699 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1700 #endif
1701
1702 /*
1703 * It's possible someone called vm_object_destroy while we weren't
1704 * holding the object lock. If that has happened, then bail out
1705 * here.
1706 */
1707
1708 pager = object->pager;
1709
1710 if (pager == MEMORY_OBJECT_NULL) {
1711 vm_fault_cleanup(object, first_m);
1712 thread_interrupt_level(interruptible_state);
1713 return VM_FAULT_MEMORY_ERROR;
1714 }
1715
1716 /*
1717 * We have an absent page in place for the faulting offset,
1718 * so we can release the object lock.
1719 */
1720
1721 if (object->object_is_shared_cache) {
1722 set_thread_rwlock_boost();
1723 }
1724
1725 vm_object_unlock(object);
1726
1727 /*
1728 * If this object uses a copy_call strategy,
1729 * and we are interested in a copy of this object
1730 * (having gotten here only by following a
1731 * shadow chain), then tell the memory manager
1732 * via a flag added to the desired_access
1733 * parameter, so that it can detect a race
1734 * between our walking down the shadow chain
1735 * and its pushing pages up into a copy of
1736 * the object that it manages.
1737 */
1738 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1739 wants_copy_flag = VM_PROT_WANTS_COPY;
1740 } else {
1741 wants_copy_flag = VM_PROT_NONE;
1742 }
1743
1744 if (object->copy == first_object) {
1745 /*
1746 * if we issue the memory_object_data_request in
1747 * this state, we are subject to a deadlock with
1748 * the underlying filesystem if it is trying to
1749 * shrink the file resulting in a push of pages
1750 * into the copy object... that push will stall
1751 * on the placeholder page, and if the pushing thread
1752 * is holding a lock that is required on the pagein
1753 * path (such as a truncate lock), we'll deadlock...
1754 * to avoid this potential deadlock, we throw away
1755 * our placeholder page before calling memory_object_data_request
1756 * and force this thread to retry the vm_fault_page after
1757 * we have issued the I/O. the second time through this path
1758 * we will find the page already in the cache (presumably still
1759 * busy waiting for the I/O to complete) and then complete
1760 * the fault w/o having to go through memory_object_data_request again
1761 */
1762 assert(first_m != VM_PAGE_NULL);
1763 assert(VM_PAGE_OBJECT(first_m) == first_object);
1764
1765 vm_object_lock(first_object);
1766 VM_PAGE_FREE(first_m);
1767 vm_object_paging_end(first_object);
1768 vm_object_unlock(first_object);
1769
1770 first_m = VM_PAGE_NULL;
1771 force_fault_retry = TRUE;
1772
1773 vm_fault_page_forced_retry++;
1774 }
1775
1776 if (data_already_requested == TRUE) {
1777 orig_behavior = fault_info->behavior;
1778 orig_cluster_size = fault_info->cluster_size;
1779
1780 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1781 fault_info->cluster_size = PAGE_SIZE;
1782 }
1783 /*
1784 * Call the memory manager to retrieve the data.
1785 */
1786 rc = memory_object_data_request(
1787 pager,
1788 vm_object_trunc_page(offset) + object->paging_offset,
1789 PAGE_SIZE,
1790 access_required | wants_copy_flag,
1791 (memory_object_fault_info_t)fault_info);
1792
1793 if (data_already_requested == TRUE) {
1794 fault_info->behavior = orig_behavior;
1795 fault_info->cluster_size = orig_cluster_size;
1796 } else {
1797 data_already_requested = TRUE;
1798 }
1799
1800 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1801 #if TRACEFAULTPAGE
1802 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1803 #endif
1804 vm_object_lock(object);
1805
1806 if (object->object_is_shared_cache) {
1807 clear_thread_rwlock_boost();
1808 }
1809
1810 data_requested:
1811 if (rc != KERN_SUCCESS) {
1812 vm_fault_cleanup(object, first_m);
1813 thread_interrupt_level(interruptible_state);
1814
1815 return (rc == MACH_SEND_INTERRUPTED) ?
1816 VM_FAULT_INTERRUPTED :
1817 VM_FAULT_MEMORY_ERROR;
1818 } else {
1819 clock_sec_t tv_sec;
1820 clock_usec_t tv_usec;
1821
1822 if (my_fault_type == DBG_PAGEIN_FAULT) {
1823 clock_get_system_microtime(&tv_sec, &tv_usec);
1824 current_thread()->t_page_creation_time = tv_sec;
1825 current_thread()->t_page_creation_count = 0;
1826 }
1827 }
1828 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1829 vm_fault_cleanup(object, first_m);
1830 thread_interrupt_level(interruptible_state);
1831
1832 return VM_FAULT_INTERRUPTED;
1833 }
1834 if (force_fault_retry == TRUE) {
1835 vm_fault_cleanup(object, first_m);
1836 thread_interrupt_level(interruptible_state);
1837
1838 return VM_FAULT_RETRY;
1839 }
1840 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1841 /*
1842 * No page here means that the object we
1843 * initially looked up was "physically
1844 * contiguous" (i.e. device memory). However,
1845 * with Virtual VRAM, the object might not
1846 * be backed by that device memory anymore,
1847 * so we're done here only if the object is
1848 * still "phys_contiguous".
1849 * Otherwise, if the object is no longer
1850 * "phys_contiguous", we need to retry the
1851 * page fault against the object's new backing
1852 * store (different memory object).
1853 */
1854 phys_contig_object:
1855 goto done;
1856 }
1857 /*
1858 * potentially a pagein fault
1859 * if we make it through the state checks
1860 * above, then we'll count it as such
1861 */
1862 my_fault = my_fault_type;
1863
1864 /*
1865 * Retry with same object/offset, since new data may
1866 * be in a different page (i.e., m is meaningless at
1867 * this point).
1868 */
1869 continue;
1870 }
1871 dont_look_for_page:
1872 /*
1873 * We get here if the object has no pager, or an existence map
1874 * exists and indicates the page isn't present on the pager
1875 * or we're unwiring a page. If a pager exists, but there
1876 * is no existence map, then the m->vmp_absent case above handles
1877 * the ZF case when the pager can't provide the page
1878 */
1879 #if TRACEFAULTPAGE
1880 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1881 #endif
1882 if (object == first_object) {
1883 first_m = m;
1884 } else {
1885 assert(m == VM_PAGE_NULL);
1886 }
1887
1888 next_object = object->shadow;
1889
1890 if (next_object == VM_OBJECT_NULL) {
1891 /*
1892 * we've hit the bottom of the shadow chain,
1893 * fill the page in the top object with zeros.
1894 */
1895 assert(!must_be_resident);
1896
1897 if (object != first_object) {
1898 vm_object_paging_end(object);
1899 vm_object_unlock(object);
1900
1901 object = first_object;
1902 offset = first_offset;
1903 vm_object_lock(object);
1904 }
1905 m = first_m;
1906 assert(VM_PAGE_OBJECT(m) == object);
1907 first_m = VM_PAGE_NULL;
1908
1909 /*
1910 * check for any conditions that prevent
1911 * us from creating a new zero-fill page.
1912 * vm_fault_check will do all of the
1913 * fault cleanup in the case of an error condition,
1914 * including resetting the thread_interrupt_level.
1915 */
1916 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1917
1918 if (error != VM_FAULT_SUCCESS) {
1919 return error;
1920 }
1921
1922 if (m == VM_PAGE_NULL) {
1923 m = vm_page_grab_options(grab_options);
1924
1925 if (m == VM_PAGE_NULL) {
1926 vm_fault_cleanup(object, VM_PAGE_NULL);
1927 thread_interrupt_level(interruptible_state);
1928
1929 return VM_FAULT_MEMORY_SHORTAGE;
1930 }
1931 vm_page_insert(m, object, vm_object_trunc_page(offset));
1932 }
1933 if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1934 m->vmp_absent = TRUE;
1935 }
1936
1937 my_fault = vm_fault_zero_page(m, no_zero_fill);
1938
1939 break;
1940 } else {
1941 /*
1942 * Move on to the next object. Lock the next
1943 * object before unlocking the current one.
1944 */
1945 if ((object != first_object) || must_be_resident) {
1946 vm_object_paging_end(object);
1947 }
1948
1949 offset += object->vo_shadow_offset;
1950 fault_info->lo_offset += object->vo_shadow_offset;
1951 fault_info->hi_offset += object->vo_shadow_offset;
1952 access_required = VM_PROT_READ;
1953
1954 vm_object_lock(next_object);
1955 vm_object_unlock(object);
1956
1957 object = next_object;
1958 vm_object_paging_begin(object);
1959 }
1960 }
1961
1962 /*
1963 * PAGE HAS BEEN FOUND.
1964 *
1965 * This page (m) is:
1966 * busy, so that we can play with it;
1967 * not absent, so that nobody else will fill it;
1968 * possibly eligible for pageout;
1969 *
1970 * The top-level page (first_m) is:
1971 * VM_PAGE_NULL if the page was found in the
1972 * top-level object;
1973 * busy, not absent, and ineligible for pageout.
1974 *
1975 * The current object (object) is locked. A paging
1976 * reference is held for the current and top-level
1977 * objects.
1978 */
1979
1980 #if TRACEFAULTPAGE
1981 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1982 #endif
1983 #if EXTRA_ASSERTIONS
1984 assert(m->vmp_busy && !m->vmp_absent);
1985 assert((first_m == VM_PAGE_NULL) ||
1986 (first_m->vmp_busy && !first_m->vmp_absent &&
1987 !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1988 #endif /* EXTRA_ASSERTIONS */
1989
1990 /*
1991 * If the page is being written, but isn't
1992 * already owned by the top-level object,
1993 * we have to copy it into a new page owned
1994 * by the top-level object.
1995 */
1996 if (object != first_object) {
1997 #if TRACEFAULTPAGE
1998 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1999 #endif
2000 if (fault_type & VM_PROT_WRITE) {
2001 vm_page_t copy_m;
2002
2003 /*
2004 * We only really need to copy if we
2005 * want to write it.
2006 */
2007 assert(!must_be_resident);
2008
2009 /*
2010 * If we try to collapse first_object at this
2011 * point, we may deadlock when we try to get
2012 * the lock on an intermediate object (since we
2013 * have the bottom object locked). We can't
2014 * unlock the bottom object, because the page
2015 * we found may move (by collapse) if we do.
2016 *
2017 * Instead, we first copy the page. Then, when
2018 * we have no more use for the bottom object,
2019 * we unlock it and try to collapse.
2020 *
2021 * Note that we copy the page even if we didn't
2022 * need to... that's the breaks.
2023 */
2024
2025 /*
2026 * Allocate a page for the copy
2027 */
2028 copy_m = vm_page_grab_options(grab_options);
2029
2030 if (copy_m == VM_PAGE_NULL) {
2031 RELEASE_PAGE(m);
2032
2033 vm_fault_cleanup(object, first_m);
2034 thread_interrupt_level(interruptible_state);
2035
2036 return VM_FAULT_MEMORY_SHORTAGE;
2037 }
2038
2039 vm_page_copy(m, copy_m);
2040
2041 /*
2042 * If another map is truly sharing this
2043 * page with us, we have to flush all
2044 * uses of the original page, since we
2045 * can't distinguish those which want the
2046 * original from those which need the
2047 * new copy.
2048 *
2049 * XXXO If we know that only one map has
2050 * access to this page, then we could
2051 * avoid the pmap_disconnect() call.
2052 */
2053 if (m->vmp_pmapped) {
2054 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2055 }
2056
2057 if (m->vmp_clustered) {
2058 VM_PAGE_COUNT_AS_PAGEIN(m);
2059 VM_PAGE_CONSUME_CLUSTERED(m);
2060 }
2061 assert(!m->vmp_cleaning);
2062
2063 /*
2064 * We no longer need the old page or object.
2065 */
2066 RELEASE_PAGE(m);
2067
2068 /*
2069 * This check helps with marking the object as having a sequential pattern.
2070 * Normally we'd miss doing this below, because this fault is a COW into
2071 * the first_object: we bring the page in from disk and push it to the object
2072 * above, but never update the file object's sequential pattern.
2073 */
2074 if (object->internal == FALSE) {
2075 vm_fault_is_sequential(object, offset, fault_info->behavior);
2076 }
2077
2078 vm_object_paging_end(object);
2079 vm_object_unlock(object);
2080
2081 my_fault = DBG_COW_FAULT;
2082 VM_STAT_INCR(cow_faults);
2083 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2084 current_task()->cow_faults++;
2085
2086 object = first_object;
2087 offset = first_offset;
2088
2089 vm_object_lock(object);
2090 /*
2091 * get rid of the place holder
2092 * page that we soldered in earlier
2093 */
2094 VM_PAGE_FREE(first_m);
2095 first_m = VM_PAGE_NULL;
2096
2097 /*
2098 * and replace it with the
2099 * page we just copied into
2100 */
2101 assert(copy_m->vmp_busy);
2102 vm_page_insert(copy_m, object, vm_object_trunc_page(offset));
2103 SET_PAGE_DIRTY(copy_m, TRUE);
2104
2105 m = copy_m;
2106 /*
2107 * Now that we've gotten the copy out of the
2108 * way, let's try to collapse the top object.
2109 * But we have to play ugly games with
2110 * paging_in_progress to do that...
2111 */
2112 vm_object_paging_end(object);
2113 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2114 vm_object_paging_begin(object);
2115 } else {
2116 *protection &= (~VM_PROT_WRITE);
2117 }
2118 }
2119 /*
2120 * Now check whether the page needs to be pushed into the
2121 * copy object. The use of asymmetric copy on write for
2122 * shared temporary objects means that we may do two copies to
2123 * satisfy the fault; one above to get the page from a
2124 * shadowed object, and one here to push it into the copy.
2125 */
2126 try_failed_count = 0;
2127
2128 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2129 vm_object_offset_t copy_offset;
2130 vm_page_t copy_m;
2131
2132 #if TRACEFAULTPAGE
2133 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2134 #endif
2135 /*
2136 * If the page is being written, but hasn't been
2137 * copied to the copy-object, we have to copy it there.
2138 */
2139 if ((fault_type & VM_PROT_WRITE) == 0) {
2140 *protection &= ~VM_PROT_WRITE;
2141 break;
2142 }
2143
2144 /*
2145 * If the page was guaranteed to be resident,
2146 * we must have already performed the copy.
2147 */
2148 if (must_be_resident) {
2149 break;
2150 }
2151
2152 /*
2153 * Try to get the lock on the copy_object.
2154 */
2155 if (!vm_object_lock_try(copy_object)) {
2156 vm_object_unlock(object);
2157 try_failed_count++;
2158
2159 mutex_pause(try_failed_count); /* wait a bit */
2160 vm_object_lock(object);
2161
2162 continue;
2163 }
2164 try_failed_count = 0;
2165
2166 /*
2167 * Make another reference to the copy-object,
2168 * to keep it from disappearing during the
2169 * copy.
2170 */
2171 vm_object_reference_locked(copy_object);
2172
2173 /*
2174 * Does the page exist in the copy?
2175 */
2176 copy_offset = first_offset - copy_object->vo_shadow_offset;
2177 copy_offset = vm_object_trunc_page(copy_offset);
2178
2179 if (copy_object->vo_size <= copy_offset) {
2180 /*
2181 * Copy object doesn't cover this page -- do nothing.
2182 */
2183 ;
2184 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2185 /*
2186 * Page currently exists in the copy object
2187 */
2188 if (copy_m->vmp_busy) {
2189 /*
2190 * If the page is being brought
2191 * in, wait for it and then retry.
2192 */
2193 RELEASE_PAGE(m);
2194
2195 /*
2196 * take an extra ref so object won't die
2197 */
2198 vm_object_reference_locked(copy_object);
2199 vm_object_unlock(copy_object);
2200 vm_fault_cleanup(object, first_m);
2201 counter(c_vm_fault_page_block_backoff_kernel++);
2202
2203 vm_object_lock(copy_object);
2204 assert(copy_object->ref_count > 0);
2205 VM_OBJ_RES_DECR(copy_object);
2206 vm_object_lock_assert_exclusive(copy_object);
2207 copy_object->ref_count--;
2208 assert(copy_object->ref_count > 0);
2209 copy_m = vm_page_lookup(copy_object, copy_offset);
2210
2211 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2212 PAGE_ASSERT_WAIT(copy_m, interruptible);
2213
2214 vm_object_unlock(copy_object);
2215 wait_result = thread_block(THREAD_CONTINUE_NULL);
2216 vm_object_deallocate(copy_object);
2217
2218 goto backoff;
2219 } else {
2220 vm_object_unlock(copy_object);
2221 vm_object_deallocate(copy_object);
2222 thread_interrupt_level(interruptible_state);
2223
2224 return VM_FAULT_RETRY;
2225 }
2226 }
2227 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2228 /*
2229 * If PAGED_OUT is TRUE, then the page used to exist
2230 * in the copy-object, and has already been paged out.
2231 * We don't need to repeat this. If PAGED_OUT is
2232 * FALSE, then either we don't know (!pager_created,
2233 * for example) or it hasn't been paged out.
2234 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2235 * We must copy the page to the copy object.
2236 *
2237 * Allocate a page for the copy
2238 */
2239 copy_m = vm_page_alloc(copy_object, copy_offset);
2240
2241 if (copy_m == VM_PAGE_NULL) {
2242 RELEASE_PAGE(m);
2243
2244 VM_OBJ_RES_DECR(copy_object);
2245 vm_object_lock_assert_exclusive(copy_object);
2246 copy_object->ref_count--;
2247 assert(copy_object->ref_count > 0);
2248
2249 vm_object_unlock(copy_object);
2250 vm_fault_cleanup(object, first_m);
2251 thread_interrupt_level(interruptible_state);
2252
2253 return VM_FAULT_MEMORY_SHORTAGE;
2254 }
2255 /*
2256 * Must copy page into copy-object.
2257 */
2258 vm_page_copy(m, copy_m);
2259
2260 /*
2261 * If the old page was in use by any users
2262 * of the copy-object, it must be removed
2263 * from all pmaps. (We can't know which
2264 * pmaps use it.)
2265 */
2266 if (m->vmp_pmapped) {
2267 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2268 }
2269
2270 if (m->vmp_clustered) {
2271 VM_PAGE_COUNT_AS_PAGEIN(m);
2272 VM_PAGE_CONSUME_CLUSTERED(m);
2273 }
2274 /*
2275 * If there's a pager, then immediately
2276 * page out this page, using the "initialize"
2277 * option. Else, we use the copy.
2278 */
2279 if ((!copy_object->pager_ready)
2280 || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2281 ) {
2282 vm_page_lockspin_queues();
2283 assert(!m->vmp_cleaning);
2284 vm_page_activate(copy_m);
2285 vm_page_unlock_queues();
2286
2287 SET_PAGE_DIRTY(copy_m, TRUE);
2288 PAGE_WAKEUP_DONE(copy_m);
2289 } else {
2290 assert(copy_m->vmp_busy == TRUE);
2291 assert(!m->vmp_cleaning);
2292
2293 /*
2294 * dirty is protected by the object lock
2295 */
2296 SET_PAGE_DIRTY(copy_m, TRUE);
2297
2298 /*
2299 * The page is already ready for pageout:
2300 * not on pageout queues and busy.
2301 * Unlock everything except the
2302 * copy_object itself.
2303 */
2304 vm_object_unlock(object);
2305
2306 /*
2307 * Write the page to the copy-object,
2308 * flushing it from the kernel.
2309 */
2310 vm_pageout_initialize_page(copy_m);
2311
2312 /*
2313 * Since the pageout may have
2314 * temporarily dropped the
2315 * copy_object's lock, we
2316 * check whether we'll have
2317 * to deallocate the hard way.
2318 */
2319 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2320 vm_object_unlock(copy_object);
2321 vm_object_deallocate(copy_object);
2322 vm_object_lock(object);
2323
2324 continue;
2325 }
2326 /*
2327 * Pick back up the old object's
2328 * lock. [It is safe to do so,
2329 * since it must be deeper in the
2330 * object tree.]
2331 */
2332 vm_object_lock(object);
2333 }
2334
2335 /*
2336 * Because we're pushing a page upward
2337 * in the object tree, we must restart
2338 * any faults that are waiting here.
2339 * [Note that this is an expansion of
2340 * PAGE_WAKEUP that uses the THREAD_RESTART
2341 * wait result]. Can't turn off the page's
2342 * busy bit because we're not done with it.
2343 */
2344 if (m->vmp_wanted) {
2345 m->vmp_wanted = FALSE;
2346 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2347 }
2348 }
2349 /*
2350 * The reference count on copy_object must be
2351 * at least 2: one for our extra reference,
2352 * and at least one from the outside world
2353 * (we checked that when we last locked
2354 * copy_object).
2355 */
2356 vm_object_lock_assert_exclusive(copy_object);
2357 copy_object->ref_count--;
2358 assert(copy_object->ref_count > 0);
2359
2360 VM_OBJ_RES_DECR(copy_object);
2361 vm_object_unlock(copy_object);
2362
2363 break;
2364 }
2365
2366 done:
2367 *result_page = m;
2368 *top_page = first_m;
2369
2370 if (m != VM_PAGE_NULL) {
2371 assert(VM_PAGE_OBJECT(m) == object);
2372
2373 retval = VM_FAULT_SUCCESS;
2374
2375 if (my_fault == DBG_PAGEIN_FAULT) {
2376 VM_PAGE_COUNT_AS_PAGEIN(m);
2377
2378 if (object->internal) {
2379 my_fault = DBG_PAGEIND_FAULT;
2380 } else {
2381 my_fault = DBG_PAGEINV_FAULT;
2382 }
2383
2384 /*
2385 * evaluate access pattern and update state
2386 * vm_fault_deactivate_behind depends on the
2387 * state being up to date
2388 */
2389 vm_fault_is_sequential(object, offset, fault_info->behavior);
2390 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2391 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2392 /*
2393 * we weren't called from vm_fault, so handle the
2394 * accounting here for hits in the cache
2395 */
2396 if (m->vmp_clustered) {
2397 VM_PAGE_COUNT_AS_PAGEIN(m);
2398 VM_PAGE_CONSUME_CLUSTERED(m);
2399 }
2400 vm_fault_is_sequential(object, offset, fault_info->behavior);
2401 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2402 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2403 VM_STAT_DECOMPRESSIONS();
2404 }
2405 if (type_of_fault) {
2406 *type_of_fault = my_fault;
2407 }
2408 } else {
2409 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2410 assert(first_m == VM_PAGE_NULL);
2411 assert(object == first_object);
2412 }
2413
2414 thread_interrupt_level(interruptible_state);
2415
2416 #if TRACEFAULTPAGE
2417 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2418 #endif
2419 return retval;
2420
2421 backoff:
2422 thread_interrupt_level(interruptible_state);
2423
2424 if (wait_result == THREAD_INTERRUPTED) {
2425 return VM_FAULT_INTERRUPTED;
2426 }
2427 return VM_FAULT_RETRY;
2428
2429 #undef RELEASE_PAGE
2430 }
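
/*
 * Editor's sketch: the shadow-chain walk in vm_fault_page() above looks for
 * a resident page in each object down the chain and falls back to zero-fill
 * in the top object when it reaches the bottom.  The following is a minimal,
 * self-contained user-space model of that idea; the sketch_* structures and
 * functions are hypothetical and are not kernel interfaces.
 */
#if 0 /* illustrative sketch only -- not part of the kernel build */
#include <stdint.h>
#include <stdlib.h>

struct sketch_page {
	uint64_t offset;                /* offset within its object */
	struct sketch_page *next;
	char data[4096];
};

struct sketch_object {
	struct sketch_object *shadow;   /* backing object, or NULL at the bottom */
	uint64_t shadow_offset;         /* bias added when descending to the shadow */
	struct sketch_page *pages;      /* list of resident pages */
};

static struct sketch_page *
sketch_lookup(struct sketch_object *obj, uint64_t offset)
{
	for (struct sketch_page *p = obj->pages; p != NULL; p = p->next) {
		if (p->offset == offset) {
			return p;
		}
	}
	return NULL;
}

static struct sketch_page *
sketch_fault_page(struct sketch_object *first, uint64_t first_offset)
{
	struct sketch_object *obj = first;
	uint64_t offset = first_offset;

	for (;;) {
		struct sketch_page *p = sketch_lookup(obj, offset);
		if (p != NULL) {
			return p;       /* found a resident page somewhere in the chain */
		}
		if (obj->shadow == NULL) {
			/* bottom of the chain: zero-fill a page in the top object */
			p = calloc(1, sizeof(*p));
			if (p == NULL) {
				return NULL;    /* memory shortage */
			}
			p->offset = first_offset;
			p->next = first->pages;
			first->pages = p;
			return p;
		}
		offset += obj->shadow_offset;
		obj = obj->shadow;
	}
}
#endif /* illustrative sketch */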
2431
2432
2433 extern int panic_on_cs_killed;
2434 extern int proc_selfpid(void);
2435 extern char *proc_name_address(void *p);
2436 unsigned long cs_enter_tainted_rejected = 0;
2437 unsigned long cs_enter_tainted_accepted = 0;
2438
2439 /*
2440 * CODE SIGNING:
2441 * When soft faulting a page, we have to validate the page if:
2442 * 1. the page is being mapped in user space
2443 * 2. the page hasn't already been found to be "tainted"
2444 * 3. the page belongs to a code-signed object
2445 * 4. the page has not been validated yet or has been mapped for write.
2446 */
2447 static bool
2448 vm_fault_cs_need_validation(
2449 pmap_t pmap,
2450 vm_page_t page,
2451 vm_object_t page_obj,
2452 vm_map_size_t fault_page_size,
2453 vm_map_offset_t fault_phys_offset)
2454 {
2455 if (pmap == kernel_pmap) {
2456 /* 1 - not user space */
2457 return false;
2458 }
2459 if (!page_obj->code_signed) {
2460 /* 3 - page does not belong to a code-signed object */
2461 return false;
2462 }
2463 if (fault_page_size == PAGE_SIZE) {
2464 /* looking at the whole page */
2465 assertf(fault_phys_offset == 0,
2466 "fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2467 (uint64_t)fault_page_size,
2468 (uint64_t)fault_phys_offset);
2469 if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2470 /* 2 - page is all tainted */
2471 return false;
2472 }
2473 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2474 !page->vmp_wpmapped) {
2475 /* 4 - already fully validated and never mapped writable */
2476 return false;
2477 }
2478 } else {
2479 /* looking at a specific sub-page */
2480 if (VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
2481 /* 2 - sub-page was already marked as tainted */
2482 return false;
2483 }
2484 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) &&
2485 !page->vmp_wpmapped) {
2486 /* 4 - already validated and never mapped writable */
2487 return false;
2488 }
2489 }
2490 /* page needs to be validated */
2491 return true;
2492 }
2493
2494
2495 static bool
2496 vm_fault_cs_page_immutable(
2497 vm_page_t m,
2498 vm_map_size_t fault_page_size,
2499 vm_map_offset_t fault_phys_offset,
2500 vm_prot_t prot __unused)
2501 {
2502 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)
2503 /*&& ((prot) & VM_PROT_EXECUTE)*/) {
2504 return true;
2505 }
2506 return false;
2507 }
2508
2509 static bool
2510 vm_fault_cs_page_nx(
2511 vm_page_t m,
2512 vm_map_size_t fault_page_size,
2513 vm_map_offset_t fault_phys_offset)
2514 {
2515 return VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2516 }
2517
2518 /*
2519 * Check if the page being entered into the pmap violates code signing.
2520 */
2521 static kern_return_t
2522 vm_fault_cs_check_violation(
2523 bool cs_bypass,
2524 vm_object_t object,
2525 vm_page_t m,
2526 pmap_t pmap,
2527 vm_prot_t prot,
2528 vm_prot_t caller_prot,
2529 vm_map_size_t fault_page_size,
2530 vm_map_offset_t fault_phys_offset,
2531 vm_object_fault_info_t fault_info,
2532 bool map_is_switched,
2533 bool map_is_switch_protected,
2534 bool *cs_violation)
2535 {
2536 #if !PMAP_CS
2537 #pragma unused(caller_prot)
2538 #pragma unused(fault_info)
2539 #endif /* !PMAP_CS */
2540 int cs_enforcement_enabled;
2541 if (!cs_bypass &&
2542 vm_fault_cs_need_validation(pmap, m, object,
2543 fault_page_size, fault_phys_offset)) {
2544 vm_object_lock_assert_exclusive(object);
2545
2546 if (VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset)) {
2547 vm_cs_revalidates++;
2548 }
2549
2550 /* VM map is locked, so 1 ref will remain on the VM object,
2551 * so no harm if vm_page_validate_cs drops the object lock */
2552
2553 #if PMAP_CS
2554 if (fault_info->pmap_cs_associated &&
2555 pmap_cs_enforced(pmap) &&
2556 !VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2557 !VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) &&
2558 !VMP_CS_NX(m, fault_page_size, fault_phys_offset) &&
2559 (prot & VM_PROT_EXECUTE) &&
2560 (caller_prot & VM_PROT_EXECUTE)) {
2561 /*
2562 * With pmap_cs, the pmap layer will validate the
2563 * code signature for any executable pmap mapping.
2564 * No need for us to validate this page too:
2565 * in pmap_cs we trust...
2566 */
2567 vm_cs_defer_to_pmap_cs++;
2568 } else {
2569 vm_cs_defer_to_pmap_cs_not++;
2570 vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2571 }
2572 #else /* PMAP_CS */
2573 vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2574 #endif /* PMAP_CS */
2575 }
2576
2577 /* If the map is switched, and is switch-protected, we must protect
2578 * some pages from being write-faulted: immutable pages because by
2579 * definition they may not be written, and executable pages because that
2580 * would provide a way to inject unsigned code.
2581 * If the page is immutable, we can simply return. However, we can't
2582 * immediately determine whether a page is executable anywhere. But,
2583 * we can disconnect it everywhere and remove the executable protection
2584 * from the current map. We do that below right before we do the
2585 * PMAP_ENTER.
2586 */
2587 if (pmap == kernel_pmap) {
2588 /* kernel fault: cs_enforcement does not apply */
2589 cs_enforcement_enabled = 0;
2590 } else {
2591 cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2592 }
2593
2594 if (cs_enforcement_enabled && map_is_switched &&
2595 map_is_switch_protected &&
2596 vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2597 (prot & VM_PROT_WRITE)) {
2598 return KERN_CODESIGN_ERROR;
2599 }
2600
2601 if (cs_enforcement_enabled &&
2602 vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2603 (prot & VM_PROT_EXECUTE)) {
2604 if (cs_debug) {
2605 printf("page marked to be NX, not letting it be mapped EXEC\n");
2606 }
2607 return KERN_CODESIGN_ERROR;
2608 }
2609
2610 /* A page could be tainted, or pose a risk of being tainted later.
2611 * Check whether the receiving process wants it, and make it feel
2612 * the consequences (that happens in cs_invalid_page()).
2613 * For CS Enforcement, two other conditions will
2614 * cause that page to be tainted as well:
2615 * - pmapping an unsigned page executable - this means unsigned code;
2616 * - writeable mapping of a validated page - the content of that page
2617 * can be changed without the kernel noticing, therefore unsigned
2618 * code can be created
2619 */
2620 if (cs_bypass) {
2621 /* code-signing is bypassed */
2622 *cs_violation = FALSE;
2623 } else if (VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2624 /* tainted page */
2625 *cs_violation = TRUE;
2626 } else if (!cs_enforcement_enabled) {
2627 /* no further code-signing enforcement */
2628 *cs_violation = FALSE;
2629 } else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2630 ((prot & VM_PROT_WRITE) ||
2631 m->vmp_wpmapped)) {
2632 /*
2633 * The page should be immutable, but is in danger of being
2634 * modified.
2635 * This is the case where we want policy from the code
2636 * directory - is the page immutable or not? For now we have
2637 * to assume that code pages will be immutable, data pages not.
2638 * We'll assume a page is a code page if it has a code directory
2639 * and we fault for execution.
2640 * That is good enough since if we faulted the code page for
2641 * writing in another map before, it is wpmapped; if we fault
2642 * it for writing in this map later it will also be faulted for
2643 * executing at the same time; and if we fault for writing in
2644 * another map later, we will disconnect it from this pmap so
2645 * we'll notice the change.
2646 */
2647 *cs_violation = TRUE;
2648 } else if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2649 (prot & VM_PROT_EXECUTE)
2650 #if PMAP_CS
2651 /*
2652 * Executable pages will be validated by pmap_cs;
2653 * in pmap_cs we trust...
2654 * If pmap_cs is turned off, this is a code-signing
2655 * violation.
2656 */
2657 && !(pmap_cs_enforced(pmap))
2658 #endif /* PMAP_CS */
2659 ) {
2660 *cs_violation = TRUE;
2661 } else {
2662 *cs_violation = FALSE;
2663 }
2664 return KERN_SUCCESS;
2665 }
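
/*
 * Editor's sketch: the decision that sets *cs_violation above reduces to a
 * small predicate over the page's code-signing state and the requested
 * access.  A minimal stand-alone model, assuming hypothetical field names
 * (this is not the kernel's data layout), is shown below; the ordering of
 * the checks mirrors vm_fault_cs_check_violation().
 */
#if 0 /* illustrative sketch only -- not part of the kernel build */
#include <stdbool.h>

struct cs_page_model {
	bool tainted;           /* page already known to be tainted */
	bool validated;         /* signature has been checked */
	bool immutable;         /* page is supposed to be immutable */
	bool wpmapped;          /* page has been mapped writable before */
};

static bool
cs_violation_model(bool cs_bypass, bool cs_enforcement_enabled,
    struct cs_page_model pg, bool want_write, bool want_execute)
{
	if (cs_bypass) {
		return false;   /* code signing explicitly bypassed */
	}
	if (pg.tainted) {
		return true;    /* tainted pages always violate */
	}
	if (!cs_enforcement_enabled) {
		return false;   /* nothing to enforce for this map */
	}
	if (pg.immutable && (want_write || pg.wpmapped)) {
		return true;    /* an "immutable" page is at risk of modification */
	}
	if (!pg.validated && want_execute) {
		return true;    /* executing unvalidated code */
	}
	return false;
}
#endif /* illustrative sketch */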
2666
2667 /*
2668 * Handles a code signing violation by either rejecting the page or forcing a disconnect.
2669 * @param must_disconnect This value will be set to true if the caller must disconnect
2670 * this page.
2671 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2672 */
2673 static kern_return_t
2674 vm_fault_cs_handle_violation(
2675 vm_object_t object,
2676 vm_page_t m,
2677 pmap_t pmap,
2678 vm_prot_t prot,
2679 vm_map_offset_t vaddr,
2680 vm_map_size_t fault_page_size,
2681 vm_map_offset_t fault_phys_offset,
2682 bool map_is_switched,
2683 bool map_is_switch_protected,
2684 bool *must_disconnect)
2685 {
2686 #if !MACH_ASSERT
2687 #pragma unused(pmap)
2688 #pragma unused(map_is_switch_protected)
2689 #endif /* !MACH_ASSERT */
2690 /*
2691 * We will have a tainted page. Have to handle the special case
2692 * of a switched map now. If the map is not switched, standard
2693 * procedure applies - call cs_invalid_page().
2694 * If the map is switched, the real owner is invalid already.
2695 * There is no point in invalidating the switching process since
2696 * it will not be executing from the map. So we don't call
2697 * cs_invalid_page() in that case.
2698 */
2699 boolean_t reject_page, cs_killed;
2700 kern_return_t kr;
2701 if (map_is_switched) {
2702 assert(pmap == vm_map_pmap(current_thread()->map));
2703 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2704 reject_page = FALSE;
2705 } else {
2706 if (cs_debug > 5) {
2707 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2708 object->code_signed ? "yes" : "no",
2709 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2710 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2711 m->vmp_wpmapped ? "yes" : "no",
2712 (int)prot);
2713 }
2714 reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2715 }
2716
2717 if (reject_page) {
2718 /* reject the invalid page: abort the page fault */
2719 int pid;
2720 const char *procname;
2721 task_t task;
2722 vm_object_t file_object, shadow;
2723 vm_object_offset_t file_offset;
2724 char *pathname, *filename;
2725 vm_size_t pathname_len, filename_len;
2726 boolean_t truncated_path;
2727 #define __PATH_MAX 1024
2728 struct timespec mtime, cs_mtime;
2729 int shadow_depth;
2730 os_reason_t codesigning_exit_reason = OS_REASON_NULL;
2731
2732 kr = KERN_CODESIGN_ERROR;
2733 cs_enter_tainted_rejected++;
2734
2735 /* get process name and pid */
2736 procname = "?";
2737 task = current_task();
2738 pid = proc_selfpid();
2739 if (task->bsd_info != NULL) {
2740 procname = proc_name_address(task->bsd_info);
2741 }
2742
2743 /* get file's VM object */
2744 file_object = object;
2745 file_offset = m->vmp_offset;
2746 for (shadow = file_object->shadow,
2747 shadow_depth = 0;
2748 shadow != VM_OBJECT_NULL;
2749 shadow = file_object->shadow,
2750 shadow_depth++) {
2751 vm_object_lock_shared(shadow);
2752 if (file_object != object) {
2753 vm_object_unlock(file_object);
2754 }
2755 file_offset += file_object->vo_shadow_offset;
2756 file_object = shadow;
2757 }
2758
2759 mtime.tv_sec = 0;
2760 mtime.tv_nsec = 0;
2761 cs_mtime.tv_sec = 0;
2762 cs_mtime.tv_nsec = 0;
2763
2764 /* get file's pathname and/or filename */
2765 pathname = NULL;
2766 filename = NULL;
2767 pathname_len = 0;
2768 filename_len = 0;
2769 truncated_path = FALSE;
2770 /* no pager -> no file -> no pathname, use "<nil>" in that case */
2771 if (file_object->pager != NULL) {
2772 pathname = kheap_alloc(KHEAP_TEMP, __PATH_MAX * 2, Z_WAITOK);
2773 if (pathname) {
2774 pathname[0] = '\0';
2775 pathname_len = __PATH_MAX;
2776 filename = pathname + pathname_len;
2777 filename_len = __PATH_MAX;
2778
2779 if (vnode_pager_get_object_name(file_object->pager,
2780 pathname,
2781 pathname_len,
2782 filename,
2783 filename_len,
2784 &truncated_path) == KERN_SUCCESS) {
2785 /* safety first... */
2786 pathname[__PATH_MAX - 1] = '\0';
2787 filename[__PATH_MAX - 1] = '\0';
2788
2789 vnode_pager_get_object_mtime(file_object->pager,
2790 &mtime,
2791 &cs_mtime);
2792 } else {
2793 kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
2794 pathname = NULL;
2795 filename = NULL;
2796 pathname_len = 0;
2797 filename_len = 0;
2798 truncated_path = FALSE;
2799 }
2800 }
2801 }
2802 printf("CODE SIGNING: process %d[%s]: "
2803 "rejecting invalid page at address 0x%llx "
2804 "from offset 0x%llx in file \"%s%s%s\" "
2805 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2806 "(signed:%d validated:%d tainted:%d nx:%d "
2807 "wpmapped:%d dirty:%d depth:%d)\n",
2808 pid, procname, (addr64_t) vaddr,
2809 file_offset,
2810 (pathname ? pathname : "<nil>"),
2811 (truncated_path ? "/.../" : ""),
2812 (truncated_path ? filename : ""),
2813 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2814 ((cs_mtime.tv_sec == mtime.tv_sec &&
2815 cs_mtime.tv_nsec == mtime.tv_nsec)
2816 ? "=="
2817 : "!="),
2818 mtime.tv_sec, mtime.tv_nsec,
2819 object->code_signed,
2820 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2821 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2822 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2823 m->vmp_wpmapped,
2824 m->vmp_dirty,
2825 shadow_depth);
2826
2827 /*
2828 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2829 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2830 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2831 * will deal with the segmentation fault.
2832 */
2833 if (cs_killed) {
2834 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2835 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2836
2837 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2838 if (codesigning_exit_reason == NULL) {
2839 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2840 } else {
2841 mach_vm_address_t data_addr = 0;
2842 struct codesigning_exit_reason_info *ceri = NULL;
2843 uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2844
2845 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2846 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2847 } else {
2848 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2849 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2850 ceri = (struct codesigning_exit_reason_info *)data_addr;
2851 static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2852
2853 ceri->ceri_virt_addr = vaddr;
2854 ceri->ceri_file_offset = file_offset;
2855 if (pathname) {
2856 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2857 } else {
2858 ceri->ceri_pathname[0] = '\0';
2859 }
2860 if (filename) {
2861 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2862 } else {
2863 ceri->ceri_filename[0] = '\0';
2864 }
2865 ceri->ceri_path_truncated = (truncated_path ? 1 : 0);
2866 ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2867 ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2868 ceri->ceri_page_modtime_secs = mtime.tv_sec;
2869 ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2870 ceri->ceri_object_codesigned = (object->code_signed);
2871 ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset);
2872 ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset);
2873 ceri->ceri_page_codesig_nx = VMP_CS_NX(m, fault_page_size, fault_phys_offset);
2874 ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2875 ceri->ceri_page_slid = 0;
2876 ceri->ceri_page_dirty = (m->vmp_dirty);
2877 ceri->ceri_page_shadow_depth = shadow_depth;
2878 } else {
2879 #if DEBUG || DEVELOPMENT
2880 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2881 #else
2882 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2883 #endif /* DEBUG || DEVELOPMENT */
2884 /* Free the buffer */
2885 os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2886 }
2887 }
2888 }
2889
2890 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2891 }
2892 if (panic_on_cs_killed &&
2893 object->object_is_shared_cache) {
2894 char *tainted_contents;
2895 vm_map_offset_t src_vaddr;
2896 src_vaddr = (vm_map_offset_t) phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
2897 tainted_contents = kalloc(PAGE_SIZE);
2898 bcopy((const char *)src_vaddr, tainted_contents, PAGE_SIZE);
2899 printf("CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
2900 panic("CODE SIGNING: process %d[%s]: "
2901 "rejecting invalid page (phys#0x%x) at address 0x%llx "
2902 "from offset 0x%llx in file \"%s%s%s\" "
2903 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2904 "(signed:%d validated:%d tainted:%d nx:%d "
2905 "wpmapped:%d dirty:%d depth:%d)\n",
2906 pid, procname,
2907 VM_PAGE_GET_PHYS_PAGE(m),
2908 (addr64_t) vaddr,
2909 file_offset,
2910 (pathname ? pathname : "<nil>"),
2911 (truncated_path ? "/.../" : ""),
2912 (truncated_path ? filename : ""),
2913 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2914 ((cs_mtime.tv_sec == mtime.tv_sec &&
2915 cs_mtime.tv_nsec == mtime.tv_nsec)
2916 ? "=="
2917 : "!="),
2918 mtime.tv_sec, mtime.tv_nsec,
2919 object->code_signed,
2920 VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
2921 VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
2922 VMP_CS_NX(m, fault_page_size, fault_phys_offset),
2923 m->vmp_wpmapped,
2924 m->vmp_dirty,
2925 shadow_depth);
2926 }
2927
2928 if (file_object != object) {
2929 vm_object_unlock(file_object);
2930 }
2931 if (pathname_len != 0) {
2932 kheap_free(KHEAP_TEMP, pathname, __PATH_MAX * 2);
2933 pathname = NULL;
2934 filename = NULL;
2935 }
2936 } else {
2937 /* proceed with the invalid page */
2938 kr = KERN_SUCCESS;
2939 if (!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2940 !object->code_signed) {
2941 /*
2942 * This page has not been (fully) validated but
2943 * does not belong to a code-signed object
2944 * so it should not be forcefully considered
2945 * as tainted.
2946 * We're just concerned about it here because
2947 * we've been asked to "execute" it but that
2948 * does not mean that it should cause other
2949 * accesses to fail.
2950 * This happens when a debugger sets a
2951 * breakpoint and we then execute code in
2952 * that page. Marking the page as "tainted"
2953 * would cause any inspection tool ("leaks",
2954 * "vmmap", "CrashReporter", ...) to get killed
2955 * due to code-signing violation on that page,
2956 * even though they're just reading it and not
2957 * executing from it.
2958 */
2959 } else {
2960 /*
2961 * Page might have been tainted before or not;
2962 * now it definitively is. If the page wasn't
2963 * tainted, we must disconnect it from all
2964 * pmaps later, to force existing mappings
2965 * through that code path for re-consideration
2966 * of the validity of that page.
2967 */
2968 if (!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset)) {
2969 *must_disconnect = TRUE;
2970 VMP_CS_SET_TAINTED(m, fault_page_size, fault_phys_offset, TRUE);
2971 }
2972 }
2973 cs_enter_tainted_accepted++;
2974 }
2975 if (kr != KERN_SUCCESS) {
2976 if (cs_debug) {
2977 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2978 "*** INVALID PAGE ***\n",
2979 (long long)vaddr);
2980 }
2981 #if !SECURE_KERNEL
2982 if (cs_enforcement_panic) {
2983 panic("CODESIGNING: panicking on invalid page\n");
2984 }
2985 #endif
2986 }
2987 return kr;
2988 }
2989
2990 /*
2991 * Check that the code signature is valid for the given page being inserted into
2992 * the pmap.
2993 *
2994 * @param must_disconnect This value will be set to true if the caller must disconnect
2995 * this page.
2996 * @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2997 */
2998 static kern_return_t
2999 vm_fault_validate_cs(
3000 bool cs_bypass,
3001 vm_object_t object,
3002 vm_page_t m,
3003 pmap_t pmap,
3004 vm_map_offset_t vaddr,
3005 vm_prot_t prot,
3006 vm_prot_t caller_prot,
3007 vm_map_size_t fault_page_size,
3008 vm_map_offset_t fault_phys_offset,
3009 vm_object_fault_info_t fault_info,
3010 bool *must_disconnect)
3011 {
3012 bool map_is_switched, map_is_switch_protected, cs_violation;
3013 kern_return_t kr;
3014 /* Validate code signature if necessary. */
3015 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3016 (pmap == vm_map_pmap(current_thread()->map)));
3017 map_is_switch_protected = current_thread()->map->switch_protect;
3018 kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3019 prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3020 map_is_switched, map_is_switch_protected, &cs_violation);
3021 if (kr != KERN_SUCCESS) {
3022 return kr;
3023 }
3024 if (cs_violation) {
3025 kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3026 fault_page_size, fault_phys_offset,
3027 map_is_switched, map_is_switch_protected, must_disconnect);
3028 }
3029 return kr;
3030 }
3031
3032 /*
3033 * Enqueue the page on the appropriate paging queue.
3034 */
3035 static void
3036 vm_fault_enqueue_page(
3037 vm_object_t object,
3038 vm_page_t m,
3039 bool wired,
3040 bool change_wiring,
3041 vm_tag_t wire_tag,
3042 bool no_cache,
3043 int *type_of_fault,
3044 kern_return_t kr)
3045 {
3046 assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
3047 boolean_t page_queues_locked = FALSE;
3048 boolean_t previously_pmapped = m->vmp_pmapped;
3049 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
3050 MACRO_BEGIN \
3051 if (! page_queues_locked) { \
3052 page_queues_locked = TRUE; \
3053 vm_page_lockspin_queues(); \
3054 } \
3055 MACRO_END
3056 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
3057 MACRO_BEGIN \
3058 if (page_queues_locked) { \
3059 page_queues_locked = FALSE; \
3060 vm_page_unlock_queues(); \
3061 } \
3062 MACRO_END
3063
3064 #if CONFIG_BACKGROUND_QUEUE
3065 vm_page_update_background_state(m);
3066 #endif
3067 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3068 /*
3069 * Compressor pages are neither wired
3070 * nor pageable and should never change.
3071 */
3072 assert(object == compressor_object);
3073 } else if (change_wiring) {
3074 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3075
3076 if (wired) {
3077 if (kr == KERN_SUCCESS) {
3078 vm_page_wire(m, wire_tag, TRUE);
3079 }
3080 } else {
3081 vm_page_unwire(m, TRUE);
3082 }
3083 /* we keep the page queues lock, if we need it later */
3084 } else {
3085 if (object->internal == TRUE) {
3086 /*
3087 * don't allow anonymous pages on
3088 * the speculative queues
3089 */
3090 no_cache = FALSE;
3091 }
3092 if (kr != KERN_SUCCESS) {
3093 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3094 vm_page_deactivate(m);
3095 /* we keep the page queues lock, if we need it later */
3096 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3097 (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3098 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3099 ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3100 !VM_PAGE_WIRED(m)) {
3101 if (vm_page_local_q &&
3102 (*type_of_fault == DBG_COW_FAULT ||
3103 *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3104 struct vpl *lq;
3105 uint32_t lid;
3106
3107 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3108
3109 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3110 vm_object_lock_assert_exclusive(object);
3111
3112 /*
3113 * we got a local queue to stuff this
3114 * new page on...
3115 * it's safe to manipulate local and
3116 * local_id at this point since we're
3117 * behind an exclusive object lock and
3118 * the page is not on any global queue.
3119 *
3120 * we'll use the current cpu number to
3121 * select the queue... note that we don't
3122 * need to disable preemption... we're
3123 * going to be behind the local queue's
3124 * lock to do the real work
3125 */
3126 lid = cpu_number();
3127
3128 lq = zpercpu_get_cpu(vm_page_local_q, lid);
3129
3130 VPL_LOCK(&lq->vpl_lock);
3131
3132 vm_page_check_pageable_safe(m);
3133 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3134 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3135 m->vmp_local_id = lid;
3136 lq->vpl_count++;
3137
3138 if (object->internal) {
3139 lq->vpl_internal_count++;
3140 } else {
3141 lq->vpl_external_count++;
3142 }
3143
3144 VPL_UNLOCK(&lq->vpl_lock);
3145
3146 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3147 /*
3148 * we're beyond the soft limit
3149 * for the local queue
3150 * vm_page_reactivate_local will
3151 * 'try' to take the global page
3152 * queue lock... if it can't
3153 * that's ok... we'll let the
3154 * queue continue to grow up
3155 * to the hard limit... at that
3156 * point we'll wait for the
3157 * lock... once we've got the
3158 * lock, we'll transfer all of
3159 * the pages from the local
3160 * queue to the global active
3161 * queue
3162 */
3163 vm_page_reactivate_local(lid, FALSE, FALSE);
3164 }
3165 } else {
3166 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3167
3168 /*
3169 * test again now that we hold the
3170 * page queue lock
3171 */
3172 if (!VM_PAGE_WIRED(m)) {
3173 if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3174 vm_page_queues_remove(m, FALSE);
3175
3176 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3177 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3178 }
3179
3180 if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3181 no_cache) {
3182 /*
3183 * If this is a no_cache mapping
3184 * and the page has never been
3185 * mapped before or was
3186 * previously a no_cache page,
3187 * then we want to leave pages
3188 * in the speculative state so
3189 * that they can be readily
3190 * recycled if free memory runs
3191 * low. Otherwise the page is
3192 * activated as normal.
3193 */
3194
3195 if (no_cache &&
3196 (!previously_pmapped ||
3197 m->vmp_no_cache)) {
3198 m->vmp_no_cache = TRUE;
3199
3200 if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3201 vm_page_speculate(m, FALSE);
3202 }
3203 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3204 vm_page_activate(m);
3205 }
3206 }
3207 }
3208 /* we keep the page queues lock, if we need it later */
3209 }
3210 }
3211 }
3212 /* we're done with the page queues lock, if we ever took it */
3213 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3214 }
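
/*
 * Editor's sketch: the local-queue path above batches freshly faulted pages
 * on a per-cpu list and only *tries* to take the global queue lock once a
 * soft limit is exceeded, so the fault path never blocks on the global lock.
 * A minimal user-space model of that pattern, with hypothetical names and a
 * pthread mutex standing in for the queue locks, is shown below.
 */
#if 0 /* illustrative sketch only -- not part of the kernel build */
#include <pthread.h>
#include <stddef.h>

#define LOCAL_SOFT_LIMIT 16

struct sketch_node {
	struct sketch_node *next;
};

struct sketch_local_q {
	pthread_mutex_t lock;
	struct sketch_node *head;
	unsigned count;
};

struct sketch_global_q {
	pthread_mutex_t lock;
	struct sketch_node *head;
};

static void
sketch_local_enqueue(struct sketch_local_q *lq, struct sketch_global_q *gq,
    struct sketch_node *n)
{
	pthread_mutex_lock(&lq->lock);
	n->next = lq->head;
	lq->head = n;
	lq->count++;

	if (lq->count > LOCAL_SOFT_LIMIT &&
	    pthread_mutex_trylock(&gq->lock) == 0) {
		/* got the global lock without blocking: drain the local list */
		struct sketch_node *batch = lq->head;
		lq->head = NULL;
		lq->count = 0;
		while (batch != NULL) {
			struct sketch_node *next = batch->next;
			batch->next = gq->head;
			gq->head = batch;
			batch = next;
		}
		pthread_mutex_unlock(&gq->lock);
	}
	/* if the trylock failed, the local list simply keeps growing for now */
	pthread_mutex_unlock(&lq->lock);
}
#endif /* illustrative sketch */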
3215
3216 /*
3217 * Sets the pmapped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3218 * @return true if the page needs to be sync'ed via pmap_sync_page_data_phys
3219 * before being inserted into the pmap.
3220 */
3221 static bool
3222 vm_fault_enter_set_mapped(
3223 vm_object_t object,
3224 vm_page_t m,
3225 vm_prot_t prot,
3226 vm_prot_t fault_type)
3227 {
3228 bool page_needs_sync = false;
3229 /*
3230 * NOTE: we may only hold the vm_object lock SHARED
3231 * at this point, so we need the phys_page lock to
3232 * properly serialize updating the pmapped and
3233 * xpmapped bits
3234 */
3235 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3236 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3237
3238 pmap_lock_phys_page(phys_page);
3239 m->vmp_pmapped = TRUE;
3240
3241 if (!m->vmp_xpmapped) {
3242 m->vmp_xpmapped = TRUE;
3243
3244 pmap_unlock_phys_page(phys_page);
3245
3246 if (!object->internal) {
3247 OSAddAtomic(1, &vm_page_xpmapped_external_count);
3248 }
3249
3250 #if defined(__arm__) || defined(__arm64__)
3251 page_needs_sync = true;
3252 #else
3253 if (object->internal &&
3254 object->pager != NULL) {
3255 /*
3256 * This page could have been
3257 * uncompressed by the
3258 * compressor pager and its
3259 * contents might be only in
3260 * the data cache.
3261 * Since it's being mapped for
3262 * "execute" for the first time,
3263 * make sure the icache is in
3264 * sync.
3265 */
3266 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3267 page_needs_sync = true;
3268 }
3269 #endif
3270 } else {
3271 pmap_unlock_phys_page(phys_page);
3272 }
3273 } else {
3274 if (m->vmp_pmapped == FALSE) {
3275 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3276
3277 pmap_lock_phys_page(phys_page);
3278 m->vmp_pmapped = TRUE;
3279 pmap_unlock_phys_page(phys_page);
3280 }
3281 }
3282
3283 if (fault_type & VM_PROT_WRITE) {
3284 if (m->vmp_wpmapped == FALSE) {
3285 vm_object_lock_assert_exclusive(object);
3286 if (!object->internal && object->pager) {
3287 task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3288 }
3289 m->vmp_wpmapped = TRUE;
3290 }
3291 }
3292 return page_needs_sync;
3293 }
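
/*
 * Editor's sketch: because the object lock may only be held shared here, the
 * pmapped/xpmapped bits above are serialized with the finer-grained phys_page
 * lock instead.  A minimal model of that double-checked update follows; the
 * structure and names are hypothetical, with a pthread mutex standing in for
 * the phys_page lock.
 */
#if 0 /* illustrative sketch only -- not part of the kernel build */
#include <pthread.h>
#include <stdbool.h>

struct sketch_pg {
	pthread_mutex_t phys_lock;      /* stand-in for the phys_page lock */
	bool pmapped;
	bool xpmapped;
};

static bool
sketch_mark_exec_mapping(struct sketch_pg *pg)
{
	bool first_exec_mapping = false;

	pthread_mutex_lock(&pg->phys_lock);
	pg->pmapped = true;
	if (!pg->xpmapped) {
		pg->xpmapped = true;
		first_exec_mapping = true;      /* caller may need to sync the icache */
	}
	pthread_mutex_unlock(&pg->phys_lock);

	return first_exec_mapping;
}
#endif /* illustrative sketch */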
3294
3295 /*
3296 * Try to enter the given page into the pmap.
3297 * Will retry without execute permission iff PMAP_CS is enabled and we encounter
3298 * a codesigning failure on a non-execute fault.
3299 */
3300 static kern_return_t
3301 vm_fault_attempt_pmap_enter(
3302 pmap_t pmap,
3303 vm_map_offset_t vaddr,
3304 vm_map_size_t fault_page_size,
3305 vm_map_offset_t fault_phys_offset,
3306 vm_page_t m,
3307 vm_prot_t *prot,
3308 vm_prot_t caller_prot,
3309 vm_prot_t fault_type,
3310 bool wired,
3311 int pmap_options)
3312 {
3313 #if !PMAP_CS
3314 #pragma unused(caller_prot)
3315 #endif /* !PMAP_CS */
3316 kern_return_t kr;
3317 if (fault_page_size != PAGE_SIZE) {
3318 DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3319 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3320 fault_phys_offset < PAGE_SIZE),
3321 "0x%llx\n", (uint64_t)fault_phys_offset);
3322 } else {
3323 assertf(fault_phys_offset == 0,
3324 "0x%llx\n", (uint64_t)fault_phys_offset);
3325 }
3326
3327 PMAP_ENTER_OPTIONS(pmap, vaddr,
3328 fault_phys_offset,
3329 m, *prot, fault_type, 0,
3330 wired,
3331 pmap_options,
3332 kr);
3333 #if PMAP_CS
3334 /*
3335 * Retry without execute permission if we encountered a codesigning
3336 * failure on a non-execute fault. This allows applications which
3337 * don't actually need to execute code to still map it for read access.
3338 */
3339 if ((kr == KERN_CODESIGN_ERROR) && pmap_cs_enforced(pmap) &&
3340 (*prot & VM_PROT_EXECUTE) && !(caller_prot & VM_PROT_EXECUTE)) {
3341 *prot &= ~VM_PROT_EXECUTE;
3342 PMAP_ENTER_OPTIONS(pmap, vaddr,
3343 fault_phys_offset,
3344 m, *prot, fault_type, 0,
3345 wired,
3346 pmap_options,
3347 kr);
3348 }
3349 #endif
3350 return kr;
3351 }
3352
3353 /*
3354 * Enter the given page into the pmap.
3355 * The map must be locked shared.
3356 * The vm object must NOT be locked.
3357 *
3358 * @param need_retry if not null, avoid making a (potentially) blocking call into
3359 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3360 */
3361 static kern_return_t
3362 vm_fault_pmap_enter(
3363 pmap_t pmap,
3364 vm_map_offset_t vaddr,
3365 vm_map_size_t fault_page_size,
3366 vm_map_offset_t fault_phys_offset,
3367 vm_page_t m,
3368 vm_prot_t *prot,
3369 vm_prot_t caller_prot,
3370 vm_prot_t fault_type,
3371 bool wired,
3372 int pmap_options,
3373 boolean_t *need_retry)
3374 {
3375 kern_return_t kr;
3376 if (need_retry != NULL) {
3377 /*
3378 * Although we don't hold a lock on this object, we hold a lock
3379 * on the top object in the chain. To prevent a deadlock, we
3380 * can't allow the pmap layer to block.
3381 */
3382 pmap_options |= PMAP_OPTIONS_NOWAIT;
3383 }
3384 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3385 fault_page_size, fault_phys_offset,
3386 m, prot, caller_prot, fault_type, wired, pmap_options);
3387 if (kr == KERN_RESOURCE_SHORTAGE) {
3388 if (need_retry) {
3389 /*
3390 * There's nothing we can do here since we hold the
3391 * lock on the top object in the chain. The caller
3392 * will need to deal with this by dropping that lock and retrying.
3393 */
3394 *need_retry = TRUE;
3395 vm_pmap_enter_retried++;
3396 }
3397 }
3398 return kr;
3399 }
3400
3401 /*
3402 * Enter the given page into the pmap.
3403 * The vm map must be locked shared.
3404 * The vm object must be locked exclusive, unless this is a soft fault.
3405 * For a soft fault, the object must be locked shared or exclusive.
3406 *
3407 * @param need_retry if not null, avoid making a (potentially) blocking call into
3408 * the pmap layer. When such a call would be necessary, return true in this boolean instead.
3409 */
3410 static kern_return_t
3411 vm_fault_pmap_enter_with_object_lock(
3412 vm_object_t object,
3413 pmap_t pmap,
3414 vm_map_offset_t vaddr,
3415 vm_map_size_t fault_page_size,
3416 vm_map_offset_t fault_phys_offset,
3417 vm_page_t m,
3418 vm_prot_t *prot,
3419 vm_prot_t caller_prot,
3420 vm_prot_t fault_type,
3421 bool wired,
3422 int pmap_options,
3423 boolean_t *need_retry)
3424 {
3425 kern_return_t kr;
3426 /*
3427 * Prevent a deadlock by not
3428 * holding the object lock if we need to wait for a page in
3429 * pmap_enter() - <rdar://problem/7138958>
3430 */
3431 kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3432 fault_page_size, fault_phys_offset,
3433 m, prot, caller_prot, fault_type, wired, pmap_options | PMAP_OPTIONS_NOWAIT);
3434 #if __x86_64__
3435 if (kr == KERN_INVALID_ARGUMENT &&
3436 pmap == PMAP_NULL &&
3437 wired) {
3438 /*
3439 * Wiring a page in a pmap-less VM map:
3440 * VMware's "vmmon" kernel extension does this
3441 * to grab pages.
3442 * Let it proceed even though the PMAP_ENTER() failed.
3443 */
3444 kr = KERN_SUCCESS;
3445 }
3446 #endif /* __x86_64__ */
3447
3448 if (kr == KERN_RESOURCE_SHORTAGE) {
3449 if (need_retry) {
3450 /*
3451 * this will be non-null in the case where we hold the lock
3452 * on the top-object in this chain... we can't just drop
3453 * the lock on the object we're inserting the page into
3454 * and recall the PMAP_ENTER since we can still cause
3455 * a deadlock if one of the critical paths tries to
3456 * acquire the lock on the top-object and we're blocked
3457 * in PMAP_ENTER waiting for memory... our only recourse
3458 * is to deal with it at a higher level where we can
3459 * drop both locks.
3460 */
3461 *need_retry = TRUE;
3462 vm_pmap_enter_retried++;
3463 goto done;
3464 }
3465 /*
3466 * The nonblocking version of pmap_enter did not succeed,
3467 * and we don't need to drop other locks and retry
3468 * at the level above us, so
3469 * use the blocking version instead. This requires marking
3470 * the page busy and unlocking the object.
3471 */
3472 boolean_t was_busy = m->vmp_busy;
3473
3474 vm_object_lock_assert_exclusive(object);
3475
3476 m->vmp_busy = TRUE;
3477 vm_object_unlock(object);
3478
3479 PMAP_ENTER_OPTIONS(pmap, vaddr,
3480 fault_phys_offset,
3481 m, *prot, fault_type,
3482 0, wired,
3483 pmap_options, kr);
3484
3485 assert(VM_PAGE_OBJECT(m) == object);
3486
3487 /* Take the object lock again. */
3488 vm_object_lock(object);
3489
3490 /* If the page was busy, someone else will wake it up.
3491 * Otherwise, we have to do it now. */
3492 assert(m->vmp_busy);
3493 if (!was_busy) {
3494 PAGE_WAKEUP_DONE(m);
3495 }
3496 vm_pmap_enter_blocked++;
3497 }
3498
3499 done:
3500 return kr;
3501 }
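
/*
 * Editor's sketch: the function above first attempts a non-blocking insert
 * while holding the object lock, and only if that fails does it mark the
 * page busy, drop the lock, and fall back to the blocking path.  A minimal
 * user-space model of that "try fast, then drop the lock and block" pattern
 * is shown below; all of the names are hypothetical stand-ins.
 */
#if 0 /* illustrative sketch only -- not part of the kernel build */
#include <pthread.h>
#include <stdbool.h>

struct sketch_item {
	bool busy;
};

static bool
sketch_fast_insert_nowait(struct sketch_item *it)
{
	(void)it;
	return false;           /* pretend we hit a resource shortage */
}

static void
sketch_slow_insert(struct sketch_item *it)
{
	(void)it;               /* the blocking path could sleep for memory here */
}

static void
sketch_insert_with_lock(pthread_mutex_t *object_lock, struct sketch_item *it)
{
	/* caller holds object_lock on entry and on return */
	if (sketch_fast_insert_nowait(it)) {
		return;
	}

	/*
	 * The non-blocking attempt failed: mark the item busy so nobody else
	 * touches it, drop the object lock, and use the blocking path.
	 */
	bool was_busy = it->busy;
	it->busy = true;
	pthread_mutex_unlock(object_lock);

	sketch_slow_insert(it);

	pthread_mutex_lock(object_lock);
	if (!was_busy) {
		it->busy = false;       /* the real code also wakes any waiters */
	}
}
#endif /* illustrative sketch */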
3502
3503 /*
3504 * Prepare to enter a page into the pmap by checking CS, protection bits,
3505 * and setting mapped bits on the page_t.
3506 * Does not modify the page's paging queue.
3507 *
3508 * page queue lock must NOT be held
3509 * m->vmp_object must be locked
3510 *
3511 * NOTE: m->vmp_object could be locked "shared" only if we are called
3512 * from vm_fault() as part of a soft fault.
3513 */
3514 static kern_return_t
3515 vm_fault_enter_prepare(
3516 vm_page_t m,
3517 pmap_t pmap,
3518 vm_map_offset_t vaddr,
3519 vm_prot_t *prot,
3520 vm_prot_t caller_prot,
3521 vm_map_size_t fault_page_size,
3522 vm_map_offset_t fault_phys_offset,
3523 boolean_t change_wiring,
3524 vm_prot_t fault_type,
3525 vm_object_fault_info_t fault_info,
3526 int *type_of_fault,
3527 bool *page_needs_data_sync)
3528 {
3529 kern_return_t kr;
3530 bool is_tainted = false;
3531 vm_object_t object;
3532 boolean_t cs_bypass = fault_info->cs_bypass;
3533
3534 object = VM_PAGE_OBJECT(m);
3535
3536 vm_object_lock_assert_held(object);
3537
3538 #if KASAN
3539 if (pmap == kernel_pmap) {
3540 kasan_notify_address(vaddr, PAGE_SIZE);
3541 }
3542 #endif
3543 #if PMAP_CS
3544 if (pmap_cs_exempt(pmap)) {
3545 cs_bypass = TRUE;
3546 }
3547 #endif
3548
3549 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3550
3551 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3552 vm_object_lock_assert_exclusive(object);
3553 } else if ((fault_type & VM_PROT_WRITE) == 0 &&
3554 !change_wiring &&
3555 (!m->vmp_wpmapped
3556 #if VM_OBJECT_ACCESS_TRACKING
3557 || object->access_tracking
3558 #endif /* VM_OBJECT_ACCESS_TRACKING */
3559 )) {
3560 /*
3561 * This is not a "write" fault, so we
3562 * might not have taken the object lock
3563 * exclusively and we might not be able
3564 * to update the "wpmapped" bit in
3565 * vm_fault_enter().
3566 * Let's just grant read access to
3567 * the page for now and we'll
3568 * soft-fault again if we need write
3569 * access later...
3570 */
3571
3572 /* This had better not be a JIT page. */
3573 if (!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot)) {
3574 *prot &= ~VM_PROT_WRITE;
3575 } else {
3576 assert(cs_bypass);
3577 }
3578 }
3579 if (m->vmp_pmapped == FALSE) {
3580 if (m->vmp_clustered) {
3581 if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
3582 /*
3583 * found it in the cache, but this
3584 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
3585 * so it must have come in as part of
3586 * a cluster... account 1 pagein against it
3587 */
3588 if (object->internal) {
3589 *type_of_fault = DBG_PAGEIND_FAULT;
3590 } else {
3591 *type_of_fault = DBG_PAGEINV_FAULT;
3592 }
3593
3594 VM_PAGE_COUNT_AS_PAGEIN(m);
3595 }
3596 VM_PAGE_CONSUME_CLUSTERED(m);
3597 }
3598 }
3599
3600 if (*type_of_fault != DBG_COW_FAULT) {
3601 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
3602
3603 if (pmap == kernel_pmap) {
3604 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
3605 }
3606 }
3607
3608 kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
3609 *prot, caller_prot, fault_page_size, fault_phys_offset,
3610 fault_info, &is_tainted);
3611 if (kr == KERN_SUCCESS) {
3612 /*
3613 * We either have a good page, or a tainted page that has been accepted by the process.
3614 * In both cases the page will be entered into the pmap.
3615 */
3616 *page_needs_data_sync = vm_fault_enter_set_mapped(object, m, *prot, fault_type);
3617 if ((fault_type & VM_PROT_WRITE) && is_tainted) {
3618 /*
3619 * This page is tainted but we're inserting it anyways.
3620 * Since it's writeable, we need to disconnect it from other pmaps
3621 * now so those processes can take note.
3622 */
3623
3624 /*
3625 * We can only get here
3626 * because of the CSE logic
3627 */
3628 assert(pmap_get_vm_map_cs_enforced(pmap));
3629 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3630 /*
3631 * If we are faulting for a write, we can clear
3632 * the execute bit - that will ensure the page is
3633 * checked again before being executable, which
3634 * protects against a map switch.
3635 * This only happens the first time the page
3636 * gets tainted, so we won't get stuck here
3637 * to make an already writeable page executable.
3638 */
3639 if (!cs_bypass) {
3640 assert(!pmap_has_prot_policy(pmap, fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, *prot));
3641 *prot &= ~VM_PROT_EXECUTE;
3642 }
3643 }
3644 assert(VM_PAGE_OBJECT(m) == object);
3645
3646 #if VM_OBJECT_ACCESS_TRACKING
3647 if (object->access_tracking) {
3648 DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3649 if (fault_type & VM_PROT_WRITE) {
3650 object->access_tracking_writes++;
3651 vm_object_access_tracking_writes++;
3652 } else {
3653 object->access_tracking_reads++;
3654 vm_object_access_tracking_reads++;
3655 }
3656 }
3657 #endif /* VM_OBJECT_ACCESS_TRACKING */
3658 }
3659
3660 return kr;
3661 }
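
/*
 * On KERN_SUCCESS from vm_fault_enter_prepare(), the page has passed
 * (or been allowed to bypass) code-signing validation and its mapped
 * state has been recorded via vm_fault_enter_set_mapped(), but it has
 * not been entered in the pmap yet.  The caller is still expected to
 * enqueue the page and perform the pmap enter (preceded by
 * pmap_sync_page_data_phys() if *page_needs_data_sync was set), as
 * vm_fault_enter() below and the zero-fill fast path both do.
 */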
3662
3663 /*
3664 * page queue lock must NOT be held
3665 * m->vmp_object must be locked
3666 *
3667 * NOTE: m->vmp_object could be locked "shared" only if we are called
3668 * from vm_fault() as part of a soft fault. If so, we must be
3669 * careful not to modify the VM object in any way that is not
3670 * legal under a shared lock...
3671 */
3672 kern_return_t
3673 vm_fault_enter(
3674 vm_page_t m,
3675 pmap_t pmap,
3676 vm_map_offset_t vaddr,
3677 vm_map_size_t fault_page_size,
3678 vm_map_offset_t fault_phys_offset,
3679 vm_prot_t prot,
3680 vm_prot_t caller_prot,
3681 boolean_t wired,
3682 boolean_t change_wiring,
3683 vm_tag_t wire_tag,
3684 vm_object_fault_info_t fault_info,
3685 boolean_t *need_retry,
3686 int *type_of_fault)
3687 {
3688 kern_return_t kr;
3689 vm_object_t object;
3690 bool page_needs_data_sync;
3691 vm_prot_t fault_type;
3692 int pmap_options = fault_info->pmap_options;
3693
3694 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3695 assert(m->vmp_fictitious);
3696 return KERN_SUCCESS;
3697 }
3698
3699 fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
3700
3701 kr = vm_fault_enter_prepare(m, pmap, vaddr, &prot, caller_prot,
3702 fault_page_size, fault_phys_offset, change_wiring, fault_type,
3703 fault_info, type_of_fault, &page_needs_data_sync);
3704 object = VM_PAGE_OBJECT(m);
3705
3706 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info->no_cache, type_of_fault, kr);
3707
3708 if (kr == KERN_SUCCESS) {
3709 if (page_needs_data_sync) {
3710 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
3711 }
3712
3713 kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
3714 fault_page_size, fault_phys_offset, m,
3715 &prot, caller_prot, fault_type, wired, pmap_options, need_retry);
3716 }
3717
3718 return kr;
3719 }
3720
3721 void
3722 vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3723 {
3724 if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3725 vm_fault(current_map(), /* map */
3726 vaddr, /* vaddr */
3727 prot, /* fault_type */
3728 FALSE, /* change_wiring */
3729 VM_KERN_MEMORY_NONE, /* tag - not wiring */
3730 THREAD_UNINT, /* interruptible */
3731 NULL, /* caller_pmap */
3732 0 /* caller_pmap_addr */);
3733 }
3734 }
3735
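/*
 * Illustrative sketch only (not part of the original source):
 * "vm_prefault_range_example" is a hypothetical helper shown here to
 * demonstrate how vm_pre_fault() above can be used to warm up a range
 * of the current map, one page at a time, before it is accessed.
 */
#if 0 /* example only */
static void
vm_prefault_range_example(
	vm_map_offset_t start,
	vm_map_size_t   size,
	vm_prot_t       prot)
{
	vm_map_t        map = current_map();
	vm_map_offset_t end = start + size;
	vm_map_offset_t va;

	for (va = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
	    va < end;
	    va += VM_MAP_PAGE_SIZE(map)) {
		/* vm_pre_fault() is a no-op if a translation already exists */
		vm_pre_fault(va, prot);
	}
}
#endif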
3736
3737 /*
3738 * Routine: vm_fault
3739 * Purpose:
3740 * Handle page faults, including pseudo-faults
3741 * used to change the wiring status of pages.
3742 * Returns:
3743 * Explicit continuations have been removed.
3744 * Implementation:
3745 * vm_fault and vm_fault_page save mucho state
3746 * in the moral equivalent of a closure. The state
3747 * structure is allocated when first entering vm_fault
3748 * and deallocated when leaving vm_fault.
3749 */
3750
3751 extern uint64_t get_current_unique_pid(void);
3752
3753 unsigned long vm_fault_collapse_total = 0;
3754 unsigned long vm_fault_collapse_skipped = 0;
3755
3756
3757 kern_return_t
3758 vm_fault_external(
3759 vm_map_t map,
3760 vm_map_offset_t vaddr,
3761 vm_prot_t fault_type,
3762 boolean_t change_wiring,
3763 int interruptible,
3764 pmap_t caller_pmap,
3765 vm_map_offset_t caller_pmap_addr)
3766 {
3767 return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3768 change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3769 interruptible, caller_pmap, caller_pmap_addr,
3770 NULL);
3771 }
3772
3773 kern_return_t
3774 vm_fault(
3775 vm_map_t map,
3776 vm_map_offset_t vaddr,
3777 vm_prot_t fault_type,
3778 boolean_t change_wiring,
3779 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3780 int interruptible,
3781 pmap_t caller_pmap,
3782 vm_map_offset_t caller_pmap_addr)
3783 {
3784 return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3785 interruptible, caller_pmap, caller_pmap_addr,
3786 NULL);
3787 }
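
/*
 * vm_fault_external() and vm_fault() are thin wrappers around
 * vm_fault_internal() and differ only in how the wiring tag is chosen:
 * the external entry point derives it from the caller's backtrace via
 * vm_tag_bt() when wiring, while vm_fault() requires the caller to
 * pass wire_tag explicitly (VM_KERN_MEMORY_NONE when not wiring).
 */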
3788
3789 static boolean_t
3790 current_proc_is_privileged(void)
3791 {
3792 return csproc_get_platform_binary(current_proc());
3793 }
3794
3795 uint64_t vm_copied_on_read = 0;
3796
3797 /*
3798 * Cleanup after a vm_fault_enter.
3799 * At this point, the fault should either have failed (kr != KERN_SUCCESS)
3800 * or the page should be in the pmap and on the correct paging queue.
3801 *
3802 * Precondition:
3803 * map must be locked shared.
3804 * m_object must be locked.
3805 * If top_object != VM_OBJECT_NULL, it must be locked.
3806 * real_map must be locked.
3807 *
3808 * Postcondition:
3809 * map will be unlocked
3810 * m_object will be unlocked
3811 * top_object will be unlocked
3812 * If real_map != map, it will be unlocked
3813 */
3814 static void
3815 vm_fault_complete(
3816 vm_map_t map,
3817 vm_map_t real_map,
3818 vm_object_t object,
3819 vm_object_t m_object,
3820 vm_page_t m,
3821 vm_map_offset_t offset,
3822 vm_map_offset_t trace_real_vaddr,
3823 vm_object_fault_info_t fault_info,
3824 vm_prot_t caller_prot,
3825 #if CONFIG_DTRACE
3826 vm_map_offset_t real_vaddr,
3827 #else
3828 __unused vm_map_offset_t real_vaddr,
3829 #endif /* CONFIG_DTRACE */
3830 int type_of_fault,
3831 boolean_t need_retry,
3832 kern_return_t kr,
3833 ppnum_t *physpage_p,
3834 vm_prot_t prot,
3835 vm_object_t top_object,
3836 boolean_t need_collapse,
3837 vm_map_offset_t cur_offset,
3838 vm_prot_t fault_type,
3839 vm_object_t *written_on_object,
3840 memory_object_t *written_on_pager,
3841 vm_object_offset_t *written_on_offset)
3842 {
3843 int event_code = 0;
3844 vm_map_lock_assert_shared(map);
3845 vm_object_lock_assert_held(m_object);
3846 if (top_object != VM_OBJECT_NULL) {
3847 vm_object_lock_assert_held(top_object);
3848 }
3849 vm_map_lock_assert_held(real_map);
3850
3851 if (m_object->internal) {
3852 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3853 } else if (m_object->object_is_shared_cache) {
3854 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3855 } else {
3856 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3857 }
3858
3859 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info->user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
3860 if (need_retry == FALSE) {
3861 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid(), 0, 0, 0, 0);
3862 }
3863 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
3864 if (kr == KERN_SUCCESS &&
3865 physpage_p != NULL) {
3866 /* for vm_map_wire_and_extract() */
3867 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3868 if (prot & VM_PROT_WRITE) {
3869 vm_object_lock_assert_exclusive(m_object);
3870 m->vmp_dirty = TRUE;
3871 }
3872 }
3873
3874 if (top_object != VM_OBJECT_NULL) {
3875 /*
3876 * It's safe to drop the top object
3877 * now that we've done our
3878 * vm_fault_enter(). Any other fault
3879 * in progress for that virtual
3880 * address will either find our page
3881 * and translation or put in a new page
3882 * and translation.
3883 */
3884 vm_object_unlock(top_object);
3885 top_object = VM_OBJECT_NULL;
3886 }
3887
3888 if (need_collapse == TRUE) {
3889 vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
3890 }
3891
3892 if (need_retry == FALSE &&
3893 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3894 /*
3895 * evaluate access pattern and update state
3896 * vm_fault_deactivate_behind depends on the
3897 * state being up to date
3898 */
3899 vm_fault_is_sequential(m_object, cur_offset, fault_info->behavior);
3900
3901 vm_fault_deactivate_behind(m_object, cur_offset, fault_info->behavior);
3902 }
3903 /*
3904 * That's it, clean up and return.
3905 */
3906 if (m->vmp_busy) {
3907 vm_object_lock_assert_exclusive(m_object);
3908 PAGE_WAKEUP_DONE(m);
3909 }
3910
3911 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
3912 vm_object_paging_begin(m_object);
3913
3914 assert(*written_on_object == VM_OBJECT_NULL);
3915 *written_on_object = m_object;
3916 *written_on_pager = m_object->pager;
3917 *written_on_offset = m_object->paging_offset + m->vmp_offset;
3918 }
3919 vm_object_unlock(object);
3920
3921 vm_map_unlock_read(map);
3922 if (real_map != map) {
3923 vm_map_unlock(real_map);
3924 }
3925 }
3926
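/*
 * For tracing only: a COW fault that was forced purely to satisfy a
 * copy-on-read (see the vm_protect_privileged_from_untrusted handling
 * in vm_fault_internal) is reported as DBG_COR_FAULT rather than
 * DBG_COW_FAULT.
 */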
3927 static inline int
3928 vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
3929 {
3930 if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
3931 return DBG_COR_FAULT;
3932 }
3933 return type_of_fault;
3934 }
3935
3936 kern_return_t
3937 vm_fault_internal(
3938 vm_map_t map,
3939 vm_map_offset_t vaddr,
3940 vm_prot_t caller_prot,
3941 boolean_t change_wiring,
3942 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3943 int interruptible,
3944 pmap_t caller_pmap,
3945 vm_map_offset_t caller_pmap_addr,
3946 ppnum_t *physpage_p)
3947 {
3948 vm_map_version_t version; /* Map version for verification */
3949 boolean_t wired; /* Should mapping be wired down? */
3950 vm_object_t object; /* Top-level object */
3951 vm_object_offset_t offset; /* Top-level offset */
3952 vm_prot_t prot; /* Protection for mapping */
3953 vm_object_t old_copy_object; /* Saved copy object */
3954 vm_page_t result_page; /* Result of vm_fault_page */
3955 vm_page_t top_page; /* Placeholder page */
3956 kern_return_t kr;
3957
3958 vm_page_t m; /* Fast access to result_page */
3959 kern_return_t error_code;
3960 vm_object_t cur_object;
3961 vm_object_t m_object = NULL;
3962 vm_object_offset_t cur_offset;
3963 vm_page_t cur_m;
3964 vm_object_t new_object;
3965 int type_of_fault;
3966 pmap_t pmap;
3967 wait_interrupt_t interruptible_state;
3968 vm_map_t real_map = map;
3969 vm_map_t original_map = map;
3970 bool object_locks_dropped = FALSE;
3971 vm_prot_t fault_type;
3972 vm_prot_t original_fault_type;
3973 struct vm_object_fault_info fault_info = {};
3974 bool need_collapse = FALSE;
3975 boolean_t need_retry = FALSE;
3976 boolean_t *need_retry_ptr = NULL;
3977 uint8_t object_lock_type = 0;
3978 uint8_t cur_object_lock_type;
3979 vm_object_t top_object = VM_OBJECT_NULL;
3980 vm_object_t written_on_object = VM_OBJECT_NULL;
3981 memory_object_t written_on_pager = NULL;
3982 vm_object_offset_t written_on_offset = 0;
3983 int throttle_delay;
3984 int compressed_count_delta;
3985 uint8_t grab_options;
3986 bool need_copy;
3987 bool need_copy_on_read;
3988 vm_map_offset_t trace_vaddr;
3989 vm_map_offset_t trace_real_vaddr;
3990 vm_map_size_t fault_page_size;
3991 vm_map_size_t fault_page_mask;
3992 vm_map_offset_t fault_phys_offset;
3993 vm_map_offset_t real_vaddr;
3994 bool resilient_media_retry = FALSE;
3995 vm_object_t resilient_media_object = VM_OBJECT_NULL;
3996 vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-1;
3997 bool page_needs_data_sync = false;
3998 /*
3999 * Was the VM object contended when vm_map_lookup_locked locked it?
4000 * If so, the zero fill path will drop the lock
4001 * NB: Ideally we would always drop the lock rather than rely on
4002 * this heuristic, but vm_object_unlock currently takes > 30 cycles.
4003 */
4004 bool object_is_contended = false;
4005
4006 real_vaddr = vaddr;
4007 trace_real_vaddr = vaddr;
4008
4009 if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4010 fault_phys_offset = (vm_map_offset_t)-1;
4011 fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4012 fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4013 if (fault_page_size < PAGE_SIZE) {
4014 DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4015 vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4016 }
4017 } else {
4018 fault_phys_offset = 0;
4019 fault_page_size = PAGE_SIZE;
4020 fault_page_mask = PAGE_MASK;
4021 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4022 }
4023
4024 if (map == kernel_map) {
4025 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4026 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4027 } else {
4028 trace_vaddr = vaddr;
4029 }
4030
4031 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4032 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
4033 ((uint64_t)trace_vaddr >> 32),
4034 trace_vaddr,
4035 (map == kernel_map),
4036 0,
4037 0);
4038
4039 if (get_preemption_level() != 0) {
4040 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4041 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4042 ((uint64_t)trace_vaddr >> 32),
4043 trace_vaddr,
4044 KERN_FAILURE,
4045 0,
4046 0);
4047
4048 return KERN_FAILURE;
4049 }
4050
4051 thread_t cthread = current_thread();
4052 bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4053 uint64_t fstart = 0;
4054
4055 if (rtfault) {
4056 fstart = mach_continuous_time();
4057 }
4058
4059 interruptible_state = thread_interrupt_level(interruptible);
4060
4061 fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4062
4063 VM_STAT_INCR(faults);
4064 current_task()->faults++;
4065 original_fault_type = fault_type;
4066
4067 need_copy = FALSE;
4068 if (fault_type & VM_PROT_WRITE) {
4069 need_copy = TRUE;
4070 }
4071
4072 if (need_copy || change_wiring) {
4073 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4074 } else {
4075 object_lock_type = OBJECT_LOCK_SHARED;
4076 }
4077
4078 cur_object_lock_type = OBJECT_LOCK_SHARED;
4079
4080 if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4081 if (compressor_map) {
4082 if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4083 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
4084 }
4085 }
4086 }
4087 RetryFault:
4088 assert(written_on_object == VM_OBJECT_NULL);
4089
4090 /*
4091 * assume we will hit a page in the cache
4092 * otherwise, explicitly override with
4093 * the real fault type once we determine it
4094 */
4095 type_of_fault = DBG_CACHE_HIT_FAULT;
4096
4097 /*
4098 * Find the backing store object and offset into
4099 * it to begin the search.
4100 */
4101 fault_type = original_fault_type;
4102 map = original_map;
4103 vm_map_lock_read(map);
4104
4105 if (resilient_media_retry) {
4106 /*
4107 * If we have to insert a fake zero-filled page to hide
4108 * a media failure to provide the real page, we need to
4109 * resolve any pending copy-on-write on this mapping.
4110 * VM_PROT_COPY tells vm_map_lookup_locked() to deal
4111 * with that even if this is not a "write" fault.
4112 */
4113 need_copy = TRUE;
4114 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4115 }
4116
4117 kr = vm_map_lookup_locked(&map, vaddr,
4118 (fault_type | (need_copy ? VM_PROT_COPY : 0)),
4119 object_lock_type, &version,
4120 &object, &offset, &prot, &wired,
4121 &fault_info,
4122 &real_map,
4123 &object_is_contended);
4124
4125 if (kr != KERN_SUCCESS) {
4126 vm_map_unlock_read(map);
4127 goto done;
4128 }
4129
4130
4131 pmap = real_map->pmap;
4132 fault_info.interruptible = interruptible;
4133 fault_info.stealth = FALSE;
4134 fault_info.io_sync = FALSE;
4135 fault_info.mark_zf_absent = FALSE;
4136 fault_info.batch_pmap_op = FALSE;
4137
4138 if (resilient_media_retry) {
4139 /*
4140 * We're retrying this fault after having detected a media
4141 * failure from a "resilient_media" mapping.
4142 * Check that the mapping is still pointing at the object
4143 * that just failed to provide a page.
4144 */
4145 assert(resilient_media_object != VM_OBJECT_NULL);
4146 assert(resilient_media_offset != (vm_object_offset_t)-1);
4147 if (object != VM_OBJECT_NULL &&
4148 object == resilient_media_object &&
4149 offset == resilient_media_offset &&
4150 fault_info.resilient_media) {
4151 /*
4152 * This mapping still points at the same object
4153 * and is still "resilient_media": proceed in
4154 * "recovery-from-media-failure" mode, where we'll
4155 * insert a zero-filled page in the top object.
4156 */
4157 // printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4158 } else {
4159 /* not recovering: reset state */
4160 // printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4161 resilient_media_retry = FALSE;
4162 /* release our extra reference on failed object */
4163 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4164 vm_object_deallocate(resilient_media_object);
4165 resilient_media_object = VM_OBJECT_NULL;
4166 resilient_media_offset = (vm_object_offset_t)-1;
4167 }
4168 } else {
4169 assert(resilient_media_object == VM_OBJECT_NULL);
4170 resilient_media_offset = (vm_object_offset_t)-1;
4171 }
4172
4173 /*
4174 * If the page is wired, we must fault for the current protection
4175 * value, to avoid further faults.
4176 */
4177 if (wired) {
4178 fault_type = prot | VM_PROT_WRITE;
4179 }
4180 if (wired || need_copy) {
4181 /*
4182 * since we're treating this fault as a 'write'
4183 * we must hold the top object lock exclusively
4184 */
4185 if (object_lock_type == OBJECT_LOCK_SHARED) {
4186 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4187
4188 if (vm_object_lock_upgrade(object) == FALSE) {
4189 /*
4190 * couldn't upgrade, so explicitly
4191 * take the lock exclusively
4192 */
4193 vm_object_lock(object);
4194 }
4195 }
4196 }
4197
4198 #if VM_FAULT_CLASSIFY
4199 /*
4200 * Temporary data gathering code
4201 */
4202 vm_fault_classify(object, offset, fault_type);
4203 #endif
4204 /*
4205 * Fast fault code. The basic idea is to do as much as
4206 * possible while holding the map lock and object locks.
4207 * Busy pages are not used until the object lock has to
4208 * be dropped to do something (copy, zero fill, pmap enter).
4209 * Similarly, paging references aren't acquired until that
4210 * point, and object references aren't used.
4211 *
4212 * If we can figure out what to do
4213 * (zero fill, copy on write, pmap enter) while holding
4214 * the locks, then it gets done. Otherwise, we give up,
4215 * and use the original fault path (which doesn't hold
4216 * the map lock, and relies on busy pages).
4217 * The give up cases include:
4218 * - Have to talk to pager.
4219 * - Page is busy, absent or in error.
4220 * - Pager has locked out desired access.
4221 * - Fault needs to be restarted.
4222 * - Have to push page into copy object.
4223 *
4224 * The code is an infinite loop that moves one level down
4225 * the shadow chain each time. cur_object and cur_offset
4226 * refer to the current object being examined. object and offset
4227 * are the original object from the map. The loop is at the
4228 * top level if and only if object and cur_object are the same.
4229 *
4230 * Invariants: Map lock is held throughout. Lock is held on
4231 * original object and cur_object (if different) when
4232 * continuing or exiting loop.
4233 *
4234 */
4235
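/*
 * Condensed sketch of the fast path below; lock upgrades, code-signing
 * validation and various "give up" checks are omitted.  Any case the
 * fast path cannot handle breaks out of the loop and falls through to
 * the slow path (vm_fault_page) further down.
 *
 *	cur_object = object; cur_offset = offset;
 *	while (TRUE) {
 *		m = vm_page_lookup(cur_object, cur_offset);
 *		if (m != VM_PAGE_NULL) {
 *			read fault, or top level without a copy object:
 *			    FastPmapEnter;
 *			otherwise copy the page up into "object" (COW)
 *			    and FastPmapEnter;
 *		} else if (the compressor has the page) {
 *			decompress into "object" or "cur_object",
 *			    then FastPmapEnter;
 *		} else if (cur_object->shadow == VM_OBJECT_NULL) {
 *			zero-fill a new page in "object" and enter it;
 *		} else {
 *			cur_offset += cur_object->vo_shadow_offset;
 *			cur_object = cur_object->shadow;
 *			continue;
 *		}
 *	}
 */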
4236 #if defined(__arm64__)
4237 /*
4238 * Fail if reading an execute-only page in a
4239 * pmap that enforces execute-only protection.
4240 */
4241 if (fault_type == VM_PROT_READ &&
4242 (prot & VM_PROT_EXECUTE) &&
4243 !(prot & VM_PROT_READ) &&
4244 pmap_enforces_execute_only(pmap)) {
4245 vm_object_unlock(object);
4246 vm_map_unlock_read(map);
4247 if (real_map != map) {
4248 vm_map_unlock(real_map);
4249 }
4250 kr = KERN_PROTECTION_FAILURE;
4251 goto done;
4252 }
4253 #endif
4254
4255 fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4256
4257 /*
4258 * If this page is to be inserted in a copy delay object
4259 * for writing, and if the object has a copy, then the
4260 * copy delay strategy is implemented in the slow fault path.
4261 */
4262 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
4263 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4264 goto handle_copy_delay;
4265 }
4266
4267 cur_object = object;
4268 cur_offset = offset;
4269
4270 grab_options = 0;
4271 #if CONFIG_SECLUDED_MEMORY
4272 if (object->can_grab_secluded) {
4273 grab_options |= VM_PAGE_GRAB_SECLUDED;
4274 }
4275 #endif /* CONFIG_SECLUDED_MEMORY */
4276
4277 while (TRUE) {
4278 if (!cur_object->pager_created &&
4279 cur_object->phys_contiguous) { /* superpage */
4280 break;
4281 }
4282
4283 if (cur_object->blocked_access) {
4284 /*
4285 * Access to this VM object has been blocked.
4286 * Let the slow path handle it.
4287 */
4288 break;
4289 }
4290
4291 m = vm_page_lookup(cur_object, vm_object_trunc_page(cur_offset));
4292 m_object = NULL;
4293
4294 if (m != VM_PAGE_NULL) {
4295 m_object = cur_object;
4296
4297 if (m->vmp_busy) {
4298 wait_result_t result;
4299
4300 /*
4301 * in order to do the PAGE_ASSERT_WAIT, we must
4302 * have object that 'm' belongs to locked exclusively
4303 */
4304 if (object != cur_object) {
4305 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4306 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4307
4308 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4309 /*
4310 * couldn't upgrade so go do a full retry
4311 * immediately since we can no longer be
4312 * certain about cur_object (since we
4313 * don't hold a reference on it)...
4314 * first drop the top object lock
4315 */
4316 vm_object_unlock(object);
4317
4318 vm_map_unlock_read(map);
4319 if (real_map != map) {
4320 vm_map_unlock(real_map);
4321 }
4322
4323 goto RetryFault;
4324 }
4325 }
4326 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4327 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4328
4329 if (vm_object_lock_upgrade(object) == FALSE) {
4330 /*
4331 * couldn't upgrade, so explicitly take the lock
4332 * exclusively and go relookup the page since we
4333 * will have dropped the object lock and
4334 * a different thread could have inserted
4335 * a page at this offset
4336 * no need for a full retry since we're
4337 * at the top level of the object chain
4338 */
4339 vm_object_lock(object);
4340
4341 continue;
4342 }
4343 }
4344 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4345 /*
4346 * m->vmp_busy == TRUE and the object is locked exclusively
4347 * if m->vmp_q_state is still VM_PAGE_ON_PAGEOUT_Q after we acquire the
4348 * queues lock, we are guaranteed that it is stable on
4349 * the pageout queue and therefore reclaimable
4350 *
4351 * NOTE: this is only true for the internal pageout queue
4352 * in the compressor world
4353 */
4354 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4355
4356 vm_page_lock_queues();
4357
4358 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4359 vm_pageout_throttle_up(m);
4360 vm_page_unlock_queues();
4361
4362 PAGE_WAKEUP_DONE(m);
4363 goto reclaimed_from_pageout;
4364 }
4365 vm_page_unlock_queues();
4366 }
4367 if (object != cur_object) {
4368 vm_object_unlock(object);
4369 }
4370
4371 vm_map_unlock_read(map);
4372 if (real_map != map) {
4373 vm_map_unlock(real_map);
4374 }
4375
4376 result = PAGE_ASSERT_WAIT(m, interruptible);
4377
4378 vm_object_unlock(cur_object);
4379
4380 if (result == THREAD_WAITING) {
4381 result = thread_block(THREAD_CONTINUE_NULL);
4382
4383 counter(c_vm_fault_page_block_busy_kernel++);
4384 }
4385 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
4386 goto RetryFault;
4387 }
4388
4389 kr = KERN_ABORTED;
4390 goto done;
4391 }
4392 reclaimed_from_pageout:
4393 if (m->vmp_laundry) {
4394 if (object != cur_object) {
4395 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4396 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4397
4398 vm_object_unlock(object);
4399 vm_object_unlock(cur_object);
4400
4401 vm_map_unlock_read(map);
4402 if (real_map != map) {
4403 vm_map_unlock(real_map);
4404 }
4405
4406 goto RetryFault;
4407 }
4408 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4409 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4410
4411 if (vm_object_lock_upgrade(object) == FALSE) {
4412 /*
4413 * couldn't upgrade, so explicitly take the lock
4414 * exclusively and go relookup the page since we
4415 * will have dropped the object lock and
4416 * a different thread could have inserted
4417 * a page at this offset
4418 * no need for a full retry since we're
4419 * at the top level of the object chain
4420 */
4421 vm_object_lock(object);
4422
4423 continue;
4424 }
4425 }
4426 vm_pageout_steal_laundry(m, FALSE);
4427 }
4428
4429 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4430 /*
4431 * Guard page: let the slow path deal with it
4432 */
4433 break;
4434 }
4435 if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
4436 /*
4437 * Unusual case... let the slow path deal with it
4438 */
4439 break;
4440 }
4441 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4442 if (object != cur_object) {
4443 vm_object_unlock(object);
4444 }
4445 vm_map_unlock_read(map);
4446 if (real_map != map) {
4447 vm_map_unlock(real_map);
4448 }
4449 vm_object_unlock(cur_object);
4450 kr = KERN_MEMORY_ERROR;
4451 goto done;
4452 }
4453 assert(m_object == VM_PAGE_OBJECT(m));
4454
4455 if (vm_fault_cs_need_validation(map->pmap, m, m_object,
4456 PAGE_SIZE, 0) ||
4457 (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4458 upgrade_lock_and_retry:
4459 /*
4460 * We might need to validate this page
4461 * against its code signature, so we
4462 * want to hold the VM object exclusively.
4463 */
4464 if (object != cur_object) {
4465 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4466 vm_object_unlock(object);
4467 vm_object_unlock(cur_object);
4468
4469 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4470
4471 vm_map_unlock_read(map);
4472 if (real_map != map) {
4473 vm_map_unlock(real_map);
4474 }
4475
4476 goto RetryFault;
4477 }
4478 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4479 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4480
4481 if (vm_object_lock_upgrade(object) == FALSE) {
4482 /*
4483 * couldn't upgrade, so explicitly take the lock
4484 * exclusively and go relookup the page since we
4485 * will have dropped the object lock and
4486 * a different thread could have inserted
4487 * a page at this offset
4488 * no need for a full retry since we're
4489 * at the top level of the object chain
4490 */
4491 vm_object_lock(object);
4492
4493 continue;
4494 }
4495 }
4496 }
4497 /*
4498 * Two cases of map in faults:
4499 * - At top level w/o copy object.
4500 * - Read fault anywhere.
4501 * --> must disallow write.
4502 */
4503
4504 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
4505 goto FastPmapEnter;
4506 }
4507
4508 if (!need_copy &&
4509 !fault_info.no_copy_on_read &&
4510 cur_object != object &&
4511 !cur_object->internal &&
4512 !cur_object->pager_trusted &&
4513 vm_protect_privileged_from_untrusted &&
4514 !((prot & VM_PROT_EXECUTE) &&
4515 cur_object->code_signed &&
4516 pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
4517 current_proc_is_privileged()) {
4518 /*
4519 * We're faulting on a page in "object" and
4520 * went down the shadow chain to "cur_object"
4521 * to find out that "cur_object"'s pager
4522 * is not "trusted", i.e. we cannot trust it
4523 * to always return the same contents.
4524 * Since the target is a "privileged" process,
4525 * let's treat this as a copy-on-read fault, as
4526 * if it was a copy-on-write fault.
4527 * Once "object" gets a copy of this page, it
4528 * won't have to rely on "cur_object" to
4529 * provide the contents again.
4530 *
4531 * This is done by setting "need_copy" and
4532 * retrying the fault from the top with the
4533 * appropriate locking.
4534 *
4535 * Special case: if the mapping is executable
4536 * and the untrusted object is code-signed and
4537 * the process is "cs_enforced", we do not
4538 * copy-on-read because that would break
4539 * code-signing enforcement expectations (an
4540 * executable page must belong to a code-signed
4541 * object) and we can rely on code-signing
4542 * to re-validate the page if it gets evicted
4543 * and paged back in.
4544 */
4545 // printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4546 vm_copied_on_read++;
4547 need_copy = TRUE;
4548
4549 vm_object_unlock(object);
4550 vm_object_unlock(cur_object);
4551 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4552 vm_map_unlock_read(map);
4553 if (real_map != map) {
4554 vm_map_unlock(real_map);
4555 }
4556 goto RetryFault;
4557 }
4558
4559 if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4560 if (!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4561 prot &= ~VM_PROT_WRITE;
4562 } else {
4563 /*
4564 * For a protection that the pmap cares
4565 * about, we must hand over the full
4566 * set of protections (so that the pmap
4567 * layer can apply any desired policy).
4568 * This means that cs_bypass must be
4569 * set, as this can force us to pass
4570 * RWX.
4571 */
4572 assert(fault_info.cs_bypass);
4573 }
4574
4575 if (object != cur_object) {
4576 /*
4577 * We still need to hold the top object
4578 * lock here to prevent a race between
4579 * a read fault (taking only "shared"
4580 * locks) and a write fault (taking
4581 * an "exclusive" lock on the top
4582 * object.
4583 * Otherwise, as soon as we release the
4584 * top lock, the write fault could
4585 * proceed and actually complete before
4586 * the read fault, and the copied page's
4587 * translation could then be overwritten
4588 * by the read fault's translation for
4589 * the original page.
4590 *
4591 * Let's just record what the top object
4592 * is and we'll release it later.
4593 */
4594 top_object = object;
4595
4596 /*
4597 * switch to the object that has the new page
4598 */
4599 object = cur_object;
4600 object_lock_type = cur_object_lock_type;
4601 }
4602 FastPmapEnter:
4603 assert(m_object == VM_PAGE_OBJECT(m));
4604
4605 /*
4606 * prepare for the pmap_enter...
4607 * object and map are both locked
4608 * m contains valid data
4609 * object == m->vmp_object
4610 * cur_object == NULL or it's been unlocked
4611 * no paging references on either object or cur_object
4612 */
4613 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4614 need_retry_ptr = &need_retry;
4615 } else {
4616 need_retry_ptr = NULL;
4617 }
4618
4619 if (fault_page_size < PAGE_SIZE) {
4620 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4621 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4622 fault_phys_offset < PAGE_SIZE),
4623 "0x%llx\n", (uint64_t)fault_phys_offset);
4624 } else {
4625 assertf(fault_phys_offset == 0,
4626 "0x%llx\n", (uint64_t)fault_phys_offset);
4627 }
4628
4629 if (caller_pmap) {
4630 kr = vm_fault_enter(m,
4631 caller_pmap,
4632 caller_pmap_addr,
4633 fault_page_size,
4634 fault_phys_offset,
4635 prot,
4636 caller_prot,
4637 wired,
4638 change_wiring,
4639 wire_tag,
4640 &fault_info,
4641 need_retry_ptr,
4642 &type_of_fault);
4643 } else {
4644 kr = vm_fault_enter(m,
4645 pmap,
4646 vaddr,
4647 fault_page_size,
4648 fault_phys_offset,
4649 prot,
4650 caller_prot,
4651 wired,
4652 change_wiring,
4653 wire_tag,
4654 &fault_info,
4655 need_retry_ptr,
4656 &type_of_fault);
4657 }
4658
4659 vm_fault_complete(
4660 map,
4661 real_map,
4662 object,
4663 m_object,
4664 m,
4665 offset,
4666 trace_real_vaddr,
4667 &fault_info,
4668 caller_prot,
4669 real_vaddr,
4670 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4671 need_retry,
4672 kr,
4673 physpage_p,
4674 prot,
4675 top_object,
4676 need_collapse,
4677 cur_offset,
4678 fault_type,
4679 &written_on_object,
4680 &written_on_pager,
4681 &written_on_offset);
4682 top_object = VM_OBJECT_NULL;
4683 if (need_retry == TRUE) {
4684 /*
4685 * vm_fault_enter couldn't complete the PMAP_ENTER...
4686 * at this point we don't hold any locks so it's safe
4687 * to ask the pmap layer to expand the page table to
4688 * accommodate this mapping... once expanded, we'll
4689 * re-drive the fault which should result in vm_fault_enter
4690 * being able to successfully enter the mapping this time around
4691 */
4692 (void)pmap_enter_options(
4693 pmap, vaddr, 0, 0, 0, 0, 0,
4694 PMAP_OPTIONS_NOENTER, NULL);
4695
4696 need_retry = FALSE;
4697 goto RetryFault;
4698 }
4699 goto done;
4700 }
4701 /*
4702 * COPY ON WRITE FAULT
4703 */
4704 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4705
4706 /*
4707 * If objects match, then
4708 * object->copy must not be NULL (else control
4709 * would be in previous code block), and we
4710 * have a potential push into the copy object
4711 * with which we can't cope with here.
4712 */
4713 if (cur_object == object) {
4714 /*
4715 * must take the slow path to
4716 * deal with the copy push
4717 */
4718 break;
4719 }
4720
4721 /*
4722 * This is now a shadow based copy on write
4723 * fault -- it requires a copy up the shadow
4724 * chain.
4725 */
4726 assert(m_object == VM_PAGE_OBJECT(m));
4727
4728 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4729 vm_fault_cs_need_validation(NULL, m, m_object,
4730 PAGE_SIZE, 0)) {
4731 goto upgrade_lock_and_retry;
4732 }
4733
4734 /*
4735 * Allocate a page in the original top level
4736 * object. Give up if allocate fails. Also
4737 * need to remember current page, as it's the
4738 * source of the copy.
4739 *
4740 * at this point we hold locks on both
4741 * object and cur_object... no need to take
4742 * paging refs or mark pages BUSY since
4743 * we don't drop either object lock until
4744 * the page has been copied and inserted
4745 */
4746 cur_m = m;
4747 m = vm_page_grab_options(grab_options);
4748 m_object = NULL;
4749
4750 if (m == VM_PAGE_NULL) {
4751 /*
4752 * no free page currently available...
4753 * must take the slow path
4754 */
4755 break;
4756 }
4757 /*
4758 * Now do the copy. Mark the source page busy...
4759 *
4760 * NOTE: This code holds the map lock across
4761 * the page copy.
4762 */
4763 vm_page_copy(cur_m, m);
4764 vm_page_insert(m, object, vm_object_trunc_page(offset));
4765 if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
4766 DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4767 }
4768 m_object = object;
4769 SET_PAGE_DIRTY(m, FALSE);
4770
4771 /*
4772 * Now cope with the source page and object
4773 */
4774 if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4775 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4776 } else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
4777 /*
4778 * We've copied the full 16K page but we're
4779 * about to call vm_fault_enter() only for
4780 * the 4K chunk we're faulting on. The other
4781 * three 4K chunks in that page could still
4782 * be pmapped in this pmap.
4783 * Since the VM object layer thinks that the
4784 * entire page has been dealt with and the
4785 * original page might no longer be needed,
4786 * it might collapse/bypass the original VM
4787 * object and free its pages, which would be
4788 * bad (and would trigger pmap_verify_free()
4789 * assertions) if the other 4K chunks are still
4790 * pmapped.
4791 */
4792 /*
4793 * XXX FBDP TODO4K: to be revisited
4794 * Technically, we need to pmap_disconnect()
4795 * only the target pmap's mappings for the 4K
4796 * chunks of this 16K VM page. If other pmaps
4797 * have PTEs on these chunks, that means that
4798 * the associated VM map must have a reference
4799 * on the VM object, so no need to worry about
4800 * those.
4801 * pmap_protect() for each 4K chunk would be
4802 * better but we'd have to check which chunks
4803 * are actually mapped before and after this
4804 * one.
4805 * A full-blown pmap_disconnect() is easier
4806 * for now but not efficient.
4807 */
4808 DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
4809 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4810 }
4811
4812 if (cur_m->vmp_clustered) {
4813 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4814 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4815 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4816 }
4817 need_collapse = TRUE;
4818
4819 if (!cur_object->internal &&
4820 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4821 /*
4822 * The object from which we've just
4823 * copied a page is most probably backed
4824 * by a vnode. We don't want to waste too
4825 * much time trying to collapse the VM objects
4826 * and create a bottleneck when several tasks
4827 * map the same file.
4828 */
4829 if (cur_object->copy == object) {
4830 /*
4831 * Shared mapping or no COW yet.
4832 * We can never collapse a copy
4833 * object into its backing object.
4834 */
4835 need_collapse = FALSE;
4836 } else if (cur_object->copy == object->shadow &&
4837 object->shadow->resident_page_count == 0) {
4838 /*
4839 * Shared mapping after a COW occurred.
4840 */
4841 need_collapse = FALSE;
4842 }
4843 }
4844 vm_object_unlock(cur_object);
4845
4846 if (need_collapse == FALSE) {
4847 vm_fault_collapse_skipped++;
4848 }
4849 vm_fault_collapse_total++;
4850
4851 type_of_fault = DBG_COW_FAULT;
4852 VM_STAT_INCR(cow_faults);
4853 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4854 current_task()->cow_faults++;
4855
4856 goto FastPmapEnter;
4857 } else {
4858 /*
4859 * No page at cur_object, cur_offset... m == NULL
4860 */
4861 if (cur_object->pager_created) {
4862 vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4863
4864 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4865 int my_fault_type;
4866 uint8_t c_flags = C_DONT_BLOCK;
4867 bool insert_cur_object = FALSE;
4868
4869 /*
4870 * May have to talk to a pager...
4871 * if so, take the slow path by
4872 * doing a 'break' from the while (TRUE) loop
4873 *
4874 * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4875 * if the compressor is active and the page exists there
4876 */
4877 if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4878 break;
4879 }
4880
4881 if (map == kernel_map || real_map == kernel_map) {
4882 /*
4883 * can't call into the compressor with the kernel_map
4884 * lock held, since the compressor may try to operate
4885 * on the kernel map in order to return an empty c_segment
4886 */
4887 break;
4888 }
4889 if (object != cur_object) {
4890 if (fault_type & VM_PROT_WRITE) {
4891 c_flags |= C_KEEP;
4892 } else {
4893 insert_cur_object = TRUE;
4894 }
4895 }
4896 if (insert_cur_object == TRUE) {
4897 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4898 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4899
4900 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4901 /*
4902 * couldn't upgrade so go do a full retry
4903 * immediately since we can no longer be
4904 * certain about cur_object (since we
4905 * don't hold a reference on it)...
4906 * first drop the top object lock
4907 */
4908 vm_object_unlock(object);
4909
4910 vm_map_unlock_read(map);
4911 if (real_map != map) {
4912 vm_map_unlock(real_map);
4913 }
4914
4915 goto RetryFault;
4916 }
4917 }
4918 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4919 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4920
4921 if (object != cur_object) {
4922 /*
4923 * we can't go for the upgrade on the top
4924 * lock since the upgrade may block waiting
4925 * for readers to drain... since we hold
4926 * cur_object locked at this point, waiting
4927 * for the readers to drain would represent
4928 * a lock order inversion since the lock order
4929 * for objects is the reference order in the
4930 * shadow chain
4931 */
4932 vm_object_unlock(object);
4933 vm_object_unlock(cur_object);
4934
4935 vm_map_unlock_read(map);
4936 if (real_map != map) {
4937 vm_map_unlock(real_map);
4938 }
4939
4940 goto RetryFault;
4941 }
4942 if (vm_object_lock_upgrade(object) == FALSE) {
4943 /*
4944 * couldn't upgrade, so explicitly take the lock
4945 * exclusively and go relookup the page since we
4946 * will have dropped the object lock and
4947 * a different thread could have inserted
4948 * a page at this offset
4949 * no need for a full retry since we're
4950 * at the top level of the object chain
4951 */
4952 vm_object_lock(object);
4953
4954 continue;
4955 }
4956 }
4957 m = vm_page_grab_options(grab_options);
4958 m_object = NULL;
4959
4960 if (m == VM_PAGE_NULL) {
4961 /*
4962 * no free page currently available...
4963 * must take the slow path
4964 */
4965 break;
4966 }
4967
4968 /*
4969 * The object is and remains locked
4970 * so no need to take a
4971 * "paging_in_progress" reference.
4972 */
4973 bool shared_lock;
4974 if ((object == cur_object &&
4975 object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4976 (object != cur_object &&
4977 cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4978 shared_lock = FALSE;
4979 } else {
4980 shared_lock = TRUE;
4981 }
4982
4983 kr = vm_compressor_pager_get(
4984 cur_object->pager,
4985 (vm_object_trunc_page(cur_offset)
4986 + cur_object->paging_offset),
4987 VM_PAGE_GET_PHYS_PAGE(m),
4988 &my_fault_type,
4989 c_flags,
4990 &compressed_count_delta);
4991
4992 vm_compressor_pager_count(
4993 cur_object->pager,
4994 compressed_count_delta,
4995 shared_lock,
4996 cur_object);
4997
4998 if (kr != KERN_SUCCESS) {
4999 vm_page_release(m, FALSE);
5000 m = VM_PAGE_NULL;
5001 }
5002 /*
5003 * If vm_compressor_pager_get() returns
5004 * KERN_MEMORY_FAILURE, then the
5005 * compressed data is permanently lost,
5006 * so return this error immediately.
5007 */
5008 if (kr == KERN_MEMORY_FAILURE) {
5009 if (object != cur_object) {
5010 vm_object_unlock(cur_object);
5011 }
5012 vm_object_unlock(object);
5013 vm_map_unlock_read(map);
5014 if (real_map != map) {
5015 vm_map_unlock(real_map);
5016 }
5017 goto done;
5018 } else if (kr != KERN_SUCCESS) {
5019 break;
5020 }
5021 m->vmp_dirty = TRUE;
5022
5023 /*
5024 * If the object is purgeable, its
5025 * owner's purgeable ledgers will be
5026 * updated in vm_page_insert() but the
5027 * page was also accounted for in a
5028 * "compressed purgeable" ledger, so
5029 * update that now.
5030 */
5031 if (object != cur_object &&
5032 !insert_cur_object) {
5033 /*
5034 * We're not going to insert
5035 * the decompressed page into
5036 * the object it came from.
5037 *
5038 * We're dealing with a
5039 * copy-on-write fault on
5040 * "object".
5041 * We're going to decompress
5042 * the page directly into the
5043 * target "object" while
5044 * keeping the compressed
5045 * page for "cur_object", so
5046 * no ledger update in that
5047 * case.
5048 */
5049 } else if (((cur_object->purgable ==
5050 VM_PURGABLE_DENY) &&
5051 (!cur_object->vo_ledger_tag)) ||
5052 (cur_object->vo_owner ==
5053 NULL)) {
5054 /*
5055 * "cur_object" is not purgeable
5056 * and is not ledger-tagged, or
5057 * there's no owner for it,
5058 * so no owner's ledgers to
5059 * update.
5060 */
5061 } else {
5062 /*
5063 * One less compressed
5064 * purgeable/tagged page for
5065 * cur_object's owner.
5066 */
5067 vm_object_owner_compressed_update(
5068 cur_object,
5069 -1);
5070 }
5071
5072 if (insert_cur_object) {
5073 vm_page_insert(m, cur_object, vm_object_trunc_page(cur_offset));
5074 m_object = cur_object;
5075 } else {
5076 vm_page_insert(m, object, vm_object_trunc_page(offset));
5077 m_object = object;
5078 }
5079
5080 if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5081 /*
5082 * If the page is not cacheable,
5083 * we can't let its contents
5084 * linger in the data cache
5085 * after the decompression.
5086 */
5087 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
5088 }
5089
5090 type_of_fault = my_fault_type;
5091
5092 VM_STAT_DECOMPRESSIONS();
5093
5094 if (cur_object != object) {
5095 if (insert_cur_object) {
5096 top_object = object;
5097 /*
5098 * switch to the object that has the new page
5099 */
5100 object = cur_object;
5101 object_lock_type = cur_object_lock_type;
5102 } else {
5103 vm_object_unlock(cur_object);
5104 cur_object = object;
5105 }
5106 }
5107 goto FastPmapEnter;
5108 }
5109 /*
5110 * existence map present and indicates
5111 * that the pager doesn't have this page
5112 */
5113 }
5114 if (cur_object->shadow == VM_OBJECT_NULL ||
5115 resilient_media_retry) {
5116 /*
5117 * Zero fill fault. Page gets
5118 * inserted into the original object.
5119 */
5120 if (cur_object->shadow_severed ||
5121 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
5122 cur_object == compressor_object ||
5123 cur_object == kernel_object ||
5124 cur_object == vm_submap_object) {
5125 if (object != cur_object) {
5126 vm_object_unlock(cur_object);
5127 }
5128 vm_object_unlock(object);
5129
5130 vm_map_unlock_read(map);
5131 if (real_map != map) {
5132 vm_map_unlock(real_map);
5133 }
5134
5135 kr = KERN_MEMORY_ERROR;
5136 goto done;
5137 }
5138 if (cur_object != object) {
5139 vm_object_unlock(cur_object);
5140
5141 cur_object = object;
5142 }
5143 if (object_lock_type == OBJECT_LOCK_SHARED) {
5144 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5145
5146 if (vm_object_lock_upgrade(object) == FALSE) {
5147 /*
5148 * couldn't upgrade so do a full retry on the fault
5149 * since we dropped the object lock which
5150 * could allow another thread to insert
5151 * a page at this offset
5152 */
5153 vm_map_unlock_read(map);
5154 if (real_map != map) {
5155 vm_map_unlock(real_map);
5156 }
5157
5158 goto RetryFault;
5159 }
5160 }
5161 if (!object->internal) {
5162 panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5163 }
5164 m = vm_page_alloc(object, vm_object_trunc_page(offset));
5165 m_object = NULL;
5166
5167 if (m == VM_PAGE_NULL) {
5168 /*
5169 * no free page currently available...
5170 * must take the slow path
5171 */
5172 break;
5173 }
5174 m_object = object;
5175
5176 /*
5177 * Zeroing the page and entering it into the pmap
5178 * represents a significant amount of the zero fill fault handler's work.
5179 *
5180 * To improve fault scalability, we'll drop the object lock, if it appears contended,
5181 * now that we've inserted the page into the vm object.
5182 * Before dropping the lock, we need to check protection bits and set the
5183 * mapped bits on the page. Then we can mark the page busy, drop the lock,
5184 * zero it, and do the pmap enter. We'll need to reacquire the lock
5185 * to clear the busy bit and wake up any waiters.
5186 */
5187 vm_fault_cs_clear(m);
5188 m->vmp_pmapped = TRUE;
5189 if (map->no_zero_fill) {
5190 type_of_fault = DBG_NZF_PAGE_FAULT;
5191 } else {
5192 type_of_fault = DBG_ZERO_FILL_FAULT;
5193 }
5194 {
5195 pmap_t destination_pmap;
5196 vm_map_offset_t destination_pmap_vaddr;
5197 vm_prot_t enter_fault_type;
5198 if (caller_pmap) {
5199 destination_pmap = caller_pmap;
5200 destination_pmap_vaddr = caller_pmap_addr;
5201 } else {
5202 destination_pmap = pmap;
5203 destination_pmap_vaddr = vaddr;
5204 }
5205 if (change_wiring) {
5206 enter_fault_type = VM_PROT_NONE;
5207 } else {
5208 enter_fault_type = caller_prot;
5209 }
5210 kr = vm_fault_enter_prepare(m,
5211 destination_pmap,
5212 destination_pmap_vaddr,
5213 &prot,
5214 caller_prot,
5215 fault_page_size,
5216 fault_phys_offset,
5217 change_wiring,
5218 enter_fault_type,
5219 &fault_info,
5220 &type_of_fault,
5221 &page_needs_data_sync);
5222 if (kr != KERN_SUCCESS) {
5223 goto zero_fill_cleanup;
5224 }
5225
5226 if (object_is_contended) {
5227 /*
5228 * At this point the page is in the vm object, but not on a paging queue.
5229 * Since it's accessible to another thread but its contents are invalid
5230 * (it hasn't been zeroed) mark it busy before dropping the object lock.
5231 */
5232 m->vmp_busy = TRUE;
5233 vm_object_unlock(object);
5234 }
5235 if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5236 /*
5237 * Now zero fill page...
5238 * the page is probably going to
5239 * be written soon, so don't bother
5240 * to clear the modified bit
5241 *
5242 * NOTE: This code holds the map
5243 * lock across the zero fill.
5244 */
5245 vm_page_zero_fill(m);
5246 VM_STAT_INCR(zero_fill_count);
5247 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
5248 }
5249 if (page_needs_data_sync) {
5250 pmap_sync_page_data_phys(VM_PAGE_GET_PHYS_PAGE(m));
5251 }
5252
5253 if (top_object != VM_OBJECT_NULL) {
5254 need_retry_ptr = &need_retry;
5255 } else {
5256 need_retry_ptr = NULL;
5257 }
5258 if (object_is_contended) {
5259 kr = vm_fault_pmap_enter(destination_pmap, destination_pmap_vaddr,
5260 fault_page_size, fault_phys_offset,
5261 m, &prot, caller_prot, enter_fault_type, wired,
5262 fault_info.pmap_options, need_retry_ptr);
5263 vm_object_lock(object);
5264 } else {
5265 kr = vm_fault_pmap_enter_with_object_lock(object, destination_pmap, destination_pmap_vaddr,
5266 fault_page_size, fault_phys_offset,
5267 m, &prot, caller_prot, enter_fault_type, wired,
5268 fault_info.pmap_options, need_retry_ptr);
5269 }
5270 }
5271 zero_fill_cleanup:
5272 if (!VM_DYNAMIC_PAGING_ENABLED() &&
5273 (object->purgable == VM_PURGABLE_DENY ||
5274 object->purgable == VM_PURGABLE_NONVOLATILE ||
5275 object->purgable == VM_PURGABLE_VOLATILE)) {
5276 vm_page_lockspin_queues();
5277 if (!VM_DYNAMIC_PAGING_ENABLED()) {
5278 vm_fault_enqueue_throttled_locked(m);
5279 }
5280 vm_page_unlock_queues();
5281 }
5282 vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, fault_info.no_cache, &type_of_fault, kr);
5283
5284 vm_fault_complete(
5285 map,
5286 real_map,
5287 object,
5288 m_object,
5289 m,
5290 offset,
5291 trace_real_vaddr,
5292 &fault_info,
5293 caller_prot,
5294 real_vaddr,
5295 type_of_fault,
5296 need_retry,
5297 kr,
5298 physpage_p,
5299 prot,
5300 top_object,
5301 need_collapse,
5302 cur_offset,
5303 fault_type,
5304 &written_on_object,
5305 &written_on_pager,
5306 &written_on_offset);
5307 top_object = VM_OBJECT_NULL;
5308 if (need_retry == TRUE) {
5309 /*
5310 * vm_fault_enter couldn't complete the PMAP_ENTER...
5311 * at this point we don't hold any locks so it's safe
5312 * to ask the pmap layer to expand the page table to
5313 * accommodate this mapping... once expanded, we'll
5314 * re-drive the fault which should result in vm_fault_enter
5315 * being able to successfully enter the mapping this time around
5316 */
5317 (void)pmap_enter_options(
5318 pmap, vaddr, 0, 0, 0, 0, 0,
5319 PMAP_OPTIONS_NOENTER, NULL);
5320
5321 need_retry = FALSE;
5322 goto RetryFault;
5323 }
5324 goto done;
5325 }
5326 /*
5327 * On to the next level in the shadow chain
5328 */
5329 cur_offset += cur_object->vo_shadow_offset;
5330 new_object = cur_object->shadow;
5331 fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5332
5333 /*
5334 * take the new_object's lock with the indicated state
5335 */
5336 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5337 vm_object_lock_shared(new_object);
5338 } else {
5339 vm_object_lock(new_object);
5340 }
5341
5342 if (cur_object != object) {
5343 vm_object_unlock(cur_object);
5344 }
5345
5346 cur_object = new_object;
5347
5348 continue;
5349 }
5350 }
5351 /*
5352 * Cleanup from fast fault failure. Drop any object
5353 * lock other than original and drop map lock.
5354 */
5355 if (object != cur_object) {
5356 vm_object_unlock(cur_object);
5357 }
5358
5359 /*
5360 * must own the object lock exclusively at this point
5361 */
5362 if (object_lock_type == OBJECT_LOCK_SHARED) {
5363 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5364
5365 if (vm_object_lock_upgrade(object) == FALSE) {
5366 /*
5367 * couldn't upgrade, so explicitly
5368 * take the lock exclusively
5369 * no need to retry the fault at this
5370 * point since "vm_fault_page" will
5371 * completely re-evaluate the state
5372 */
5373 vm_object_lock(object);
5374 }
5375 }
5376
5377 handle_copy_delay:
5378 vm_map_unlock_read(map);
5379 if (real_map != map) {
5380 vm_map_unlock(real_map);
5381 }
5382
5383 if (__improbable(object == compressor_object ||
5384 object == kernel_object ||
5385 object == vm_submap_object)) {
5386 /*
5387 * These objects are explicitly managed and populated by the
5388 * kernel. The virtual ranges backed by these objects should
5389 * either have wired pages or "holes" that are not supposed to
5390 * be accessed at all until they get explicitly populated.
5391 * We should never have to resolve a fault on a mapping backed
5392 * by one of these VM objects and providing a zero-filled page
5393 * would be wrong here, so let's fail the fault and let the
5394 * caller crash or recover.
5395 */
5396 vm_object_unlock(object);
5397 kr = KERN_MEMORY_ERROR;
5398 goto done;
5399 }
5400
5401 assert(object != compressor_object);
5402 assert(object != kernel_object);
5403 assert(object != vm_submap_object);
5404
5405 if (resilient_media_retry) {
5406 /*
5407 * We could get here if we failed to get a free page
5408 * to zero-fill and had to take the slow path again.
5409 * Reset our "recovery-from-failed-media" state.
5410 */
5411 assert(resilient_media_object != VM_OBJECT_NULL);
5412 assert(resilient_media_offset != (vm_object_offset_t)-1);
5413 /* release our extra reference on failed object */
5414 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5415 vm_object_deallocate(resilient_media_object);
5416 resilient_media_object = VM_OBJECT_NULL;
5417 resilient_media_offset = (vm_object_offset_t)-1;
5418 resilient_media_retry = FALSE;
5419 }
5420
5421 /*
5422 * Make a reference to this object to
5423 * prevent its disposal while we are messing with
5424 * it. Once we have the reference, the map is free
5425 * to be diddled. Since objects reference their
5426 * shadows (and copies), they will stay around as well.
5427 */
5428 vm_object_reference_locked(object);
5429 vm_object_paging_begin(object);
5430
5431 set_thread_pagein_error(cthread, 0);
5432 error_code = 0;
5433
5434 result_page = VM_PAGE_NULL;
5435 kr = vm_fault_page(object, offset, fault_type,
5436 (change_wiring && !wired),
5437 FALSE, /* page not looked up */
5438 &prot, &result_page, &top_page,
5439 &type_of_fault,
5440 &error_code, map->no_zero_fill,
5441 FALSE, &fault_info);
5442
5443 /*
5444 * if kr != VM_FAULT_SUCCESS, then the paging reference
5445 * has been dropped and the object unlocked... the ref_count
5446 * is still held
5447 *
5448 * if kr == VM_FAULT_SUCCESS, then the paging reference
5449 * is still held along with the ref_count on the original object
5450 *
5451 * the object is returned locked with a paging reference
5452 *
5453 * if top_page != NULL, then it's BUSY and the
5454 * object it belongs to has a paging reference
5455 * but is returned unlocked
5456 */
5457 if (kr != VM_FAULT_SUCCESS &&
5458 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
5459 if (kr == VM_FAULT_MEMORY_ERROR &&
5460 fault_info.resilient_media) {
5461 assertf(object->internal, "object %p", object);
5462 /*
5463 * This fault failed but the mapping was
5464 * "media resilient", so we'll retry the fault in
5465 * recovery mode to get a zero-filled page in the
5466 * top object.
5467 * Keep the reference on the failing object so
5468 * that we can check that the mapping is still
5469 * pointing to it when we retry the fault.
5470 */
5471 // printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
5472 assert(!resilient_media_retry); /* no double retry */
5473 assert(resilient_media_object == VM_OBJECT_NULL);
5474 assert(resilient_media_offset == (vm_object_offset_t)-1);
5475 resilient_media_retry = TRUE;
5476 resilient_media_object = object;
5477 resilient_media_offset = offset;
5478 // printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_media_offset);
5479 goto RetryFault;
5480 } else {
5481 /*
5482 * we didn't succeed, lose the object reference
5483 * immediately.
5484 */
5485 vm_object_deallocate(object);
5486 object = VM_OBJECT_NULL; /* no longer valid */
5487 }
5488
5489 /*
5490 * See why we failed, and take corrective action.
5491 */
5492 switch (kr) {
5493 case VM_FAULT_MEMORY_SHORTAGE:
5494 if (vm_page_wait((change_wiring) ?
5495 THREAD_UNINT :
5496 THREAD_ABORTSAFE)) {
5497 goto RetryFault;
5498 }
5499 OS_FALLTHROUGH;
5500 case VM_FAULT_INTERRUPTED:
5501 kr = KERN_ABORTED;
5502 goto done;
5503 case VM_FAULT_RETRY:
5504 goto RetryFault;
5505 case VM_FAULT_MEMORY_ERROR:
5506 if (error_code) {
5507 kr = error_code;
5508 } else {
5509 kr = KERN_MEMORY_ERROR;
5510 }
5511 goto done;
5512 default:
5513 panic("vm_fault: unexpected error 0x%x from "
5514 "vm_fault_page()\n", kr);
5515 }
5516 }
5517 m = result_page;
5518 m_object = NULL;
5519
5520 if (m != VM_PAGE_NULL) {
5521 m_object = VM_PAGE_OBJECT(m);
5522 assert((change_wiring && !wired) ?
5523 (top_page == VM_PAGE_NULL) :
5524 ((top_page == VM_PAGE_NULL) == (m_object == object)));
5525 }
5526
5527 /*
5528 * What to do with the resulting page from vm_fault_page
5529 * if it doesn't get entered into the physical map:
5530 */
5531 #define RELEASE_PAGE(m) \
5532 MACRO_BEGIN \
5533 PAGE_WAKEUP_DONE(m); \
5534 if ( !VM_PAGE_PAGEABLE(m)) { \
5535 vm_page_lockspin_queues(); \
5536 if ( !VM_PAGE_PAGEABLE(m)) \
5537 vm_page_activate(m); \
5538 vm_page_unlock_queues(); \
5539 } \
5540 MACRO_END
5541
5542
5543 object_locks_dropped = FALSE;
5544 /*
5545 * We must verify that the maps have not changed
5546 * since our last lookup. vm_map_verify() needs the
5547 * map lock (shared) but we are holding object locks.
5548 * So we do a try_lock() first and, if that fails, we
5549 * drop the object locks and go in for the map lock again.
5550 */
5551 if (!vm_map_try_lock_read(original_map)) {
5552 if (m != VM_PAGE_NULL) {
5553 old_copy_object = m_object->copy;
5554 vm_object_unlock(m_object);
5555 } else {
5556 old_copy_object = VM_OBJECT_NULL;
5557 vm_object_unlock(object);
5558 }
5559
5560 object_locks_dropped = TRUE;
5561
5562 vm_map_lock_read(original_map);
5563 }
5564
5565 if ((map != original_map) || !vm_map_verify(map, &version)) {
5566 if (object_locks_dropped == FALSE) {
5567 if (m != VM_PAGE_NULL) {
5568 old_copy_object = m_object->copy;
5569 vm_object_unlock(m_object);
5570 } else {
5571 old_copy_object = VM_OBJECT_NULL;
5572 vm_object_unlock(object);
5573 }
5574
5575 object_locks_dropped = TRUE;
5576 }
5577
5578 /*
5579 * no object locks are held at this point
5580 */
5581 vm_object_t retry_object;
5582 vm_object_offset_t retry_offset;
5583 vm_prot_t retry_prot;
5584
5585 /*
5586 * To avoid trying to write_lock the map while another
5587 * thread has it read_locked (in vm_map_pageable), we
5588 * do not try for write permission. If the page is
5589 * still writable, we will get write permission. If it
5590 * is not, or has been marked needs_copy, we enter the
5591 * mapping without write permission, and will merely
5592 * take another fault.
5593 */
5594 map = original_map;
5595
5596 kr = vm_map_lookup_locked(&map, vaddr,
5597 fault_type & ~VM_PROT_WRITE,
5598 OBJECT_LOCK_EXCLUSIVE, &version,
5599 &retry_object, &retry_offset, &retry_prot,
5600 &wired,
5601 &fault_info,
5602 &real_map,
5603 NULL);
5604 pmap = real_map->pmap;
5605
5606 if (kr != KERN_SUCCESS) {
5607 vm_map_unlock_read(map);
5608
5609 if (m != VM_PAGE_NULL) {
5610 assert(VM_PAGE_OBJECT(m) == m_object);
5611
5612 /*
5613 * retake the lock so that
5614 * we can drop the paging reference
5615 * in vm_fault_cleanup and do the
5616 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5617 */
5618 vm_object_lock(m_object);
5619
5620 RELEASE_PAGE(m);
5621
5622 vm_fault_cleanup(m_object, top_page);
5623 } else {
5624 /*
5625 * retake the lock so that
5626 * we can drop the paging reference
5627 * in vm_fault_cleanup
5628 */
5629 vm_object_lock(object);
5630
5631 vm_fault_cleanup(object, top_page);
5632 }
5633 vm_object_deallocate(object);
5634
5635 goto done;
5636 }
5637 vm_object_unlock(retry_object);
5638
5639 if ((retry_object != object) || (retry_offset != offset)) {
5640 vm_map_unlock_read(map);
5641 if (real_map != map) {
5642 vm_map_unlock(real_map);
5643 }
5644
5645 if (m != VM_PAGE_NULL) {
5646 assert(VM_PAGE_OBJECT(m) == m_object);
5647
5648 /*
5649 * retake the lock so that
5650 * we can drop the paging reference
5651 * in vm_fault_cleanup and do the
5652 * PAGE_WAKEUP_DONE in RELEASE_PAGE
5653 */
5654 vm_object_lock(m_object);
5655
5656 RELEASE_PAGE(m);
5657
5658 vm_fault_cleanup(m_object, top_page);
5659 } else {
5660 /*
5661 * retake the lock so that
5662 * we can drop the paging reference
5663 * in vm_fault_cleanup
5664 */
5665 vm_object_lock(object);
5666
5667 vm_fault_cleanup(object, top_page);
5668 }
5669 vm_object_deallocate(object);
5670
5671 goto RetryFault;
5672 }
5673 /*
5674 * Check whether the protection has changed or the object
5675 * has been copied while we left the map unlocked.
5676 */
5677 if (pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, retry_prot)) {
5678 /* If the pmap layer cares, pass the full set. */
5679 prot = retry_prot;
5680 } else {
5681 prot &= retry_prot;
5682 }
5683 }
5684
5685 if (object_locks_dropped == TRUE) {
5686 if (m != VM_PAGE_NULL) {
5687 vm_object_lock(m_object);
5688
5689 if (m_object->copy != old_copy_object) {
5690 /*
5691 * The copy object changed while the top-level object
5692 * was unlocked, so take away write permission.
5693 */
5694 assert(!pmap_has_prot_policy(pmap, fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot));
5695 prot &= ~VM_PROT_WRITE;
5696 }
5697 } else {
5698 vm_object_lock(object);
5699 }
5700
5701 object_locks_dropped = FALSE;
5702 }
5703
5704 if (!need_copy &&
5705 !fault_info.no_copy_on_read &&
5706 m != VM_PAGE_NULL &&
5707 VM_PAGE_OBJECT(m) != object &&
5708 !VM_PAGE_OBJECT(m)->pager_trusted &&
5709 vm_protect_privileged_from_untrusted &&
5710 !((prot & VM_PROT_EXECUTE) &&
5711 VM_PAGE_OBJECT(m)->code_signed &&
5712 pmap_get_vm_map_cs_enforced(caller_pmap ? caller_pmap : pmap)) &&
5713 current_proc_is_privileged()) {
5714 /*
5715 * We found the page we want in an "untrusted" VM object
5716 * down the shadow chain. Since the target is "privileged"
5717 * we want to perform a copy-on-read of that page, so that the
5718 * mapped object gets a stable copy and does not have to
5719 * rely on the "untrusted" object to provide the same
5720 * contents if the page gets reclaimed and has to be paged
5721 * in again later on.
5722 *
5723 * Special case: if the mapping is executable and the untrusted
5724 * object is code-signed and the process is "cs_enforced", we
5725 * do not copy-on-read because that would break code-signing
5726 * enforcement expectations (an executable page must belong
5727 * to a code-signed object) and we can rely on code-signing
5728 * to re-validate the page if it gets evicted and paged back in.
5729 */
5730 // printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5731 vm_copied_on_read++;
5732 need_copy_on_read = TRUE;
5733 need_copy = TRUE;
5734 } else {
5735 need_copy_on_read = FALSE;
5736 }
5737
5738 /*
5739 * If we want to wire down this page, but no longer have
5740 * adequate permissions, we must start all over.
5741 * If we decided to copy-on-read, we must also start all over.
5742 */
5743 if ((wired && (fault_type != (prot | VM_PROT_WRITE))) ||
5744 need_copy_on_read) {
5745 vm_map_unlock_read(map);
5746 if (real_map != map) {
5747 vm_map_unlock(real_map);
5748 }
5749
5750 if (m != VM_PAGE_NULL) {
5751 assert(VM_PAGE_OBJECT(m) == m_object);
5752
5753 RELEASE_PAGE(m);
5754
5755 vm_fault_cleanup(m_object, top_page);
5756 } else {
5757 vm_fault_cleanup(object, top_page);
5758 }
5759
5760 vm_object_deallocate(object);
5761
5762 goto RetryFault;
5763 }
5764 if (m != VM_PAGE_NULL) {
5765 /*
5766 * Put this page into the physical map.
5767 * We had to do the unlock above because pmap_enter
5768 * may cause other faults. The page may be on
5769 * the pageout queues. If the pageout daemon comes
5770 * across the page, it will remove it from the queues.
5771 */
5772 if (fault_page_size < PAGE_SIZE) {
5773 DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
5774 assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
5775 fault_phys_offset < PAGE_SIZE),
5776 "0x%llx\n", (uint64_t)fault_phys_offset);
5777 } else {
5778 assertf(fault_phys_offset == 0,
5779 "0x%llx\n", (uint64_t)fault_phys_offset);
5780 }
5781 if (caller_pmap) {
5782 kr = vm_fault_enter(m,
5783 caller_pmap,
5784 caller_pmap_addr,
5785 fault_page_size,
5786 fault_phys_offset,
5787 prot,
5788 caller_prot,
5789 wired,
5790 change_wiring,
5791 wire_tag,
5792 &fault_info,
5793 NULL,
5794 &type_of_fault);
5795 } else {
5796 kr = vm_fault_enter(m,
5797 pmap,
5798 vaddr,
5799 fault_page_size,
5800 fault_phys_offset,
5801 prot,
5802 caller_prot,
5803 wired,
5804 change_wiring,
5805 wire_tag,
5806 &fault_info,
5807 NULL,
5808 &type_of_fault);
5809 }
5810 assert(VM_PAGE_OBJECT(m) == m_object);
5811
5812 {
5813 int event_code = 0;
5814
5815 if (m_object->internal) {
5816 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
5817 } else if (m_object->object_is_shared_cache) {
5818 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
5819 } else {
5820 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
5821 }
5822
5823 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid(), 0);
5824 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid(), 0, 0, 0, 0);
5825
5826 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
5827 }
5828 if (kr != KERN_SUCCESS) {
5829 /* abort this page fault */
5830 vm_map_unlock_read(map);
5831 if (real_map != map) {
5832 vm_map_unlock(real_map);
5833 }
5834 PAGE_WAKEUP_DONE(m);
5835 vm_fault_cleanup(m_object, top_page);
5836 vm_object_deallocate(object);
5837 goto done;
5838 }
5839 if (physpage_p != NULL) {
5840 /* for vm_map_wire_and_extract() */
5841 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5842 if (prot & VM_PROT_WRITE) {
5843 vm_object_lock_assert_exclusive(m_object);
5844 m->vmp_dirty = TRUE;
5845 }
5846 }
5847 } else {
5848 vm_map_entry_t entry;
5849 vm_map_offset_t laddr;
5850 vm_map_offset_t ldelta, hdelta;
5851
5852 /*
5853 * do a pmap block mapping from the physical address
5854 * in the object
5855 */
5856
5857 if (real_map != map) {
5858 vm_map_unlock(real_map);
5859 }
5860
5861 if (original_map != map) {
5862 vm_map_unlock_read(map);
5863 vm_map_lock_read(original_map);
5864 map = original_map;
5865 }
5866 real_map = map;
5867
5868 laddr = vaddr;
5869 hdelta = 0xFFFFF000;
5870 ldelta = 0xFFFFF000;
5871
5872 while (vm_map_lookup_entry(map, laddr, &entry)) {
5873 if (ldelta > (laddr - entry->vme_start)) {
5874 ldelta = laddr - entry->vme_start;
5875 }
5876 if (hdelta > (entry->vme_end - laddr)) {
5877 hdelta = entry->vme_end - laddr;
5878 }
5879 if (entry->is_sub_map) {
5880 laddr = ((laddr - entry->vme_start)
5881 + VME_OFFSET(entry));
5882 vm_map_lock_read(VME_SUBMAP(entry));
5883
5884 if (map != real_map) {
5885 vm_map_unlock_read(map);
5886 }
5887 if (entry->use_pmap) {
5888 vm_map_unlock_read(real_map);
5889 real_map = VME_SUBMAP(entry);
5890 }
5891 map = VME_SUBMAP(entry);
5892 } else {
5893 break;
5894 }
5895 }
5896
5897 if (vm_map_lookup_entry(map, laddr, &entry) &&
5898 (VME_OBJECT(entry) != NULL) &&
5899 (VME_OBJECT(entry) == object)) {
5900 uint16_t superpage;
5901
5902 if (!object->pager_created &&
5903 object->phys_contiguous &&
5904 VME_OFFSET(entry) == 0 &&
5905 (entry->vme_end - entry->vme_start == object->vo_size) &&
5906 VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
5907 superpage = VM_MEM_SUPERPAGE;
5908 } else {
5909 superpage = 0;
5910 }
5911
5912 if (superpage && physpage_p) {
5913 /* for vm_map_wire_and_extract() */
5914 *physpage_p = (ppnum_t)
5915 ((((vm_map_offset_t)
5916 object->vo_shadow_offset)
5917 + VME_OFFSET(entry)
5918 + (laddr - entry->vme_start))
5919 >> PAGE_SHIFT);
5920 }
5921
5922 if (caller_pmap) {
5923 /*
5924 * Set up a block mapped area
5925 */
5926 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5927 kr = pmap_map_block(caller_pmap,
5928 (addr64_t)(caller_pmap_addr - ldelta),
5929 (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5930 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5931 (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5932 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5933
5934 if (kr != KERN_SUCCESS) {
5935 goto cleanup;
5936 }
5937 } else {
5938 /*
5939 * Set up a block mapped area
5940 */
5941 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5942 kr = pmap_map_block(real_map->pmap,
5943 (addr64_t)(vaddr - ldelta),
5944 (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5945 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5946 (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5947 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5948
5949 if (kr != KERN_SUCCESS) {
5950 goto cleanup;
5951 }
5952 }
5953 }
5954 }
5955
5956 /*
5957 * Success
5958 */
5959 kr = KERN_SUCCESS;
5960
5961 /*
5962 * TODO: could most of the done cases just use cleanup?
5963 */
5964 cleanup:
5965 /*
5966 * Unlock everything, and return
5967 */
5968 vm_map_unlock_read(map);
5969 if (real_map != map) {
5970 vm_map_unlock(real_map);
5971 }
5972
5973 if (m != VM_PAGE_NULL) {
5974 assert(VM_PAGE_OBJECT(m) == m_object);
5975
5976 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5977 vm_object_paging_begin(m_object);
5978
5979 assert(written_on_object == VM_OBJECT_NULL);
5980 written_on_object = m_object;
5981 written_on_pager = m_object->pager;
5982 written_on_offset = m_object->paging_offset + m->vmp_offset;
5983 }
5984 PAGE_WAKEUP_DONE(m);
5985
5986 vm_fault_cleanup(m_object, top_page);
5987 } else {
5988 vm_fault_cleanup(object, top_page);
5989 }
5990
5991 vm_object_deallocate(object);
5992
5993 #undef RELEASE_PAGE
5994
5995 done:
5996 thread_interrupt_level(interruptible_state);
5997
5998 if (resilient_media_object != VM_OBJECT_NULL) {
5999 assert(resilient_media_retry);
6000 assert(resilient_media_offset != (vm_object_offset_t)-1);
6001 /* release extra reference on failed object */
6002 // printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6003 vm_object_deallocate(resilient_media_object);
6004 resilient_media_object = VM_OBJECT_NULL;
6005 resilient_media_offset = (vm_object_offset_t)-1;
6006 resilient_media_retry = FALSE;
6007 }
6008 assert(!resilient_media_retry);
6009
6010 /*
6011 * Only I/O throttle on faults which cause a pagein/swapin.
6012 */
6013 if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6014 throttle_lowpri_io(1);
6015 } else {
6016 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6017 if ((throttle_delay = vm_page_throttled(TRUE))) {
6018 if (vm_debug_events) {
6019 if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6020 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6021 } else if (type_of_fault == DBG_COW_FAULT) {
6022 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6023 } else {
6024 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
6025 }
6026 }
6027 delay(throttle_delay);
6028 }
6029 }
6030 }
6031
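/*
 * Deferred from the cleanup path above: a writable mapping of an
 * external (vnode-backed) object was set up, so notify the vnode pager
 * that this range may now be dirtied and drop the paging reference
 * taken while the object lock was still held.
 */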
6032 if (written_on_object) {
6033 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6034
6035 vm_object_lock(written_on_object);
6036 vm_object_paging_end(written_on_object);
6037 vm_object_unlock(written_on_object);
6038
6039 written_on_object = VM_OBJECT_NULL;
6040 }
6041
6042 if (rtfault) {
6043 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6044 }
6045
6046 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
6047 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
6048 ((uint64_t)trace_vaddr >> 32),
6049 trace_vaddr,
6050 kr,
6051 vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
6052 0);
6053
6054 if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6055 DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6056 }
6057
6058 return kr;
6059 }
6060
6061 /*
6062 * vm_fault_wire:
6063 *
6064 * Wire down a range of virtual addresses in a map.
6065 */
6066 kern_return_t
6067 vm_fault_wire(
6068 vm_map_t map,
6069 vm_map_entry_t entry,
6070 vm_prot_t prot,
6071 vm_tag_t wire_tag,
6072 pmap_t pmap,
6073 vm_map_offset_t pmap_addr,
6074 ppnum_t *physpage_p)
6075 {
6076 vm_map_offset_t va;
6077 vm_map_offset_t end_addr = entry->vme_end;
6078 kern_return_t rc;
6079 vm_map_size_t effective_page_size;
6080
6081 assert(entry->in_transition);
6082
6083 if ((VME_OBJECT(entry) != NULL) &&
6084 !entry->is_sub_map &&
6085 VME_OBJECT(entry)->phys_contiguous) {
6086 return KERN_SUCCESS;
6087 }
6088
6089 /*
6090 * Inform the physical mapping system that the
6091 * range of addresses may not fault, so that
6092 * page tables and such can be locked down as well.
6093 */
6094
6095 pmap_pageable(pmap, pmap_addr,
6096 pmap_addr + (end_addr - entry->vme_start), FALSE);
6097
6098 /*
6099 * We simulate a fault to get the page and enter it
6100 * in the physical map.
6101 */
6102
6103 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6104 for (va = entry->vme_start;
6105 va < end_addr;
6106 va += effective_page_size) {
6107 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
6108 pmap_addr + (va - entry->vme_start),
6109 physpage_p);
6110 if (rc != KERN_SUCCESS) {
6111 rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
6112 ((pmap == kernel_pmap)
6113 ? THREAD_UNINT
6114 : THREAD_ABORTSAFE),
6115 pmap,
6116 (pmap_addr +
6117 (va - entry->vme_start)),
6118 physpage_p);
6119 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
6120 }
6121
6122 if (rc != KERN_SUCCESS) {
6123 struct vm_map_entry tmp_entry = *entry;
6124
6125 /* unwire wired pages */
6126 tmp_entry.vme_end = va;
6127 vm_fault_unwire(map,
6128 &tmp_entry, FALSE, pmap, pmap_addr);
6129
6130 return rc;
6131 }
6132 }
6133 return KERN_SUCCESS;
6134 }
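/*
 * A minimal usage sketch (illustrative only, not taken from this file): a
 * caller that already holds a reference on "map" and has marked "entry"
 * in_transition could wire the entry's pages into the map's own pmap
 * roughly as follows. The wrapper name is hypothetical.
 */
#if 0
static kern_return_t
wire_entry_sketch(vm_map_t map, vm_map_entry_t entry, vm_tag_t tag)
{
	/* wire at the entry's current protection; no physical page wanted back */
	return vm_fault_wire(map, entry, entry->protection, tag,
	    map->pmap, entry->vme_start, NULL);
}
#endif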
6135
6136 /*
6137 * vm_fault_unwire:
6138 *
6139 * Unwire a range of virtual addresses in a map.
6140 */
6141 void
6142 vm_fault_unwire(
6143 vm_map_t map,
6144 vm_map_entry_t entry,
6145 boolean_t deallocate,
6146 pmap_t pmap,
6147 vm_map_offset_t pmap_addr)
6148 {
6149 vm_map_offset_t va;
6150 vm_map_offset_t end_addr = entry->vme_end;
6151 vm_object_t object;
6152 struct vm_object_fault_info fault_info = {};
6153 unsigned int unwired_pages;
6154 vm_map_size_t effective_page_size;
6155
6156 object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
6157
6158 /*
6159 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
6160 * do anything since such memory is wired by default. So we don't have
6161 * anything to undo here.
6162 */
6163
6164 if (object != VM_OBJECT_NULL && object->phys_contiguous) {
6165 return;
6166 }
6167
6168 fault_info.interruptible = THREAD_UNINT;
6169 fault_info.behavior = entry->behavior;
6170 fault_info.user_tag = VME_ALIAS(entry);
6171 if (entry->iokit_acct ||
6172 (!entry->is_sub_map && !entry->use_pmap)) {
6173 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6174 }
6175 fault_info.lo_offset = VME_OFFSET(entry);
6176 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
6177 fault_info.no_cache = entry->no_cache;
6178 fault_info.stealth = TRUE;
6179
6180 unwired_pages = 0;
6181
6182 /*
6183 * Since the pages are wired down, we must be able to
6184 * get their mappings from the physical map system.
6185 */
6186
6187 effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6188 for (va = entry->vme_start;
6189 va < end_addr;
6190 va += effective_page_size) {
6191 if (object == VM_OBJECT_NULL) {
6192 if (pmap) {
6193 pmap_change_wiring(pmap,
6194 pmap_addr + (va - entry->vme_start), FALSE);
6195 }
6196 (void) vm_fault(map, va, VM_PROT_NONE,
6197 TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
6198 } else {
6199 vm_prot_t prot;
6200 vm_page_t result_page;
6201 vm_page_t top_page;
6202 vm_object_t result_object;
6203 vm_fault_return_t result;
6204
6205 /* cap cluster size at maximum UPL size */
6206 upl_size_t cluster_size;
6207 if (os_sub_overflow(end_addr, va, &cluster_size)) {
6208 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6209 }
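/*
 * Note: "0 - (upl_size_t)PAGE_SIZE" relies on unsigned wraparound to
 * produce the largest page-multiple an upl_size_t can hold, i.e. the
 * remaining range is too large, so fall back to the maximum cluster size.
 */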
6210 fault_info.cluster_size = cluster_size;
6211
6212 do {
6213 prot = VM_PROT_NONE;
6214
6215 vm_object_lock(object);
6216 vm_object_paging_begin(object);
6217 result_page = VM_PAGE_NULL;
6218 result = vm_fault_page(
6219 object,
6220 (VME_OFFSET(entry) +
6221 (va - entry->vme_start)),
6222 VM_PROT_NONE, TRUE,
6223 FALSE, /* page not looked up */
6224 &prot, &result_page, &top_page,
6225 (int *)0,
6226 NULL, map->no_zero_fill,
6227 FALSE, &fault_info);
6228 } while (result == VM_FAULT_RETRY);
6229
6230 /*
6231 * If this was a mapping to a file on a device that has been forcibly
6232 * unmounted, then we won't get a page back from vm_fault_page(). Just
6233 * move on to the next one in case the remaining pages are mapped from
6234 * different objects. During a forced unmount, the object is terminated
6235 * so the alive flag will be false if this happens. A forced unmount will
6236 * occur when an external disk is unplugged before the user does an
6237 * eject, so we don't want to panic in that situation.
6238 */
6239
6240 if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
6241 continue;
6242 }
6243
6244 if (result == VM_FAULT_MEMORY_ERROR &&
6245 object == kernel_object) {
6246 /*
6247 * This must have been allocated with
6248 * KMA_KOBJECT and KMA_VAONLY and there's
6249 * no physical page at this offset.
6250 * We're done (no page to free).
6251 */
6252 assert(deallocate);
6253 continue;
6254 }
6255
6256 if (result != VM_FAULT_SUCCESS) {
6257 panic("vm_fault_unwire: failure");
6258 }
6259
6260 result_object = VM_PAGE_OBJECT(result_page);
6261
6262 if (deallocate) {
6263 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
6264 vm_page_fictitious_addr);
6265 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
6266 if (VM_PAGE_WIRED(result_page)) {
6267 unwired_pages++;
6268 }
6269 VM_PAGE_FREE(result_page);
6270 } else {
6271 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
6272 pmap_change_wiring(pmap,
6273 pmap_addr + (va - entry->vme_start), FALSE);
6274 }
6275
6276
6277 if (VM_PAGE_WIRED(result_page)) {
6278 vm_page_lockspin_queues();
6279 vm_page_unwire(result_page, TRUE);
6280 vm_page_unlock_queues();
6281 unwired_pages++;
6282 }
6283 if (entry->zero_wired_pages) {
6284 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
6285 entry->zero_wired_pages = FALSE;
6286 }
6287
6288 PAGE_WAKEUP_DONE(result_page);
6289 }
6290 vm_fault_cleanup(result_object, top_page);
6291 }
6292 }
6293
6294 /*
6295 * Inform the physical mapping system that the range
6296 * of addresses may fault, so that page tables and
6297 * such may be unwired themselves.
6298 */
6299
6300 pmap_pageable(pmap, pmap_addr,
6301 pmap_addr + (end_addr - entry->vme_start), TRUE);
6302
6303 if (kernel_object == object) {
6304 /*
6305 * Would like to make user_tag in vm_object_fault_info
6306 * vm_tag_t (unsigned short) but user_tag derives its value from
6307 * VME_ALIAS(entry) in a few places and VME_ALIAS, in turn, casts
6308 * to an _unsigned int_ which is used by non-fault_info paths throughout the
6309 * code in many places.
6310 *
6311 * So, for now, an explicit truncation to unsigned short (vm_tag_t).
6312 */
6313 assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
6314 "VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
6315 vm_tag_update_size((vm_tag_t) fault_info.user_tag, -ptoa_64(unwired_pages));
6316 }
6317 }
6318
6319 /*
6320 * vm_fault_wire_fast:
6321 *
6322 * Handle common case of a wire down page fault at the given address.
6323 * If successful, the page is inserted into the associated physical map.
6324 * The map entry is passed in to avoid the overhead of a map lookup.
6325 *
6326 * NOTE: the given address should be truncated to the
6327 * proper page address.
6328 *
6329 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
6330 * a standard error specifying why the fault is fatal is returned.
6331 *
6332 * The map in question must be referenced, and remains so.
6333 * Caller has a read lock on the map.
6334 *
6335 * This is a stripped version of vm_fault() for wiring pages. Anything
6336 * other than the common case will return KERN_FAILURE, and the caller
6337 * is expected to call vm_fault().
6338 */
6339 static kern_return_t
6340 vm_fault_wire_fast(
6341 __unused vm_map_t map,
6342 vm_map_offset_t va,
6343 __unused vm_prot_t caller_prot,
6344 vm_tag_t wire_tag,
6345 vm_map_entry_t entry,
6346 pmap_t pmap,
6347 vm_map_offset_t pmap_addr,
6348 ppnum_t *physpage_p)
6349 {
6350 vm_object_t object;
6351 vm_object_offset_t offset;
6352 vm_page_t m;
6353 vm_prot_t prot;
6354 thread_t thread = current_thread();
6355 int type_of_fault;
6356 kern_return_t kr;
6357 vm_map_size_t fault_page_size;
6358 vm_map_offset_t fault_phys_offset;
6359 struct vm_object_fault_info fault_info = {};
6360
6361 VM_STAT_INCR(faults);
6362
6363 if (thread != THREAD_NULL && thread->task != TASK_NULL) {
6364 thread->task->faults++;
6365 }
6366
6367 /*
6368 * Recovery actions
6369 */
6370
6371 #undef RELEASE_PAGE
6372 #define RELEASE_PAGE(m) { \
6373 PAGE_WAKEUP_DONE(m); \
6374 vm_page_lockspin_queues(); \
6375 vm_page_unwire(m, TRUE); \
6376 vm_page_unlock_queues(); \
6377 }
6378
6379
6380 #undef UNLOCK_THINGS
6381 #define UNLOCK_THINGS { \
6382 vm_object_paging_end(object); \
6383 vm_object_unlock(object); \
6384 }
6385
6386 #undef UNLOCK_AND_DEALLOCATE
6387 #define UNLOCK_AND_DEALLOCATE { \
6388 UNLOCK_THINGS; \
6389 vm_object_deallocate(object); \
6390 }
6391 /*
6392 * Give up and have caller do things the hard way.
6393 */
6394
6395 #define GIVE_UP { \
6396 UNLOCK_AND_DEALLOCATE; \
6397 return(KERN_FAILURE); \
6398 }
6399
6400
6401 /*
6402 * If this entry is not directly to a vm_object, bail out.
6403 */
6404 if (entry->is_sub_map) {
6405 assert(physpage_p == NULL);
6406 return KERN_FAILURE;
6407 }
6408
6409 /*
6410 * Find the backing store object and offset into it.
6411 */
6412
6413 object = VME_OBJECT(entry);
6414 offset = (va - entry->vme_start) + VME_OFFSET(entry);
6415 prot = entry->protection;
6416
6417 /*
6418 * Make a reference to this object to prevent its
6419 * disposal while we are messing with it.
6420 */
6421
6422 vm_object_lock(object);
6423 vm_object_reference_locked(object);
6424 vm_object_paging_begin(object);
6425
6426 /*
6427 * INVARIANTS (through entire routine):
6428 *
6429 * 1) At all times, we must either have the object
6430 * lock or a busy page in some object to prevent
6431 * some other thread from trying to bring in
6432 * the same page.
6433 *
6434 * 2) Once we have a busy page, we must remove it from
6435 * the pageout queues, so that the pageout daemon
6436 * will not grab it away.
6437 *
6438 */
6439
6440 /*
6441 * Look for page in top-level object. If it's not there or
6442 * there's something going on, give up.
6443 */
6444 m = vm_page_lookup(object, vm_object_trunc_page(offset));
6445 if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
6446 (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
6447 GIVE_UP;
6448 }
6449 if (m->vmp_fictitious &&
6450 VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
6451 /*
6452 * Guard pages are fictitious pages and are never
6453 * entered into a pmap, so let's say it's been wired...
6454 */
6455 kr = KERN_SUCCESS;
6456 goto done;
6457 }
6458
6459 /*
6460 * Wire the page down now. All bail outs beyond this
6461 * point must unwire the page.
6462 */
6463
6464 vm_page_lockspin_queues();
6465 vm_page_wire(m, wire_tag, TRUE);
6466 vm_page_unlock_queues();
6467
6468 /*
6469 * Mark page busy for other threads.
6470 */
6471 assert(!m->vmp_busy);
6472 m->vmp_busy = TRUE;
6473 assert(!m->vmp_absent);
6474
6475 /*
6476 * Give up if the page is being written and there's a copy object
6477 */
6478 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
6479 RELEASE_PAGE(m);
6480 GIVE_UP;
6481 }
6482
6483 fault_info.user_tag = VME_ALIAS(entry);
6484 fault_info.pmap_options = 0;
6485 if (entry->iokit_acct ||
6486 (!entry->is_sub_map && !entry->use_pmap)) {
6487 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
6488 }
6489
6490 fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6491 fault_phys_offset = offset - vm_object_trunc_page(offset);
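/*
 * When the map's page size is smaller than the kernel's PAGE_SIZE
 * (e.g. 4K mappings on a 16K-page system), fault_phys_offset carries
 * the offset of the target sub-page within the VM page; otherwise it is 0.
 */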
6492
6493 /*
6494 * Put this page into the physical map.
6495 */
6496 type_of_fault = DBG_CACHE_HIT_FAULT;
6497 kr = vm_fault_enter(m,
6498 pmap,
6499 pmap_addr,
6500 fault_page_size,
6501 fault_phys_offset,
6502 prot,
6503 prot,
6504 TRUE, /* wired */
6505 FALSE, /* change_wiring */
6506 wire_tag,
6507 &fault_info,
6508 NULL,
6509 &type_of_fault);
6510 if (kr != KERN_SUCCESS) {
6511 RELEASE_PAGE(m);
6512 GIVE_UP;
6513 }
6514
6515 done:
6516 /*
6517 * Unlock everything, and return
6518 */
6519
6520 if (physpage_p) {
6521 /* for vm_map_wire_and_extract() */
6522 if (kr == KERN_SUCCESS) {
6523 assert(object == VM_PAGE_OBJECT(m));
6524 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6525 if (prot & VM_PROT_WRITE) {
6526 vm_object_lock_assert_exclusive(object);
6527 m->vmp_dirty = TRUE;
6528 }
6529 } else {
6530 *physpage_p = 0;
6531 }
6532 }
6533
6534 PAGE_WAKEUP_DONE(m);
6535 UNLOCK_AND_DEALLOCATE;
6536
6537 return kr;
6538 }
6539
6540 /*
6541 * Routine: vm_fault_copy_cleanup
6542 * Purpose:
6543 * Release a page used by vm_fault_copy.
6544 */
6545
6546 static void
6547 vm_fault_copy_cleanup(
6548 vm_page_t page,
6549 vm_page_t top_page)
6550 {
6551 vm_object_t object = VM_PAGE_OBJECT(page);
6552
6553 vm_object_lock(object);
6554 PAGE_WAKEUP_DONE(page);
6555 if (!VM_PAGE_PAGEABLE(page)) {
6556 vm_page_lockspin_queues();
6557 if (!VM_PAGE_PAGEABLE(page)) {
6558 vm_page_activate(page);
6559 }
6560 vm_page_unlock_queues();
6561 }
6562 vm_fault_cleanup(object, top_page);
6563 }
6564
6565 static void
6566 vm_fault_copy_dst_cleanup(
6567 vm_page_t page)
6568 {
6569 vm_object_t object;
6570
6571 if (page != VM_PAGE_NULL) {
6572 object = VM_PAGE_OBJECT(page);
6573 vm_object_lock(object);
6574 vm_page_lockspin_queues();
6575 vm_page_unwire(page, TRUE);
6576 vm_page_unlock_queues();
6577 vm_object_paging_end(object);
6578 vm_object_unlock(object);
6579 }
6580 }
6581
6582 /*
6583 * Routine: vm_fault_copy
6584 *
6585 * Purpose:
6586 * Copy pages from one virtual memory object to another --
6587 * neither the source nor destination pages need be resident.
6588 *
6589 * Before actually copying a page, the version associated with
6590 * the destination address map will be verified.
6591 *
6592 * In/out conditions:
6593 * The caller must hold a reference, but not a lock, to
6594 * each of the source and destination objects and to the
6595 * destination map.
6596 *
6597 * Results:
6598 * Returns KERN_SUCCESS if no errors were encountered in
6599 * reading or writing the data. Returns KERN_INTERRUPTED if
6600 * the operation was interrupted (only possible if the
6601 * "interruptible" argument is asserted). Other return values
6602 * indicate a permanent error in copying the data.
6603 *
6604 * The actual amount of data copied will be returned in the
6605 * "copy_size" argument. In the event that the destination map
6606 * verification failed, this amount may be less than the amount
6607 * requested.
6608 */
6609 kern_return_t
6610 vm_fault_copy(
6611 vm_object_t src_object,
6612 vm_object_offset_t src_offset,
6613 vm_map_size_t *copy_size, /* INOUT */
6614 vm_object_t dst_object,
6615 vm_object_offset_t dst_offset,
6616 vm_map_t dst_map,
6617 vm_map_version_t *dst_version,
6618 int interruptible)
6619 {
6620 vm_page_t result_page;
6621
6622 vm_page_t src_page;
6623 vm_page_t src_top_page;
6624 vm_prot_t src_prot;
6625
6626 vm_page_t dst_page;
6627 vm_page_t dst_top_page;
6628 vm_prot_t dst_prot;
6629
6630 vm_map_size_t amount_left;
6631 vm_object_t old_copy_object;
6632 vm_object_t result_page_object = NULL;
6633 kern_return_t error = 0;
6634 vm_fault_return_t result;
6635
6636 vm_map_size_t part_size;
6637 struct vm_object_fault_info fault_info_src = {};
6638 struct vm_object_fault_info fault_info_dst = {};
6639
6640 /*
6641 * In order not to confuse the clustered pageins, align
6642 * the different offsets on a page boundary.
6643 */
6644
6645 #define RETURN(x) \
6646 MACRO_BEGIN \
6647 *copy_size -= amount_left; \
6648 MACRO_RETURN(x); \
6649 MACRO_END
6650
6651 amount_left = *copy_size;
6652
6653 fault_info_src.interruptible = interruptible;
6654 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
6655 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
6656 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
6657 fault_info_src.stealth = TRUE;
6658
6659 fault_info_dst.interruptible = interruptible;
6660 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
6661 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
6662 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
6663 fault_info_dst.stealth = TRUE;
6664
6665 do { /* while (amount_left > 0) */
6666 /*
6667 * There may be a deadlock if both source and destination
6668 * pages are the same. To avoid this deadlock, the copy must
6669 * start by getting the destination page in order to apply
6670 * COW semantics if any.
6671 */
6672
6673 RetryDestinationFault:;
6674
6675 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
6676
6677 vm_object_lock(dst_object);
6678 vm_object_paging_begin(dst_object);
6679
6680 /* cap cluster size at maximum UPL size */
6681 upl_size_t cluster_size;
6682 if (os_convert_overflow(amount_left, &cluster_size)) {
6683 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6684 }
6685 fault_info_dst.cluster_size = cluster_size;
6686
6687 dst_page = VM_PAGE_NULL;
6688 result = vm_fault_page(dst_object,
6689 vm_object_trunc_page(dst_offset),
6690 VM_PROT_WRITE | VM_PROT_READ,
6691 FALSE,
6692 FALSE, /* page not looked up */
6693 &dst_prot, &dst_page, &dst_top_page,
6694 (int *)0,
6695 &error,
6696 dst_map->no_zero_fill,
6697 FALSE, &fault_info_dst);
6698 switch (result) {
6699 case VM_FAULT_SUCCESS:
6700 break;
6701 case VM_FAULT_RETRY:
6702 goto RetryDestinationFault;
6703 case VM_FAULT_MEMORY_SHORTAGE:
6704 if (vm_page_wait(interruptible)) {
6705 goto RetryDestinationFault;
6706 }
6707 OS_FALLTHROUGH;
6708 case VM_FAULT_INTERRUPTED:
6709 RETURN(MACH_SEND_INTERRUPTED);
6710 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6711 /* success but no VM page: fail the copy */
6712 vm_object_paging_end(dst_object);
6713 vm_object_unlock(dst_object);
6714 OS_FALLTHROUGH;
6715 case VM_FAULT_MEMORY_ERROR:
6716 if (error) {
6717 return error;
6718 } else {
6719 return KERN_MEMORY_ERROR;
6720 }
6721 default:
6722 panic("vm_fault_copy: unexpected error 0x%x from "
6723 "vm_fault_page()\n", result);
6724 }
6725 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
6726
6727 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6728 old_copy_object = dst_object->copy;
6729
6730 /*
6731 * There exists the possibility that the source and
6732 * destination page are the same. But we can't
6733 * easily determine that now. If they are the
6734 * same, the call to vm_fault_page() for the
6735 * destination page will deadlock. To prevent this we
6736 * wire the page so we can drop busy without having
6737 * the page daemon steal the page. We clean up the
6738 * top page but keep the paging reference on the object
6739 * holding the dest page so it doesn't go away.
6740 */
6741
6742 vm_page_lockspin_queues();
6743 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
6744 vm_page_unlock_queues();
6745 PAGE_WAKEUP_DONE(dst_page);
6746 vm_object_unlock(dst_object);
6747
6748 if (dst_top_page != VM_PAGE_NULL) {
6749 vm_object_lock(dst_object);
6750 VM_PAGE_FREE(dst_top_page);
6751 vm_object_paging_end(dst_object);
6752 vm_object_unlock(dst_object);
6753 }
6754
6755 RetrySourceFault:;
6756
6757 if (src_object == VM_OBJECT_NULL) {
6758 /*
6759 * No source object. We will just
6760 * zero-fill the page in dst_object.
6761 */
6762 src_page = VM_PAGE_NULL;
6763 result_page = VM_PAGE_NULL;
6764 } else {
6765 vm_object_lock(src_object);
6766 src_page = vm_page_lookup(src_object,
6767 vm_object_trunc_page(src_offset));
6768 if (src_page == dst_page) {
6769 src_prot = dst_prot;
6770 result_page = VM_PAGE_NULL;
6771 } else {
6772 src_prot = VM_PROT_READ;
6773 vm_object_paging_begin(src_object);
6774
6775 /* cap cluster size at maximum UPL size */
6776 if (os_convert_overflow(amount_left, &cluster_size)) {
6777 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
6778 }
6779 fault_info_src.cluster_size = cluster_size;
6780
6781 result_page = VM_PAGE_NULL;
6782 result = vm_fault_page(
6783 src_object,
6784 vm_object_trunc_page(src_offset),
6785 VM_PROT_READ, FALSE,
6786 FALSE, /* page not looked up */
6787 &src_prot,
6788 &result_page, &src_top_page,
6789 (int *)0, &error, FALSE,
6790 FALSE, &fault_info_src);
6791
6792 switch (result) {
6793 case VM_FAULT_SUCCESS:
6794 break;
6795 case VM_FAULT_RETRY:
6796 goto RetrySourceFault;
6797 case VM_FAULT_MEMORY_SHORTAGE:
6798 if (vm_page_wait(interruptible)) {
6799 goto RetrySourceFault;
6800 }
6801 OS_FALLTHROUGH;
6802 case VM_FAULT_INTERRUPTED:
6803 vm_fault_copy_dst_cleanup(dst_page);
6804 RETURN(MACH_SEND_INTERRUPTED);
6805 case VM_FAULT_SUCCESS_NO_VM_PAGE:
6806 /* success but no VM page: fail */
6807 vm_object_paging_end(src_object);
6808 vm_object_unlock(src_object);
6809 OS_FALLTHROUGH;
6810 case VM_FAULT_MEMORY_ERROR:
6811 vm_fault_copy_dst_cleanup(dst_page);
6812 if (error) {
6813 return error;
6814 } else {
6815 return KERN_MEMORY_ERROR;
6816 }
6817 default:
6818 panic("vm_fault_copy(2): unexpected "
6819 "error 0x%x from "
6820 "vm_fault_page()\n", result);
6821 }
6822
6823 result_page_object = VM_PAGE_OBJECT(result_page);
6824 assert((src_top_page == VM_PAGE_NULL) ==
6825 (result_page_object == src_object));
6826 }
6827 assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
6828 vm_object_unlock(result_page_object);
6829 }
6830
6831 vm_map_lock_read(dst_map);
6832
6833 if (!vm_map_verify(dst_map, dst_version)) {
6834 vm_map_unlock_read(dst_map);
6835 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6836 vm_fault_copy_cleanup(result_page, src_top_page);
6837 }
6838 vm_fault_copy_dst_cleanup(dst_page);
6839 break;
6840 }
6841 assert(dst_object == VM_PAGE_OBJECT(dst_page));
6842
6843 vm_object_lock(dst_object);
6844
6845 if (dst_object->copy != old_copy_object) {
6846 vm_object_unlock(dst_object);
6847 vm_map_unlock_read(dst_map);
6848 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6849 vm_fault_copy_cleanup(result_page, src_top_page);
6850 }
6851 vm_fault_copy_dst_cleanup(dst_page);
6852 break;
6853 }
6854 vm_object_unlock(dst_object);
6855
6856 /*
6857 * Copy the page, and note that it is dirty
6858 * immediately.
6859 */
6860
6861 if (!page_aligned(src_offset) ||
6862 !page_aligned(dst_offset) ||
6863 !page_aligned(amount_left)) {
6864 vm_object_offset_t src_po,
6865 dst_po;
6866
6867 src_po = src_offset - vm_object_trunc_page(src_offset);
6868 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
6869
6870 if (dst_po > src_po) {
6871 part_size = PAGE_SIZE - dst_po;
6872 } else {
6873 part_size = PAGE_SIZE - src_po;
6874 }
6875 if (part_size > (amount_left)) {
6876 part_size = amount_left;
6877 }
6878
6879 if (result_page == VM_PAGE_NULL) {
6880 assert((vm_offset_t) dst_po == dst_po);
6881 assert((vm_size_t) part_size == part_size);
6882 vm_page_part_zero_fill(dst_page,
6883 (vm_offset_t) dst_po,
6884 (vm_size_t) part_size);
6885 } else {
6886 assert((vm_offset_t) src_po == src_po);
6887 assert((vm_offset_t) dst_po == dst_po);
6888 assert((vm_size_t) part_size == part_size);
6889 vm_page_part_copy(result_page,
6890 (vm_offset_t) src_po,
6891 dst_page,
6892 (vm_offset_t) dst_po,
6893 (vm_size_t)part_size);
6894 if (!dst_page->vmp_dirty) {
6895 vm_object_lock(dst_object);
6896 SET_PAGE_DIRTY(dst_page, TRUE);
6897 vm_object_unlock(dst_object);
6898 }
6899 }
6900 } else {
6901 part_size = PAGE_SIZE;
6902
6903 if (result_page == VM_PAGE_NULL) {
6904 vm_page_zero_fill(dst_page);
6905 } else {
6906 vm_object_lock(result_page_object);
6907 vm_page_copy(result_page, dst_page);
6908 vm_object_unlock(result_page_object);
6909
6910 if (!dst_page->vmp_dirty) {
6911 vm_object_lock(dst_object);
6912 SET_PAGE_DIRTY(dst_page, TRUE);
6913 vm_object_unlock(dst_object);
6914 }
6915 }
6916 }
6917
6918 /*
6919 * Unlock everything, and return
6920 */
6921
6922 vm_map_unlock_read(dst_map);
6923
6924 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
6925 vm_fault_copy_cleanup(result_page, src_top_page);
6926 }
6927 vm_fault_copy_dst_cleanup(dst_page);
6928
6929 amount_left -= part_size;
6930 src_offset += part_size;
6931 dst_offset += part_size;
6932 } while (amount_left > 0);
6933
6934 RETURN(KERN_SUCCESS);
6935 #undef RETURN
6936
6937 /*NOTREACHED*/
6938 }
6939
6940 #if VM_FAULT_CLASSIFY
6941 /*
6942 * Temporary statistics gathering support.
6943 */
6944
6945 /*
6946 * Statistics arrays:
6947 */
6948 #define VM_FAULT_TYPES_MAX 5
6949 #define VM_FAULT_LEVEL_MAX 8
6950
6951 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
6952
6953 #define VM_FAULT_TYPE_ZERO_FILL 0
6954 #define VM_FAULT_TYPE_MAP_IN 1
6955 #define VM_FAULT_TYPE_PAGER 2
6956 #define VM_FAULT_TYPE_COPY 3
6957 #define VM_FAULT_TYPE_OTHER 4
6958
6959
6960 void
6961 vm_fault_classify(vm_object_t object,
6962 vm_object_offset_t offset,
6963 vm_prot_t fault_type)
6964 {
6965 int type, level = 0;
6966 vm_page_t m;
6967
6968 while (TRUE) {
6969 m = vm_page_lookup(object, offset);
6970 if (m != VM_PAGE_NULL) {
6971 if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6972 type = VM_FAULT_TYPE_OTHER;
6973 break;
6974 }
6975 if (((fault_type & VM_PROT_WRITE) == 0) ||
6976 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6977 type = VM_FAULT_TYPE_MAP_IN;
6978 break;
6979 }
6980 type = VM_FAULT_TYPE_COPY;
6981 break;
6982 } else {
6983 if (object->pager_created) {
6984 type = VM_FAULT_TYPE_PAGER;
6985 break;
6986 }
6987 if (object->shadow == VM_OBJECT_NULL) {
6988 type = VM_FAULT_TYPE_ZERO_FILL;
6989 break;
6990 }
6991
6992 offset += object->vo_shadow_offset;
6993 object = object->shadow;
6994 level++;
6995 continue;
6996 }
6997 }
6998
6999 if (level > VM_FAULT_LEVEL_MAX) {
7000 level = VM_FAULT_LEVEL_MAX;
7001 }
7002
7003 vm_fault_stats[type][level] += 1;
7004
7005 return;
7006 }
7007
7008 /* cleanup routine to call from debugger */
7009
7010 void
7011 vm_fault_classify_init(void)
7012 {
7013 int type, level;
7014
7015 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
7016 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
7017 vm_fault_stats[type][level] = 0;
7018 }
7019 }
7020
7021 return;
7022 }
7023 #endif /* VM_FAULT_CLASSIFY */
7024
7025 vm_offset_t
7026 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
7027 {
7028 vm_map_entry_t entry;
7029 vm_object_t object;
7030 vm_offset_t object_offset;
7031 vm_page_t m;
7032 int compressor_external_state, compressed_count_delta;
7033 int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
7034 int my_fault_type = VM_PROT_READ;
7035 kern_return_t kr;
7036 int effective_page_mask, effective_page_size;
7037
7038 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
7039 effective_page_mask = VM_MAP_PAGE_MASK(map);
7040 effective_page_size = VM_MAP_PAGE_SIZE(map);
7041 } else {
7042 effective_page_mask = PAGE_MASK;
7043 effective_page_size = PAGE_SIZE;
7044 }
7045
7046 if (not_in_kdp) {
7047 panic("kdp_lightweight_fault called from outside of debugger context");
7048 }
7049
7050 assert(map != VM_MAP_NULL);
7051
7052 assert((cur_target_addr & effective_page_mask) == 0);
7053 if ((cur_target_addr & effective_page_mask) != 0) {
7054 return 0;
7055 }
7056
7057 if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
7058 return 0;
7059 }
7060
7061 if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
7062 return 0;
7063 }
7064
7065 if (entry->is_sub_map) {
7066 return 0;
7067 }
7068
7069 object = VME_OBJECT(entry);
7070 if (object == VM_OBJECT_NULL) {
7071 return 0;
7072 }
7073
7074 object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
7075
7076 while (TRUE) {
7077 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
7078 return 0;
7079 }
7080
7081 if (object->pager_created && (object->paging_in_progress ||
7082 object->activity_in_progress)) {
7083 return 0;
7084 }
7085
7086 m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
7087
7088 if (m != VM_PAGE_NULL) {
7089 if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
7090 return 0;
7091 }
7092
7093 if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
7094 m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
7095 return 0;
7096 }
7097
7098 assert(!m->vmp_private);
7099 if (m->vmp_private) {
7100 return 0;
7101 }
7102
7103 assert(!m->vmp_fictitious);
7104 if (m->vmp_fictitious) {
7105 return 0;
7106 }
7107
7108 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7109 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7110 return 0;
7111 }
7112
7113 return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
7114 }
7115
7116 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
7117
7118 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
7119 if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
7120 kr = vm_compressor_pager_get(object->pager,
7121 vm_object_trunc_page(object_offset + object->paging_offset),
7122 kdp_compressor_decompressed_page_ppnum, &my_fault_type,
7123 compressor_flags, &compressed_count_delta);
7124 if (kr == KERN_SUCCESS) {
7125 return kdp_compressor_decompressed_page_paddr;
7126 } else {
7127 return 0;
7128 }
7129 }
7130 }
7131
7132 if (object->shadow == VM_OBJECT_NULL) {
7133 return 0;
7134 }
7135
7136 object_offset += object->vo_shadow_offset;
7137 object = object->shadow;
7138 }
7139 }
7140
7141 /*
7142 * vm_page_validate_cs_fast():
7143 * Performs a few quick checks to determine if the page's code signature
7144 * really needs to be fully validated. It could:
7145 * 1. have been modified (i.e. automatically tainted),
7146 * 2. have already been validated,
7147 * 3. have already been found to be tainted,
7148 * 4. no longer have a backing store.
7149 * Returns FALSE if the page needs to be fully validated.
7150 */
7151 static boolean_t
7152 vm_page_validate_cs_fast(
7153 vm_page_t page,
7154 vm_map_size_t fault_page_size,
7155 vm_map_offset_t fault_phys_offset)
7156 {
7157 vm_object_t object;
7158
7159 object = VM_PAGE_OBJECT(page);
7160 vm_object_lock_assert_held(object);
7161
7162 if (page->vmp_wpmapped &&
7163 !VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
7164 /*
7165 * This page was mapped for "write" access sometime in the
7166 * past and could still be modifiable in the future.
7167 * Consider it tainted.
7168 * [ If the page was already found to be "tainted", no
7169 * need to re-validate. ]
7170 */
7171 vm_object_lock_assert_exclusive(object);
7172 VMP_CS_SET_VALIDATED(page, fault_page_size, fault_phys_offset, TRUE);
7173 VMP_CS_SET_TAINTED(page, fault_page_size, fault_phys_offset, TRUE);
7174 if (cs_debug) {
7175 printf("CODESIGNING: %s: "
7176 "page %p obj %p off 0x%llx "
7177 "was modified\n",
7178 __FUNCTION__,
7179 page, object, page->vmp_offset);
7180 }
7181 vm_cs_validated_dirtied++;
7182 }
7183
7184 if (VMP_CS_VALIDATED(page, fault_page_size, fault_phys_offset) ||
7185 VMP_CS_TAINTED(page, fault_page_size, fault_phys_offset)) {
7186 return TRUE;
7187 }
7188 vm_object_lock_assert_exclusive(object);
7189
7190 #if CHECK_CS_VALIDATION_BITMAP
7191 kern_return_t kr;
7192
7193 kr = vnode_pager_cs_check_validation_bitmap(
7194 object->pager,
7195 page->vmp_offset + object->paging_offset,
7196 CS_BITMAP_CHECK);
7197 if (kr == KERN_SUCCESS) {
7198 page->vmp_cs_validated = VMP_CS_ALL_TRUE;
7199 page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
7200 vm_cs_bitmap_validated++;
7201 return TRUE;
7202 }
7203 #endif /* CHECK_CS_VALIDATION_BITMAP */
7204
7205 if (!object->alive || object->terminating || object->pager == NULL) {
7206 /*
7207 * The object is terminating and we don't have its pager
7208 * so we can't validate the data...
7209 */
7210 return TRUE;
7211 }
7212
7213 /* we need to really validate this page */
7214 vm_object_lock_assert_exclusive(object);
7215 return FALSE;
7216 }
7217
7218 void
7219 vm_page_validate_cs_mapped_slow(
7220 vm_page_t page,
7221 const void *kaddr)
7222 {
7223 vm_object_t object;
7224 memory_object_offset_t mo_offset;
7225 memory_object_t pager;
7226 struct vnode *vnode;
7227 int validated, tainted, nx;
7228
7229 assert(page->vmp_busy);
7230 object = VM_PAGE_OBJECT(page);
7231 vm_object_lock_assert_exclusive(object);
7232
7233 vm_cs_validates++;
7234
7235 /*
7236 * Since we get here to validate a page that was brought in by
7237 * the pager, we know that this pager is all set up and ready
7238 * by now.
7239 */
7240 assert(object->code_signed);
7241 assert(!object->internal);
7242 assert(object->pager != NULL);
7243 assert(object->pager_ready);
7244
7245 pager = object->pager;
7246 assert(object->paging_in_progress);
7247 vnode = vnode_pager_lookup_vnode(pager);
7248 mo_offset = page->vmp_offset + object->paging_offset;
7249
7250 /* verify the SHA1 hash for this page */
7251 validated = 0;
7252 tainted = 0;
7253 nx = 0;
7254 cs_validate_page(vnode,
7255 pager,
7256 mo_offset,
7257 (const void *)((const char *)kaddr),
7258 &validated,
7259 &tainted,
7260 &nx);
7261
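/*
 * Merge (bitwise OR) the results into the page's existing code-signing
 * state rather than overwriting it, so anything already recorded for
 * this page is preserved.
 */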
7262 page->vmp_cs_validated |= validated;
7263 page->vmp_cs_tainted |= tainted;
7264 page->vmp_cs_nx |= nx;
7265
7266 #if CHECK_CS_VALIDATION_BITMAP
7267 if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7268 page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7269 vnode_pager_cs_check_validation_bitmap(object->pager,
7270 mo_offset,
7271 CS_BITMAP_SET);
7272 }
7273 #endif /* CHECK_CS_VALIDATION_BITMAP */
7274 }
7275
7276 void
7277 vm_page_validate_cs_mapped(
7278 vm_page_t page,
7279 vm_map_size_t fault_page_size,
7280 vm_map_offset_t fault_phys_offset,
7281 const void *kaddr)
7282 {
7283 if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7284 vm_page_validate_cs_mapped_slow(page, kaddr);
7285 }
7286 }
7287
7288 void
7289 vm_page_validate_cs(
7290 vm_page_t page,
7291 vm_map_size_t fault_page_size,
7292 vm_map_offset_t fault_phys_offset)
7293 {
7294 vm_object_t object;
7295 vm_object_offset_t offset;
7296 vm_map_offset_t koffset;
7297 vm_map_size_t ksize;
7298 vm_offset_t kaddr;
7299 kern_return_t kr;
7300 boolean_t busy_page;
7301 boolean_t need_unmap;
7302
7303 object = VM_PAGE_OBJECT(page);
7304 vm_object_lock_assert_held(object);
7305
7306 if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7307 return;
7308 }
7309 vm_object_lock_assert_exclusive(object);
7310
7311 assert(object->code_signed);
7312 offset = page->vmp_offset;
7313
7314 busy_page = page->vmp_busy;
7315 if (!busy_page) {
7316 /* keep page busy while we map (and unlock) the VM object */
7317 page->vmp_busy = TRUE;
7318 }
7319
7320 /*
7321 * Take a paging reference on the VM object
7322 * to protect it from collapse or bypass,
7323 * and keep it from disappearing too.
7324 */
7325 vm_object_paging_begin(object);
7326
7327 /* map the page in the kernel address space */
7328 ksize = PAGE_SIZE_64;
7329 koffset = 0;
7330 need_unmap = FALSE;
7331 kr = vm_paging_map_object(page,
7332 object,
7333 offset,
7334 VM_PROT_READ,
7335 FALSE, /* can't unlock object ! */
7336 &ksize,
7337 &koffset,
7338 &need_unmap);
7339 if (kr != KERN_SUCCESS) {
7340 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
7341 }
7342 kaddr = CAST_DOWN(vm_offset_t, koffset);
7343
7344 /* validate the mapped page */
7345 vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
7346
7347 assert(page->vmp_busy);
7348 assert(object == VM_PAGE_OBJECT(page));
7349 vm_object_lock_assert_exclusive(object);
7350
7351 if (!busy_page) {
7352 PAGE_WAKEUP_DONE(page);
7353 }
7354 if (need_unmap) {
7355 /* unmap the map from the kernel address space */
7356 vm_paging_unmap_object(object, koffset, koffset + ksize);
7357 koffset = 0;
7358 ksize = 0;
7359 kaddr = 0;
7360 }
7361 vm_object_paging_end(object);
7362 }
7363
7364 void
7365 vm_page_validate_cs_mapped_chunk(
7366 vm_page_t page,
7367 const void *kaddr,
7368 vm_offset_t chunk_offset,
7369 vm_size_t chunk_size,
7370 boolean_t *validated_p,
7371 unsigned *tainted_p)
7372 {
7373 vm_object_t object;
7374 vm_object_offset_t offset, offset_in_page;
7375 memory_object_t pager;
7376 struct vnode *vnode;
7377 boolean_t validated;
7378 unsigned tainted;
7379
7380 *validated_p = FALSE;
7381 *tainted_p = 0;
7382
7383 assert(page->vmp_busy);
7384 object = VM_PAGE_OBJECT(page);
7385 vm_object_lock_assert_exclusive(object);
7386
7387 assert(object->code_signed);
7388 offset = page->vmp_offset;
7389
7390 if (!object->alive || object->terminating || object->pager == NULL) {
7391 /*
7392 * The object is terminating and we don't have its pager
7393 * so we can't validate the data...
7394 */
7395 return;
7396 }
7397 /*
7398 * Since we get here to validate a page that was brought in by
7399 * the pager, we know that this pager is all set up and ready
7400 * by now.
7401 */
7402 assert(!object->internal);
7403 assert(object->pager != NULL);
7404 assert(object->pager_ready);
7405
7406 pager = object->pager;
7407 assert(object->paging_in_progress);
7408 vnode = vnode_pager_lookup_vnode(pager);
7409
7410 /* verify the signature for this chunk */
7411 offset_in_page = chunk_offset;
7412 assert(offset_in_page < PAGE_SIZE);
7413
7414 tainted = 0;
7415 validated = cs_validate_range(vnode,
7416 pager,
7417 (object->paging_offset +
7418 offset +
7419 offset_in_page),
7420 (const void *)((const char *)kaddr
7421 + offset_in_page),
7422 chunk_size,
7423 &tainted);
7424 if (validated) {
7425 *validated_p = TRUE;
7426 }
7427 if (tainted) {
7428 *tainted_p = tainted;
7429 }
7430 }
7431
7432 static void
7433 vm_rtfrecord_lock(void)
7434 {
7435 lck_spin_lock(&vm_rtfr_slock);
7436 }
7437
7438 static void
7439 vm_rtfrecord_unlock(void)
7440 {
7441 lck_spin_unlock(&vm_rtfr_slock);
7442 }
7443
7444 unsigned int
7445 vmrtfaultinfo_bufsz(void)
7446 {
7447 return vmrtf_num_records * sizeof(vm_rtfault_record_t);
7448 }
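/*
 * Presumably used by callers to size the buffer handed to
 * vmrtf_extract() below so that one full pass over the record ring fits.
 */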
7449
7450 #include <kern/backtrace.h>
7451
7452 __attribute__((noinline))
7453 static void
7454 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7455 {
7456 uint64_t fend = mach_continuous_time();
7457
7458 uint64_t cfpc = 0;
7459 uint64_t ctid = cthread->thread_id;
7460 uint64_t cupid = get_current_unique_pid();
7461
7462 uintptr_t bpc = 0;
7463 int btr = 0;
7464 bool u64 = false;
7465
7466 /* Capture a single-frame backtrace; this extracts just the program
7467 * counter at the point of the fault into "bpc", and should perform no
7468 * further user stack traversals, thus avoiding copyin()s and further
7469 * faults.
7470 */
7471 unsigned int bfrs = backtrace_thread_user(cthread, &bpc, 1U, &btr, &u64, NULL, false);
7472
7473 if ((btr == 0) && (bfrs > 0)) {
7474 cfpc = bpc;
7475 }
7476
7477 assert((fstart != 0) && fend >= fstart);
7478 vm_rtfrecord_lock();
7479 assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
7480
7481 vmrtfrs.vmrtf_total++;
7482 vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
7483
7484 cvmr->rtfabstime = fstart;
7485 cvmr->rtfduration = fend - fstart;
7486 cvmr->rtfaddr = fault_vaddr;
7487 cvmr->rtfpc = cfpc;
7488 cvmr->rtftype = type_of_fault;
7489 cvmr->rtfupid = cupid;
7490 cvmr->rtftid = ctid;
7491
7492 if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
7493 vmrtfrs.vmrtfr_curi = 0;
7494 }
7495
7496 vm_rtfrecord_unlock();
7497 }
7498
7499 int
7500 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void *vrecords, unsigned long *vmrtfrv)
7501 {
7502 vm_rtfault_record_t *cvmrd = vrecords;
7503 size_t residue = vrecordsz;
7504 size_t numextracted = 0;
7505 boolean_t early_exit = FALSE;
7506
7507 vm_rtfrecord_lock();
7508
7509 for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
7510 if (residue < sizeof(vm_rtfault_record_t)) {
7511 early_exit = TRUE;
7512 break;
7513 }
7514
7515 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
7516 #if DEVELOPMENT || DEBUG
7517 if (isroot == FALSE) {
7518 continue;
7519 }
7520 #else
7521 continue;
7522 #endif /* DEVELOPMENT || DEBUG */
7523 }
7524
7525 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
7526 cvmrd++;
7527 residue -= sizeof(vm_rtfault_record_t);
7528 numextracted++;
7529 }
7530
7531 vm_rtfrecord_unlock();
7532
7533 *vmrtfrv = numextracted;
7534 return early_exit;
7535 }