1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <libkern/OSAtomic.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/kern_return.h>
71 #include <mach/message.h> /* for error codes */
72 #include <mach/vm_param.h>
73 #include <mach/vm_behavior.h>
74 #include <mach/memory_object.h>
75 /* For memory_object_data_{request,unlock} */
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/host_statistics.h>
80 #include <kern/counters.h>
81 #include <kern/task.h>
82 #include <kern/thread.h>
83 #include <kern/sched_prim.h>
84 #include <kern/host.h>
85 #include <kern/xpr.h>
86 #include <kern/mach_param.h>
87 #include <kern/macro_help.h>
88 #include <kern/zalloc.h>
89 #include <kern/misc_protos.h>
90
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_kern.h>
96 #include <vm/pmap.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_external.h>
100 #include <vm/memory_object.h>
101 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
102 #include <vm/vm_shared_region.h>
103
104 #define VM_FAULT_CLASSIFY 0
105
106 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
107
108 int vm_object_pagein_throttle = 16;
109
110 /*
111 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control; it
112 * kicks in when swap space runs out. 64-bit programs have massive address spaces and, if they're buggy, can leak
113 * enormous amounts of memory and run the system completely out of swap space. If this happens, we
114 * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
115 * keep the UI active so that the user has a chance to kill the offending task before the system
116 * completely hangs.
117 *
118 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
119 * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
120 * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
121 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
122 */
123
124 extern boolean_t thread_is_io_throttled(void);
125 extern void throttle_lowpri_io(int);
126
127 uint64_t vm_hard_throttle_threshold;
128
129 extern unsigned int dp_pages_free, dp_pages_reserve;
130
131 #define NEED_TO_HARD_THROTTLE_THIS_TASK() (((dp_pages_free + dp_pages_reserve < 2000) && \
132 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
133 (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \
134 (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
135 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))
136
137
138 #define HARD_THROTTLE_DELAY 20000 /* 20000 us == 20 ms */
139 #define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */
140
141
142 extern int cs_debug;
143
144 boolean_t current_thread_aborted(void);
145
146 /* Forward declarations of internal routines. */
147 extern kern_return_t vm_fault_wire_fast(
148 vm_map_t map,
149 vm_map_offset_t va,
150 vm_map_entry_t entry,
151 pmap_t pmap,
152 vm_map_offset_t pmap_addr);
153
154 extern void vm_fault_continue(void);
155
156 extern void vm_fault_copy_cleanup(
157 vm_page_t page,
158 vm_page_t top_page);
159
160 extern void vm_fault_copy_dst_cleanup(
161 vm_page_t page);
162
163 #if VM_FAULT_CLASSIFY
164 extern void vm_fault_classify(vm_object_t object,
165 vm_object_offset_t offset,
166 vm_prot_t fault_type);
167
168 extern void vm_fault_classify_init(void);
169 #endif
170
171 unsigned long vm_pmap_enter_blocked = 0;
172 unsigned long vm_pmap_enter_retried = 0;
173
174 unsigned long vm_cs_validates = 0;
175 unsigned long vm_cs_revalidates = 0;
176 unsigned long vm_cs_query_modified = 0;
177 unsigned long vm_cs_validated_dirtied = 0;
178 unsigned long vm_cs_bitmap_validated = 0;
179 #if CONFIG_ENFORCE_SIGNED_CODE
180 int cs_enforcement_disable=0;
181 #else
182 static const int cs_enforcement_disable=1;
183 #endif
184
185 /*
186 * Routine: vm_fault_init
187 * Purpose:
188 * Initialize our private data structures.
189 */
190 void
191 vm_fault_init(void)
192 {
193 #if !SECURE_KERNEL
194 #if CONFIG_ENFORCE_SIGNED_CODE
195 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
196 sizeof (cs_enforcement_disable));
197 #endif
198 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
199 #endif
200
201 /*
202 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
203 * computed as a percentage of available memory, and the percentage used is scaled inversely with
204 * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
205 * and reduce the value down to 10% for very large memory configurations. This helps give us a
206 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
207 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
208 */
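/*
 * For example: with 4GB of ram the percentage is 35 - MIN(4, 25) = 31, giving a
 * threshold of ~1.24GB; for machines with 25GB or more it bottoms out at 35 - 25 = 10%,
 * e.g. ~6.4GB on a 64GB machine.
 */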
209
210 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
211 }
212
213 /*
214 * Routine: vm_fault_cleanup
215 * Purpose:
216 * Clean up the result of vm_fault_page.
217 * Results:
218 * The paging reference for "object" is released.
219 * "object" is unlocked.
220 * If "top_page" is not null, "top_page" is
221 * freed and the paging reference for the object
222 * containing it is released.
223 *
224 * In/out conditions:
225 * "object" must be locked.
226 */
227 void
228 vm_fault_cleanup(
229 register vm_object_t object,
230 register vm_page_t top_page)
231 {
232 vm_object_paging_end(object);
233 vm_object_unlock(object);
234
235 if (top_page != VM_PAGE_NULL) {
236 object = top_page->object;
237
238 vm_object_lock(object);
239 VM_PAGE_FREE(top_page);
240 vm_object_paging_end(object);
241 vm_object_unlock(object);
242 }
243 }
244
245 #if MACH_CLUSTER_STATS
246 #define MAXCLUSTERPAGES 16
247 struct {
248 unsigned long pages_in_cluster;
249 unsigned long pages_at_higher_offsets;
250 unsigned long pages_at_lower_offsets;
251 } cluster_stats_in[MAXCLUSTERPAGES];
252 #define CLUSTER_STAT(clause) clause
253 #define CLUSTER_STAT_HIGHER(x) \
254 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
255 #define CLUSTER_STAT_LOWER(x) \
256 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
257 #define CLUSTER_STAT_CLUSTER(x) \
258 ((cluster_stats_in[(x)].pages_in_cluster)++)
259 #else /* MACH_CLUSTER_STATS */
260 #define CLUSTER_STAT(clause)
261 #endif /* MACH_CLUSTER_STATS */
262
263 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
264
265
266 boolean_t vm_page_deactivate_behind = TRUE;
267 /*
268 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
269 */
270 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
271 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
272 /* we use it to size an array on the stack */
273
274 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
275
276 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
277
278 /*
279 * vm_fault_is_sequential
280 *
281 * Determine if sequential access is in progress
282 * in accordance with the behavior specified.
283 * Update state to indicate current access pattern.
284 *
285 * object must have at least the shared lock held
286 */
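/*
 * Note: object->sequential is a signed count of bytes in the current run; it grows by
 * PAGE_SIZE for each access that extends a forward run, shrinks by PAGE_SIZE for each
 * access that extends a backward run, is clamped at +/- MAX_SEQUENTIAL_RUN, and is
 * reset to 0 whenever the pattern stops looking sequential.
 */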
287 static
288 void
289 vm_fault_is_sequential(
290 vm_object_t object,
291 vm_object_offset_t offset,
292 vm_behavior_t behavior)
293 {
294 vm_object_offset_t last_alloc;
295 int sequential;
296 int orig_sequential;
297
298 last_alloc = object->last_alloc;
299 sequential = object->sequential;
300 orig_sequential = sequential;
301
302 switch (behavior) {
303 case VM_BEHAVIOR_RANDOM:
304 /*
305 * reset indicator of sequential behavior
306 */
307 sequential = 0;
308 break;
309
310 case VM_BEHAVIOR_SEQUENTIAL:
311 if (offset && last_alloc == offset - PAGE_SIZE_64) {
312 /*
313 * advance indicator of sequential behavior
314 */
315 if (sequential < MAX_SEQUENTIAL_RUN)
316 sequential += PAGE_SIZE;
317 } else {
318 /*
319 * reset indicator of sequential behavior
320 */
321 sequential = 0;
322 }
323 break;
324
325 case VM_BEHAVIOR_RSEQNTL:
326 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
327 /*
328 * advance indicator of sequential behavior
329 */
330 if (sequential > -MAX_SEQUENTIAL_RUN)
331 sequential -= PAGE_SIZE;
332 } else {
333 /*
334 * reset indicator of sequential behavior
335 */
336 sequential = 0;
337 }
338 break;
339
340 case VM_BEHAVIOR_DEFAULT:
341 default:
342 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
343 /*
344 * advance indicator of sequential behavior
345 */
346 if (sequential < 0)
347 sequential = 0;
348 if (sequential < MAX_SEQUENTIAL_RUN)
349 sequential += PAGE_SIZE;
350
351 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
352 /*
353 * advance indicator of sequential behavior
354 */
355 if (sequential > 0)
356 sequential = 0;
357 if (sequential > -MAX_SEQUENTIAL_RUN)
358 sequential -= PAGE_SIZE;
359 } else {
360 /*
361 * reset indicator of sequential behavior
362 */
363 sequential = 0;
364 }
365 break;
366 }
367 if (sequential != orig_sequential) {
368 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
369 /*
370 * if someone else has already updated object->sequential
371 * don't bother trying to update it or object->last_alloc
372 */
373 return;
374 }
375 }
376 /*
377 * I'd like to do this with an OSCompareAndSwap64, but that
378 * doesn't exist for PPC... however, it shouldn't matter
379 * that much... last_alloc is maintained so that we can determine
380 * if a sequential access pattern is taking place... if only
381 * one thread is banging on this object, no problem with the unprotected
382 * update... if 2 or more threads are banging away, we run the risk of
383 * someone seeing a mangled update... however, in the face of multiple
384 * accesses, no sequential access pattern can develop anyway, so we
385 * haven't lost any real info.
386 */
387 object->last_alloc = offset;
388 }
389
390
391 int vm_page_deactivate_behind_count = 0;
392
393 /*
394 * vm_fault_deactivate_behind
395 *
396 * Determine if sequential access is in progress
397 * in accordance with the behavior specified. If
398 * so, compute a potential page to deactivate and
399 * deactivate it.
400 *
401 * object must be locked.
402 *
403 * return TRUE if we actually deactivate a page
404 */
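/*
 * For VM_BEHAVIOR_SEQUENTIAL/RSEQNTL runs we deactivate just the single page directly
 * behind (or, for reverse runs, ahead of) the faulting offset.  For VM_BEHAVIOR_DEFAULT,
 * once the run reaches vm_default_behind pages we deactivate, every
 * VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER pages, the cluster of that many pages trailing
 * the faulting offset by vm_default_behind pages.
 */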
405 static
406 boolean_t
407 vm_fault_deactivate_behind(
408 vm_object_t object,
409 vm_object_offset_t offset,
410 vm_behavior_t behavior)
411 {
412 int n;
413 int pages_in_run = 0;
414 int max_pages_in_run = 0;
415 int sequential_run;
416 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
417 vm_object_offset_t run_offset = 0;
418 vm_object_offset_t pg_offset = 0;
419 vm_page_t m;
420 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
421
422 pages_in_run = 0;
423 #if TRACEFAULTPAGE
424 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
425 #endif
426
427 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
428 /*
429 * Do not deactivate pages from the kernel object: they
430 * are not intended to become pageable... or the
431 * deactivate-behind mechanism has been disabled.
432 */
433 return FALSE;
434 }
435 if ((sequential_run = object->sequential)) {
436 if (sequential_run < 0) {
437 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
438 sequential_run = 0 - sequential_run;
439 } else {
440 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
441 }
442 }
443 switch (behavior) {
444 case VM_BEHAVIOR_RANDOM:
445 break;
446 case VM_BEHAVIOR_SEQUENTIAL:
447 if (sequential_run >= (int)PAGE_SIZE) {
448 run_offset = 0 - PAGE_SIZE_64;
449 max_pages_in_run = 1;
450 }
451 break;
452 case VM_BEHAVIOR_RSEQNTL:
453 if (sequential_run >= (int)PAGE_SIZE) {
454 run_offset = PAGE_SIZE_64;
455 max_pages_in_run = 1;
456 }
457 break;
458 case VM_BEHAVIOR_DEFAULT:
459 default:
460 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
461
462 /*
463 * determine if the run of sequential access has been
464 * long enough on an object with default access behavior
465 * to consider it for deactivation
466 */
467 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
468 /*
469 * the comparisons between offset and behind are done
470 * in this kind of odd fashion in order to prevent wrap around
471 * at the end points
472 */
473 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
474 if (offset >= behind) {
475 run_offset = 0 - behind;
476 pg_offset = PAGE_SIZE_64;
477 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
478 }
479 } else {
480 if (offset < -behind) {
481 run_offset = behind;
482 pg_offset = 0 - PAGE_SIZE_64;
483 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
484 }
485 }
486 }
487 break;
488 }
489 }
490 for (n = 0; n < max_pages_in_run; n++) {
491 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
492
493 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
494 page_run[pages_in_run++] = m;
495 pmap_clear_reference(m->phys_page);
496 }
497 }
498 if (pages_in_run) {
499 vm_page_lockspin_queues();
500
501 for (n = 0; n < pages_in_run; n++) {
502
503 m = page_run[n];
504
505 vm_page_deactivate_internal(m, FALSE);
506
507 vm_page_deactivate_behind_count++;
508 #if TRACEFAULTPAGE
509 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
510 #endif
511 }
512 vm_page_unlock_queues();
513
514 return TRUE;
515 }
516 return FALSE;
517 }
518
519
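/*
 * vm_page_throttled returns the delay in microseconds to impose on the current thread
 * before it may create another page, or 0 for no throttling.  Threads marked
 * TH_OPT_VMPRIV are never throttled.  The hard delay is driven by
 * NEED_TO_HARD_THROTTLE_THIS_TASK(); the soft delay applies when free pages are below
 * vm_page_throttle_limit and this thread has created more than vm_page_creation_throttle
 * pages, unless it has been creating them slowly (less than roughly
 * vm_page_creation_throttle / 6 pages per second, measured over more than 6 seconds).
 */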
520 static int
521 vm_page_throttled(void)
522 {
523 clock_sec_t elapsed_sec;
524 clock_sec_t tv_sec;
525 clock_usec_t tv_usec;
526
527 thread_t thread = current_thread();
528
529 if (thread->options & TH_OPT_VMPRIV)
530 return (0);
531
532 thread->t_page_creation_count++;
533
534 if (NEED_TO_HARD_THROTTLE_THIS_TASK())
535 return (HARD_THROTTLE_DELAY);
536
537 if (vm_page_free_count < vm_page_throttle_limit &&
538 thread->t_page_creation_count > vm_page_creation_throttle) {
539
540 clock_get_system_microtime(&tv_sec, &tv_usec);
541
542 elapsed_sec = tv_sec - thread->t_page_creation_time;
543
544 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
545
546 if (elapsed_sec >= 60) {
547 /*
548 * we'll reset our stats to give a well behaved app
549 * that was unlucky enough to accumulate a bunch of pages
550 * over a long period of time a chance to get out of
551 * the throttled state... we reset the counter and timestamp
552 * so that if it stays under the rate limit for the next second
553 * it will be back in our good graces... if it exceeds it, it
554 * will remain in the throttled state
555 */
556 thread->t_page_creation_time = tv_sec;
557 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
558 }
559 ++vm_page_throttle_count;
560
561 return (SOFT_THROTTLE_DELAY);
562 }
563 thread->t_page_creation_time = tv_sec;
564 thread->t_page_creation_count = 0;
565 }
566 return (0);
567 }
568
569
570 /*
571 * check for various conditions that would
572 * prevent us from creating a ZF page...
573 * cleanup is based on being called from vm_fault_page
574 *
575 * object must be locked
576 * object == m->object
577 */
578 static vm_fault_return_t
579 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
580 {
581 int throttle_delay;
582
583 if (object->shadow_severed ||
584 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
585 /*
586 * Either:
587 * 1. the shadow chain was severed,
588 * 2. the purgeable object is volatile or empty and is marked
589 * to fault on access while volatile.
590 * Just have to return an error at this point
591 */
592 if (m != VM_PAGE_NULL)
593 VM_PAGE_FREE(m);
594 vm_fault_cleanup(object, first_m);
595
596 thread_interrupt_level(interruptible_state);
597
598 return (VM_FAULT_MEMORY_ERROR);
599 }
600 if (vm_backing_store_low) {
601 /*
602 * are we protecting the system from
603 * backing store exhaustion? If so,
604 * sleep unless we are privileged.
605 */
606 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
607
608 if (m != VM_PAGE_NULL)
609 VM_PAGE_FREE(m);
610 vm_fault_cleanup(object, first_m);
611
612 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
613
614 thread_block(THREAD_CONTINUE_NULL);
615 thread_interrupt_level(interruptible_state);
616
617 return (VM_FAULT_RETRY);
618 }
619 }
620 if ((throttle_delay = vm_page_throttled())) {
621 /*
622 * we're throttling zero-fills...
623 * treat this as if we couldn't grab a page
624 */
625 if (m != VM_PAGE_NULL)
626 VM_PAGE_FREE(m);
627 vm_fault_cleanup(object, first_m);
628
629 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
630
631 delay(throttle_delay);
632
633 if (current_thread_aborted()) {
634 thread_interrupt_level(interruptible_state);
635 return VM_FAULT_INTERRUPTED;
636 }
637 thread_interrupt_level(interruptible_state);
638
639 return (VM_FAULT_MEMORY_SHORTAGE);
640 }
641 return (VM_FAULT_SUCCESS);
642 }
643
644
645 /*
646 * do the work to zero fill a page and
647 * inject it into the correct paging queue
648 *
649 * m->object must be locked
650 * page queue lock must NOT be held
651 */
652 static int
653 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
654 {
655 int my_fault = DBG_ZERO_FILL_FAULT;
656
657 /*
658 * This is a zero-fill page fault...
659 *
660 * Checking the page lock is a waste of
661 * time; this page was absent, so
662 * it can't be page locked by a pager.
663 *
664 * we also consider it undefined
665 * with respect to instruction
666 * execution. i.e. it is the responsibility
667 * of higher layers to call for an instruction
668 * sync after changing the contents and before
669 * sending a program into this area. We
670 * choose this approach for performance
671 */
672 m->pmapped = TRUE;
673
674 m->cs_validated = FALSE;
675 m->cs_tainted = FALSE;
676
677 if (no_zero_fill == TRUE) {
678 my_fault = DBG_NZF_PAGE_FAULT;
679 } else {
680 vm_page_zero_fill(m);
681
682 VM_STAT_INCR(zero_fill_count);
683 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
684 }
685 assert(!m->laundry);
686 assert(m->object != kernel_object);
687 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
688
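	/*
	 * with no dynamic pager configured, anonymous pages in an object that isn't
	 * purgeable-empty have nowhere to be cleaned to, so park freshly zero-filled
	 * pages on the throttled queue (presumably to keep them off the pageable
	 * queues that the pageout daemon scans).
	 */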
689 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
690 (m->object->purgable == VM_PURGABLE_DENY ||
691 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
692 m->object->purgable == VM_PURGABLE_VOLATILE )) {
693
694 vm_page_lockspin_queues();
695
696 assert(!VM_PAGE_WIRED(m));
697
698 /*
699 * can't be on the pageout queue since we don't
700 * have a pager to try and clean to
701 */
702 assert(!m->pageout_queue);
703
704 VM_PAGE_QUEUES_REMOVE(m);
705
706 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
707 m->throttled = TRUE;
708 vm_page_throttled_count++;
709
710 vm_page_unlock_queues();
711 }
712 return (my_fault);
713 }
714
715
716 /*
717 * Routine: vm_fault_page
718 * Purpose:
719 * Find the resident page for the virtual memory
720 * specified by the given virtual memory object
721 * and offset.
722 * Additional arguments:
723 * The required permissions for the page is given
724 * in "fault_type". Desired permissions are included
725 * in "protection".
726 * fault_info is passed along to determine pagein cluster
727 * limits... it contains the expected reference pattern,
728 * cluster size if available, etc...
729 *
730 * If the desired page is known to be resident (for
731 * example, because it was previously wired down), asserting
732 * the "unwiring" parameter will speed the search.
733 *
734 * If the operation can be interrupted (by thread_abort
735 * or thread_terminate), then the "interruptible"
736 * parameter should be asserted.
737 *
738 * Results:
739 * The page containing the proper data is returned
740 * in "result_page".
741 *
742 * In/out conditions:
743 * The source object must be locked and referenced,
744 * and must donate one paging reference. The reference
745 * is not affected. The paging reference and lock are
746 * consumed.
747 *
748 * If the call succeeds, the object in which "result_page"
749 * resides is left locked and holding a paging reference.
750 * If this is not the original object, a busy page in the
751 * original object is returned in "top_page", to prevent other
752 * callers from pursuing this same data, along with a paging
753 * reference for the original object. The "top_page" should
754 * be destroyed when this guarantee is no longer required.
755 * The "result_page" is also left busy. It is not removed
756 * from the pageout queues.
757 * Special Case:
758 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
759 * fault succeeded but there's no VM page (i.e. the VM object
760 * does not actually hold VM pages, but device memory or
761 * large pages). The object is still locked and we still hold a
762 * paging_in_progress reference.
763 */
764 unsigned int vm_fault_page_blocked_access = 0;
765 unsigned int vm_fault_page_forced_retry = 0;
766
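/*
 * Rough shape of the loop below: starting at first_object/first_offset we walk the
 * shadow chain, keeping a busy placeholder page in the top object.  At each level we
 * either find a resident page (waiting out busy/cleaning/encrypted states as needed),
 * ask that object's pager for the data, or, at the bottom of the chain, zero-fill a
 * page back in the top object.  Copy-on-write copies and pushes into the copy object
 * are handled after the "PAGE HAS BEEN FOUND" point.
 */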
767 vm_fault_return_t
768 vm_fault_page(
769 /* Arguments: */
770 vm_object_t first_object, /* Object to begin search */
771 vm_object_offset_t first_offset, /* Offset into object */
772 vm_prot_t fault_type, /* What access is requested */
773 boolean_t must_be_resident,/* Must page be resident? */
774 /* Modifies in place: */
775 vm_prot_t *protection, /* Protection for mapping */
776 /* Returns: */
777 vm_page_t *result_page, /* Page found, if successful */
778 vm_page_t *top_page, /* Page in top object, if
779 * not result_page. */
780 int *type_of_fault, /* if non-null, fill in with type of fault
781 * COW, zero-fill, etc... returned in trace point */
782 /* More arguments: */
783 kern_return_t *error_code, /* code if page is in error */
784 boolean_t no_zero_fill, /* don't zero fill absent pages */
785 #if MACH_PAGEMAP
786 boolean_t data_supply, /* treat as data_supply if
787 * it is a write fault and a full
788 * page is provided */
789 #else
790 __unused boolean_t data_supply,
791 #endif
792 vm_object_fault_info_t fault_info)
793 {
794 vm_page_t m;
795 vm_object_t object;
796 vm_object_offset_t offset;
797 vm_page_t first_m;
798 vm_object_t next_object;
799 vm_object_t copy_object;
800 boolean_t look_for_page;
801 boolean_t force_fault_retry = FALSE;
802 vm_prot_t access_required = fault_type;
803 vm_prot_t wants_copy_flag;
804 CLUSTER_STAT(int pages_at_higher_offsets;)
805 CLUSTER_STAT(int pages_at_lower_offsets;)
806 kern_return_t wait_result;
807 boolean_t interruptible_state;
808 boolean_t data_already_requested = FALSE;
809 vm_behavior_t orig_behavior;
810 vm_size_t orig_cluster_size;
811 vm_fault_return_t error;
812 int my_fault;
813 uint32_t try_failed_count;
814 int interruptible; /* how may the fault be interrupted? */
815 memory_object_t pager;
816 vm_fault_return_t retval;
817
818 /*
819 * MACH page map - an optional optimization where a bit map is maintained
820 * by the VM subsystem for internal objects to indicate which pages of
821 * the object currently reside on backing store. This existence map
822 * duplicates information maintained by the vnode pager. It is
823 * created at the time of the first pageout against the object, i.e.
824 * at the same time the pager for the object is created. The optimization
825 * is designed to eliminate pager interaction overhead, if it is
826 * 'known' that the page does not exist on backing store.
827 *
828 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
829 * either marked as paged out in the existence map for the object or no
830 * existence map exists for the object. MUST_ASK_PAGER() is one of the
831 * criteria in the decision to invoke the pager. It is also used as one
832 * of the criteria to terminate the scan for adjacent pages in a clustered
833 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
834 * permanent objects. Note also that if the pager for an internal object
835 * has not been created, the pager is not invoked regardless of the value
836 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
837 * for which a pager has been created.
838 *
839 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
840 * is marked as paged out in the existence map for the object.
841 * PAGED_OUT() is used to determine if a page has already been pushed
842 * into a copy object in order to avoid a redundant page out operation.
843 */
844 #if MACH_PAGEMAP
845 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
846 != VM_EXTERNAL_STATE_ABSENT)
847 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
848 == VM_EXTERNAL_STATE_EXISTS)
849 #else
850 #define MUST_ASK_PAGER(o, f) (TRUE)
851 #define PAGED_OUT(o, f) (FALSE)
852 #endif
853
854 /*
855 * Recovery actions
856 */
857 #define RELEASE_PAGE(m) \
858 MACRO_BEGIN \
859 PAGE_WAKEUP_DONE(m); \
860 if (!m->active && !m->inactive && !m->throttled) { \
861 vm_page_lockspin_queues(); \
862 if (!m->active && !m->inactive && !m->throttled) \
863 vm_page_activate(m); \
864 vm_page_unlock_queues(); \
865 } \
866 MACRO_END
867
868 #if TRACEFAULTPAGE
869 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
870 #endif
871
872 interruptible = fault_info->interruptible;
873 interruptible_state = thread_interrupt_level(interruptible);
874
875 /*
876 * INVARIANTS (through entire routine):
877 *
878 * 1) At all times, we must either have the object
879 * lock or a busy page in some object to prevent
880 * some other thread from trying to bring in
881 * the same page.
882 *
883 * Note that we cannot hold any locks during the
884 * pager access or when waiting for memory, so
885 * we use a busy page then.
886 *
887 * 2) To prevent another thread from racing us down the
888 * shadow chain and entering a new page in the top
889 * object before we do, we must keep a busy page in
890 * the top object while following the shadow chain.
891 *
892 * 3) We must increment paging_in_progress on any object
893 * for which we have a busy page before dropping
894 * the object lock
895 *
896 * 4) We leave busy pages on the pageout queues.
897 * If the pageout daemon comes across a busy page,
898 * it will remove the page from the pageout queues.
899 */
900
901 object = first_object;
902 offset = first_offset;
903 first_m = VM_PAGE_NULL;
904 access_required = fault_type;
905
906
907 XPR(XPR_VM_FAULT,
908 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
909 object, offset, fault_type, *protection, 0);
910
911 /*
912 * default type of fault
913 */
914 my_fault = DBG_CACHE_HIT_FAULT;
915
916 while (TRUE) {
917 #if TRACEFAULTPAGE
918 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
919 #endif
920 if (!object->alive) {
921 /*
922 * object is no longer valid
923 * clean up and return error
924 */
925 vm_fault_cleanup(object, first_m);
926 thread_interrupt_level(interruptible_state);
927
928 return (VM_FAULT_MEMORY_ERROR);
929 }
930
931 if (!object->pager_created && object->phys_contiguous) {
932 /*
933 * A physically-contiguous object without a pager:
934 * must be a "large page" object. We do not deal
935 * with VM pages for this object.
936 */
937 m = VM_PAGE_NULL;
938 goto phys_contig_object;
939 }
940
941 if (object->blocked_access) {
942 /*
943 * Access to this VM object has been blocked.
944 * Replace our "paging_in_progress" reference with
945 * a "activity_in_progress" reference and wait for
946 * access to be unblocked.
947 */
948 vm_object_activity_begin(object);
949 vm_object_paging_end(object);
950 while (object->blocked_access) {
951 vm_object_sleep(object,
952 VM_OBJECT_EVENT_UNBLOCKED,
953 THREAD_UNINT);
954 }
955 vm_fault_page_blocked_access++;
956 vm_object_paging_begin(object);
957 vm_object_activity_end(object);
958 }
959
960 /*
961 * See whether the page at 'offset' is resident
962 */
963 m = vm_page_lookup(object, offset);
964 #if TRACEFAULTPAGE
965 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
966 #endif
967 if (m != VM_PAGE_NULL) {
968
969 if (m->busy) {
970 /*
971 * The page is being brought in,
972 * wait for it and then retry.
973 */
974 #if TRACEFAULTPAGE
975 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
976 #endif
977 wait_result = PAGE_SLEEP(object, m, interruptible);
978
979 XPR(XPR_VM_FAULT,
980 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
981 object, offset,
982 m, 0, 0);
983 counter(c_vm_fault_page_block_busy_kernel++);
984
985 if (wait_result != THREAD_AWAKENED) {
986 vm_fault_cleanup(object, first_m);
987 thread_interrupt_level(interruptible_state);
988
989 if (wait_result == THREAD_RESTART)
990 return (VM_FAULT_RETRY);
991 else
992 return (VM_FAULT_INTERRUPTED);
993 }
994 continue;
995 }
996 if (m->laundry) {
997 m->pageout = FALSE;
998
999 if (!m->cleaning)
1000 vm_pageout_steal_laundry(m, FALSE);
1001 }
1002 if (m->phys_page == vm_page_guard_addr) {
1003 /*
1004 * Guard page: off limits !
1005 */
1006 if (fault_type == VM_PROT_NONE) {
1007 /*
1008 * The fault is not requesting any
1009 * access to the guard page, so it must
1010 * be just to wire or unwire it.
1011 * Let's pretend it succeeded...
1012 */
1013 m->busy = TRUE;
1014 *result_page = m;
1015 assert(first_m == VM_PAGE_NULL);
1016 *top_page = first_m;
1017 if (type_of_fault)
1018 *type_of_fault = DBG_GUARD_FAULT;
1019 thread_interrupt_level(interruptible_state);
1020 return VM_FAULT_SUCCESS;
1021 } else {
1022 /*
1023 * The fault requests access to the
1024 * guard page: let's deny that !
1025 */
1026 vm_fault_cleanup(object, first_m);
1027 thread_interrupt_level(interruptible_state);
1028 return VM_FAULT_MEMORY_ERROR;
1029 }
1030 }
1031
1032 if (m->error) {
1033 /*
1034 * The page is in error, give up now.
1035 */
1036 #if TRACEFAULTPAGE
1037 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1038 #endif
1039 if (error_code)
1040 *error_code = KERN_MEMORY_ERROR;
1041 VM_PAGE_FREE(m);
1042
1043 vm_fault_cleanup(object, first_m);
1044 thread_interrupt_level(interruptible_state);
1045
1046 return (VM_FAULT_MEMORY_ERROR);
1047 }
1048 if (m->restart) {
1049 /*
1050 * The pager wants us to restart
1051 * at the top of the chain,
1052 * typically because it has moved the
1053 * page to another pager, then do so.
1054 */
1055 #if TRACEFAULTPAGE
1056 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1057 #endif
1058 VM_PAGE_FREE(m);
1059
1060 vm_fault_cleanup(object, first_m);
1061 thread_interrupt_level(interruptible_state);
1062
1063 return (VM_FAULT_RETRY);
1064 }
1065 if (m->absent) {
1066 /*
1067 * The page isn't busy, but is absent,
1068 * therefore it's deemed "unavailable".
1069 *
1070 * Remove the non-existent page (unless it's
1071 * in the top object) and move on down to the
1072 * next object (if there is one).
1073 */
1074 #if TRACEFAULTPAGE
1075 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1076 #endif
1077 next_object = object->shadow;
1078
1079 if (next_object == VM_OBJECT_NULL) {
1080 /*
1081 * Absent page at bottom of shadow
1082 * chain; zero fill the page we left
1083 * busy in the first object, and free
1084 * the absent page.
1085 */
1086 assert(!must_be_resident);
1087
1088 /*
1089 * check for any conditions that prevent
1090 * us from creating a new zero-fill page
1091 * vm_fault_check will do all of the
1092 * fault cleanup in the case of an error condition
1093 * including resetting the thread_interrupt_level
1094 */
1095 error = vm_fault_check(object, m, first_m, interruptible_state);
1096
1097 if (error != VM_FAULT_SUCCESS)
1098 return (error);
1099
1100 XPR(XPR_VM_FAULT,
1101 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1102 object, offset,
1103 m,
1104 first_object, 0);
1105
1106 if (object != first_object) {
1107 /*
1108 * free the absent page we just found
1109 */
1110 VM_PAGE_FREE(m);
1111
1112 /*
1113 * drop reference and lock on current object
1114 */
1115 vm_object_paging_end(object);
1116 vm_object_unlock(object);
1117
1118 /*
1119 * grab the original page we
1120 * 'soldered' in place and
1121 * retake lock on 'first_object'
1122 */
1123 m = first_m;
1124 first_m = VM_PAGE_NULL;
1125
1126 object = first_object;
1127 offset = first_offset;
1128
1129 vm_object_lock(object);
1130 } else {
1131 /*
1132 * we're going to use the absent page we just found
1133 * so convert it to a 'busy' page
1134 */
1135 m->absent = FALSE;
1136 m->busy = TRUE;
1137 }
1138 /*
1139 * zero-fill the page and put it on
1140 * the correct paging queue
1141 */
1142 my_fault = vm_fault_zero_page(m, no_zero_fill);
1143
1144 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1145 m->absent = TRUE;
1146
1147 break;
1148 } else {
1149 if (must_be_resident)
1150 vm_object_paging_end(object);
1151 else if (object != first_object) {
1152 vm_object_paging_end(object);
1153 VM_PAGE_FREE(m);
1154 } else {
1155 first_m = m;
1156 m->absent = FALSE;
1157 m->busy = TRUE;
1158
1159 vm_page_lockspin_queues();
1160
1161 assert(!m->pageout_queue);
1162 VM_PAGE_QUEUES_REMOVE(m);
1163
1164 vm_page_unlock_queues();
1165 }
1166 XPR(XPR_VM_FAULT,
1167 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1168 object, offset,
1169 next_object,
1170 offset+object->vo_shadow_offset,0);
1171
1172 offset += object->vo_shadow_offset;
1173 fault_info->lo_offset += object->vo_shadow_offset;
1174 fault_info->hi_offset += object->vo_shadow_offset;
1175 access_required = VM_PROT_READ;
1176
1177 vm_object_lock(next_object);
1178 vm_object_unlock(object);
1179 object = next_object;
1180 vm_object_paging_begin(object);
1181
1182 /*
1183 * reset to default type of fault
1184 */
1185 my_fault = DBG_CACHE_HIT_FAULT;
1186
1187 continue;
1188 }
1189 }
1190 if ((m->cleaning)
1191 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1192 && (fault_type & VM_PROT_WRITE)) {
1193 /*
1194 * This is a copy-on-write fault that will
1195 * cause us to revoke access to this page, but
1196 * this page is in the process of being cleaned
1197 * in a clustered pageout. We must wait until
1198 * the cleaning operation completes before
1199 * revoking access to the original page,
1200 * otherwise we might attempt to remove a
1201 * wired mapping.
1202 */
1203 #if TRACEFAULTPAGE
1204 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1205 #endif
1206 XPR(XPR_VM_FAULT,
1207 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1208 object, offset,
1209 m, 0, 0);
1210 /*
1211 * take an extra ref so that object won't die
1212 */
1213 vm_object_reference_locked(object);
1214
1215 vm_fault_cleanup(object, first_m);
1216
1217 counter(c_vm_fault_page_block_backoff_kernel++);
1218 vm_object_lock(object);
1219 assert(object->ref_count > 0);
1220
1221 m = vm_page_lookup(object, offset);
1222
1223 if (m != VM_PAGE_NULL && m->cleaning) {
1224 PAGE_ASSERT_WAIT(m, interruptible);
1225
1226 vm_object_unlock(object);
1227 wait_result = thread_block(THREAD_CONTINUE_NULL);
1228 vm_object_deallocate(object);
1229
1230 goto backoff;
1231 } else {
1232 vm_object_unlock(object);
1233
1234 vm_object_deallocate(object);
1235 thread_interrupt_level(interruptible_state);
1236
1237 return (VM_FAULT_RETRY);
1238 }
1239 }
1240 if (type_of_fault == NULL && m->speculative &&
1241 !(fault_info != NULL && fault_info->stealth)) {
1242 /*
1243 * If we were passed a non-NULL pointer for
1244 * "type_of_fault", than we came from
1245 * vm_fault... we'll let it deal with
1246 * this condition, since it
1247 * needs to see m->speculative to correctly
1248 * account the pageins, otherwise...
1249 * take it off the speculative queue, we'll
1250 * let the caller of vm_fault_page deal
1251 * with getting it onto the correct queue
1252 *
1253 * If the caller specified in fault_info that
1254 * it wants a "stealth" fault, we also leave
1255 * the page in the speculative queue.
1256 */
1257 vm_page_lockspin_queues();
1258 if (m->speculative)
1259 VM_PAGE_QUEUES_REMOVE(m);
1260 vm_page_unlock_queues();
1261 }
1262
1263 if (m->encrypted) {
1264 /*
1265 * ENCRYPTED SWAP:
1266 * the user needs access to a page that we
1267 * encrypted before paging it out.
1268 * Decrypt the page now.
1269 * Keep it busy to prevent anyone from
1270 * accessing it during the decryption.
1271 */
1272 m->busy = TRUE;
1273 vm_page_decrypt(m, 0);
1274 assert(object == m->object);
1275 assert(m->busy);
1276 PAGE_WAKEUP_DONE(m);
1277
1278 /*
1279 * Retry from the top, in case
1280 * something changed while we were
1281 * decrypting.
1282 */
1283 continue;
1284 }
1285 ASSERT_PAGE_DECRYPTED(m);
1286
1287 if (m->object->code_signed) {
1288 /*
1289 * CODE SIGNING:
1290 * We just paged in a page from a signed
1291 * memory object but we don't need to
1292 * validate it now. We'll validate it if and
1293 * when it gets mapped into a user address
1294 * space for the first time or when the page
1295 * gets copied to another object as a result
1296 * of a copy-on-write.
1297 */
1298 }
1299
1300 /*
1301 * We mark the page busy and leave it on
1302 * the pageout queues. If the pageout
1303 * daemon comes across it, then it will
1304 * remove the page from the queue, but not the object
1305 */
1306 #if TRACEFAULTPAGE
1307 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1308 #endif
1309 XPR(XPR_VM_FAULT,
1310 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1311 object, offset, m, 0, 0);
1312 assert(!m->busy);
1313 assert(!m->absent);
1314
1315 m->busy = TRUE;
1316 break;
1317 }
1318
1319
1320 /*
1321 * we get here when there is no page present in the object at
1322 * the offset we're interested in... we'll allocate a page
1323 * at this point if the pager associated with
1324 * this object can provide the data or we're the top object...
1325 * object is locked; m == NULL
1326 */
1327 if (must_be_resident)
1328 goto dont_look_for_page;
1329
1330 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1331
1332 #if TRACEFAULTPAGE
1333 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1334 #endif
1335 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1336 /*
1337 * Allocate a new page for this object/offset pair as a placeholder
1338 */
1339 m = vm_page_grab();
1340 #if TRACEFAULTPAGE
1341 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1342 #endif
1343 if (m == VM_PAGE_NULL) {
1344
1345 vm_fault_cleanup(object, first_m);
1346 thread_interrupt_level(interruptible_state);
1347
1348 return (VM_FAULT_MEMORY_SHORTAGE);
1349 }
1350
1351 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1352 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1353 } else {
1354 vm_page_insert(m, object, offset);
1355 }
1356 }
1357 if (look_for_page) {
1358 kern_return_t rc;
1359
1360 /*
1361 * If the memory manager is not ready, we
1362 * cannot make requests.
1363 */
1364 if (!object->pager_ready) {
1365 #if TRACEFAULTPAGE
1366 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1367 #endif
1368 if (m != VM_PAGE_NULL)
1369 VM_PAGE_FREE(m);
1370
1371 XPR(XPR_VM_FAULT,
1372 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1373 object, offset, 0, 0, 0);
1374
1375 /*
1376 * take an extra ref so object won't die
1377 */
1378 vm_object_reference_locked(object);
1379 vm_fault_cleanup(object, first_m);
1380 counter(c_vm_fault_page_block_backoff_kernel++);
1381
1382 vm_object_lock(object);
1383 assert(object->ref_count > 0);
1384
1385 if (!object->pager_ready) {
1386 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1387
1388 vm_object_unlock(object);
1389 if (wait_result == THREAD_WAITING)
1390 wait_result = thread_block(THREAD_CONTINUE_NULL);
1391 vm_object_deallocate(object);
1392
1393 goto backoff;
1394 } else {
1395 vm_object_unlock(object);
1396 vm_object_deallocate(object);
1397 thread_interrupt_level(interruptible_state);
1398
1399 return (VM_FAULT_RETRY);
1400 }
1401 }
1402 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1403 /*
1404 * If there are too many outstanding page
1405 * requests pending on this external object, we
1406 * wait for them to be resolved now.
1407 */
1408 #if TRACEFAULTPAGE
1409 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1410 #endif
1411 if (m != VM_PAGE_NULL)
1412 VM_PAGE_FREE(m);
1413 /*
1414 * take an extra ref so object won't die
1415 */
1416 vm_object_reference_locked(object);
1417
1418 vm_fault_cleanup(object, first_m);
1419
1420 counter(c_vm_fault_page_block_backoff_kernel++);
1421
1422 vm_object_lock(object);
1423 assert(object->ref_count > 0);
1424
1425 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1426 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1427
1428 vm_object_unlock(object);
1429 wait_result = thread_block(THREAD_CONTINUE_NULL);
1430 vm_object_deallocate(object);
1431
1432 goto backoff;
1433 } else {
1434 vm_object_unlock(object);
1435 vm_object_deallocate(object);
1436 thread_interrupt_level(interruptible_state);
1437
1438 return (VM_FAULT_RETRY);
1439 }
1440 }
1441 if (m != VM_PAGE_NULL) {
1442 VM_PAGE_FREE(m);
1443 m = VM_PAGE_NULL;
1444 }
1445
1446 #if TRACEFAULTPAGE
1447 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1448 #endif
1449
1450 /*
1451 * It's possible someone called vm_object_destroy while we weren't
1452 * holding the object lock. If that has happened, then bail out
1453 * here.
1454 */
1455
1456 pager = object->pager;
1457
1458 if (pager == MEMORY_OBJECT_NULL) {
1459 vm_fault_cleanup(object, first_m);
1460 thread_interrupt_level(interruptible_state);
1461 return VM_FAULT_MEMORY_ERROR;
1462 }
1463
1464 /*
1465 * We have an absent page in place for the faulting offset,
1466 * so we can release the object lock.
1467 */
1468
1469 vm_object_unlock(object);
1470
1471 /*
1472 * If this object uses a copy_call strategy,
1473 * and we are interested in a copy of this object
1474 * (having gotten here only by following a
1475 * shadow chain), then tell the memory manager
1476 * via a flag added to the desired_access
1477 * parameter, so that it can detect a race
1478 * between our walking down the shadow chain
1479 * and its pushing pages up into a copy of
1480 * the object that it manages.
1481 */
1482 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1483 wants_copy_flag = VM_PROT_WANTS_COPY;
1484 else
1485 wants_copy_flag = VM_PROT_NONE;
1486
1487 XPR(XPR_VM_FAULT,
1488 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1489 object, offset, m,
1490 access_required | wants_copy_flag, 0);
1491
1492 if (object->copy == first_object) {
1493 /*
1494 * if we issue the memory_object_data_request in
1495 * this state, we are subject to a deadlock with
1496 * the underlying filesystem if it is trying to
1497 * shrink the file resulting in a push of pages
1498 * into the copy object... that push will stall
1499 * on the placeholder page, and if the pushing thread
1500 * is holding a lock that is required on the pagein
1501 * path (such as a truncate lock), we'll deadlock...
1502 * to avoid this potential deadlock, we throw away
1503 * our placeholder page before calling memory_object_data_request
1504 * and force this thread to retry the vm_fault_page after
1505 * we have issued the I/O. the second time through this path
1506 * we will find the page already in the cache (presumably still
1507 * busy waiting for the I/O to complete) and then complete
1508 * the fault w/o having to go through memory_object_data_request again
1509 */
1510 assert(first_m != VM_PAGE_NULL);
1511 assert(first_m->object == first_object);
1512
1513 vm_object_lock(first_object);
1514 VM_PAGE_FREE(first_m);
1515 vm_object_paging_end(first_object);
1516 vm_object_unlock(first_object);
1517
1518 first_m = VM_PAGE_NULL;
1519 force_fault_retry = TRUE;
1520
1521 vm_fault_page_forced_retry++;
1522 }
1523
1524 if (data_already_requested == TRUE) {
1525 orig_behavior = fault_info->behavior;
1526 orig_cluster_size = fault_info->cluster_size;
1527
1528 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1529 fault_info->cluster_size = PAGE_SIZE;
1530 }
1531 /*
1532 * Call the memory manager to retrieve the data.
1533 */
1534 rc = memory_object_data_request(
1535 pager,
1536 offset + object->paging_offset,
1537 PAGE_SIZE,
1538 access_required | wants_copy_flag,
1539 (memory_object_fault_info_t)fault_info);
1540
1541 if (data_already_requested == TRUE) {
1542 fault_info->behavior = orig_behavior;
1543 fault_info->cluster_size = orig_cluster_size;
1544 } else
1545 data_already_requested = TRUE;
1546
1547 #if TRACEFAULTPAGE
1548 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1549 #endif
1550 vm_object_lock(object);
1551
1552 if (rc != KERN_SUCCESS) {
1553
1554 vm_fault_cleanup(object, first_m);
1555 thread_interrupt_level(interruptible_state);
1556
1557 return ((rc == MACH_SEND_INTERRUPTED) ?
1558 VM_FAULT_INTERRUPTED :
1559 VM_FAULT_MEMORY_ERROR);
1560 } else {
1561 clock_sec_t tv_sec;
1562 clock_usec_t tv_usec;
1563
1564 clock_get_system_microtime(&tv_sec, &tv_usec);
1565 current_thread()->t_page_creation_time = tv_sec;
1566 current_thread()->t_page_creation_count = 0;
1567 }
1568 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1569
1570 vm_fault_cleanup(object, first_m);
1571 thread_interrupt_level(interruptible_state);
1572
1573 return (VM_FAULT_INTERRUPTED);
1574 }
1575 if (force_fault_retry == TRUE) {
1576
1577 vm_fault_cleanup(object, first_m);
1578 thread_interrupt_level(interruptible_state);
1579
1580 return (VM_FAULT_RETRY);
1581 }
1582 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1583 /*
1584 * No page here means that the object we
1585 * initially looked up was "physically
1586 * contiguous" (i.e. device memory). However,
1587 * with Virtual VRAM, the object might not
1588 * be backed by that device memory anymore,
1589 * so we're done here only if the object is
1590 * still "phys_contiguous".
1591 * Otherwise, if the object is no longer
1592 * "phys_contiguous", we need to retry the
1593 * page fault against the object's new backing
1594 * store (different memory object).
1595 */
1596 phys_contig_object:
1597 goto done;
1598 }
1599 /*
1600 * potentially a pagein fault
1601 * if we make it through the state checks
1602 * above, then we'll count it as such
1603 */
1604 my_fault = DBG_PAGEIN_FAULT;
1605
1606 /*
1607 * Retry with same object/offset, since new data may
1608 * be in a different page (i.e., m is meaningless at
1609 * this point).
1610 */
1611 continue;
1612 }
1613 dont_look_for_page:
1614 /*
1615 * We get here if the object has no pager, or an existence map
1616 * exists and indicates the page isn't present on the pager
1617 * or we're unwiring a page. If a pager exists, but there
1618 * is no existence map, then the m->absent case above handles
1619 * the ZF case when the pager can't provide the page
1620 */
1621 #if TRACEFAULTPAGE
1622 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1623 #endif
1624 if (object == first_object)
1625 first_m = m;
1626 else
1627 assert(m == VM_PAGE_NULL);
1628
1629 XPR(XPR_VM_FAULT,
1630 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1631 object, offset, m,
1632 object->shadow, 0);
1633
1634 next_object = object->shadow;
1635
1636 if (next_object == VM_OBJECT_NULL) {
1637 /*
1638 * we've hit the bottom of the shadow chain,
1639 * fill the page in the top object with zeros.
1640 */
1641 assert(!must_be_resident);
1642
1643 if (object != first_object) {
1644 vm_object_paging_end(object);
1645 vm_object_unlock(object);
1646
1647 object = first_object;
1648 offset = first_offset;
1649 vm_object_lock(object);
1650 }
1651 m = first_m;
1652 assert(m->object == object);
1653 first_m = VM_PAGE_NULL;
1654
1655 /*
1656 * check for any conditions that prevent
1657 * us from creating a new zero-fill page
1658 * vm_fault_check will do all of the
1659 * fault cleanup in the case of an error condition
1660 * including resetting the thread_interrupt_level
1661 */
1662 error = vm_fault_check(object, m, first_m, interruptible_state);
1663
1664 if (error != VM_FAULT_SUCCESS)
1665 return (error);
1666
1667 if (m == VM_PAGE_NULL) {
1668 m = vm_page_grab();
1669
1670 if (m == VM_PAGE_NULL) {
1671 vm_fault_cleanup(object, VM_PAGE_NULL);
1672 thread_interrupt_level(interruptible_state);
1673
1674 return (VM_FAULT_MEMORY_SHORTAGE);
1675 }
1676 vm_page_insert(m, object, offset);
1677 }
1678 my_fault = vm_fault_zero_page(m, no_zero_fill);
1679
1680 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1681 m->absent = TRUE;
1682 break;
1683
1684 } else {
1685 /*
1686 * Move on to the next object. Lock the next
1687 * object before unlocking the current one.
1688 */
1689 if ((object != first_object) || must_be_resident)
1690 vm_object_paging_end(object);
1691
1692 offset += object->vo_shadow_offset;
1693 fault_info->lo_offset += object->vo_shadow_offset;
1694 fault_info->hi_offset += object->vo_shadow_offset;
1695 access_required = VM_PROT_READ;
1696
1697 vm_object_lock(next_object);
1698 vm_object_unlock(object);
1699
1700 object = next_object;
1701 vm_object_paging_begin(object);
1702 }
1703 }
1704
1705 /*
1706 * PAGE HAS BEEN FOUND.
1707 *
1708 * This page (m) is:
1709 * busy, so that we can play with it;
1710 * not absent, so that nobody else will fill it;
1711 * possibly eligible for pageout;
1712 *
1713 * The top-level page (first_m) is:
1714 * VM_PAGE_NULL if the page was found in the
1715 * top-level object;
1716 * busy, not absent, and ineligible for pageout.
1717 *
1718 * The current object (object) is locked. A paging
1719 * reference is held for the current and top-level
1720 * objects.
1721 */
1722
1723 #if TRACEFAULTPAGE
1724 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1725 #endif
1726 #if EXTRA_ASSERTIONS
1727 assert(m->busy && !m->absent);
1728 assert((first_m == VM_PAGE_NULL) ||
1729 (first_m->busy && !first_m->absent &&
1730 !first_m->active && !first_m->inactive));
1731 #endif /* EXTRA_ASSERTIONS */
1732
1733 /*
1734 * ENCRYPTED SWAP:
1735 * If we found a page, we must have decrypted it before we
1736 * get here...
1737 */
1738 ASSERT_PAGE_DECRYPTED(m);
1739
1740 XPR(XPR_VM_FAULT,
1741 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1742 object, offset, m,
1743 first_object, first_m);
1744
1745 /*
1746 * If the page is being written, but isn't
1747 * already owned by the top-level object,
1748 * we have to copy it into a new page owned
1749 * by the top-level object.
1750 */
1751 if (object != first_object) {
1752
1753 #if TRACEFAULTPAGE
1754 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1755 #endif
1756 if (fault_type & VM_PROT_WRITE) {
1757 vm_page_t copy_m;
1758
1759 /*
1760 * We only really need to copy if we
1761 * want to write it.
1762 */
1763 assert(!must_be_resident);
1764
1765 /*
1766 * are we protecting the system from
1767 * backing store exhaustion? If so,
1768 * sleep unless we are privileged.
1769 */
1770 if (vm_backing_store_low) {
1771 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1772
1773 RELEASE_PAGE(m);
1774 vm_fault_cleanup(object, first_m);
1775
1776 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1777
1778 thread_block(THREAD_CONTINUE_NULL);
1779 thread_interrupt_level(interruptible_state);
1780
1781 return (VM_FAULT_RETRY);
1782 }
1783 }
1784 /*
1785 * If we try to collapse first_object at this
1786 * point, we may deadlock when we try to get
1787 * the lock on an intermediate object (since we
1788 * have the bottom object locked). We can't
1789 * unlock the bottom object, because the page
1790 * we found may move (by collapse) if we do.
1791 *
1792 * Instead, we first copy the page. Then, when
1793 * we have no more use for the bottom object,
1794 * we unlock it and try to collapse.
1795 *
1796 * Note that we copy the page even if we didn't
1797 * need to... that's the breaks.
1798 */
1799
1800 /*
1801 * Allocate a page for the copy
1802 */
1803 copy_m = vm_page_grab();
1804
1805 if (copy_m == VM_PAGE_NULL) {
1806 RELEASE_PAGE(m);
1807
1808 vm_fault_cleanup(object, first_m);
1809 thread_interrupt_level(interruptible_state);
1810
1811 return (VM_FAULT_MEMORY_SHORTAGE);
1812 }
1813 XPR(XPR_VM_FAULT,
1814 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1815 object, offset,
1816 m, copy_m, 0);
1817
1818 vm_page_copy(m, copy_m);
1819
1820 /*
1821 * If another map is truly sharing this
1822 * page with us, we have to flush all
1823 * uses of the original page, since we
1824 * can't distinguish those which want the
1825 * original from those which need the
1826 * new copy.
1827 *
1828 * XXXO If we know that only one map has
1829 * access to this page, then we could
1830 * avoid the pmap_disconnect() call.
1831 */
1832 if (m->pmapped)
1833 pmap_disconnect(m->phys_page);
1834
1835 assert(!m->cleaning);
1836
1837 /*
1838 * We no longer need the old page or object.
1839 */
1840 PAGE_WAKEUP_DONE(m);
1841 vm_object_paging_end(object);
1842 vm_object_unlock(object);
1843
1844 my_fault = DBG_COW_FAULT;
1845 VM_STAT_INCR(cow_faults);
1846 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1847 current_task()->cow_faults++;
1848
1849 object = first_object;
1850 offset = first_offset;
1851
1852 vm_object_lock(object);
1853 /*
1854 * get rid of the placeholder
1855 * page that we soldered in earlier
1856 */
1857 VM_PAGE_FREE(first_m);
1858 first_m = VM_PAGE_NULL;
1859
1860 /*
1861 * and replace it with the
1862 * page we just copied into
1863 */
1864 assert(copy_m->busy);
1865 vm_page_insert(copy_m, object, offset);
1866 SET_PAGE_DIRTY(copy_m, TRUE);
1867
1868 m = copy_m;
1869 /*
1870 * Now that we've gotten the copy out of the
1871 * way, let's try to collapse the top object.
1872 * But we have to play ugly games with
1873 * paging_in_progress to do that...
1874 */
1875 vm_object_paging_end(object);
1876 vm_object_collapse(object, offset, TRUE);
1877 vm_object_paging_begin(object);
1878
1879 } else
1880 *protection &= (~VM_PROT_WRITE);
1881 }
1882 /*
1883 * Now check whether the page needs to be pushed into the
1884 * copy object. The use of asymmetric copy on write for
1885 * shared temporary objects means that we may do two copies to
1886 * satisfy the fault; one above to get the page from a
1887 * shadowed object, and one here to push it into the copy.
1888 */
1889 try_failed_count = 0;
1890
1891 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1892 vm_object_offset_t copy_offset;
1893 vm_page_t copy_m;
1894
1895 #if TRACEFAULTPAGE
1896 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1897 #endif
1898 /*
1899 * If the page is being written, but hasn't been
1900 * copied to the copy-object, we have to copy it there.
1901 */
1902 if ((fault_type & VM_PROT_WRITE) == 0) {
1903 *protection &= ~VM_PROT_WRITE;
1904 break;
1905 }
1906
1907 /*
1908 * If the page was guaranteed to be resident,
1909 * we must have already performed the copy.
1910 */
1911 if (must_be_resident)
1912 break;
1913
1914 /*
1915 * Try to get the lock on the copy_object.
1916 */
1917 if (!vm_object_lock_try(copy_object)) {
1918
1919 vm_object_unlock(object);
1920 try_failed_count++;
1921
1922 mutex_pause(try_failed_count); /* wait a bit */
1923 vm_object_lock(object);
1924
1925 continue;
1926 }
1927 try_failed_count = 0;
1928
1929 /*
1930 * Make another reference to the copy-object,
1931 * to keep it from disappearing during the
1932 * copy.
1933 */
1934 vm_object_reference_locked(copy_object);
1935
1936 /*
1937 * Does the page exist in the copy?
1938 */
1939 copy_offset = first_offset - copy_object->vo_shadow_offset;
1940
1941 if (copy_object->vo_size <= copy_offset)
1942 /*
1943 * Copy object doesn't cover this page -- do nothing.
1944 */
1945 ;
1946 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1947 /*
1948 * Page currently exists in the copy object
1949 */
1950 if (copy_m->busy) {
1951 /*
1952 * If the page is being brought
1953 * in, wait for it and then retry.
1954 */
1955 RELEASE_PAGE(m);
1956
1957 /*
1958 * take an extra ref so object won't die
1959 */
1960 vm_object_reference_locked(copy_object);
1961 vm_object_unlock(copy_object);
1962 vm_fault_cleanup(object, first_m);
1963 counter(c_vm_fault_page_block_backoff_kernel++);
1964
1965 vm_object_lock(copy_object);
1966 assert(copy_object->ref_count > 0);
1967 VM_OBJ_RES_DECR(copy_object);
1968 vm_object_lock_assert_exclusive(copy_object);
1969 copy_object->ref_count--;
1970 assert(copy_object->ref_count > 0);
1971 copy_m = vm_page_lookup(copy_object, copy_offset);
1972 /*
1973 * ENCRYPTED SWAP:
1974 * it's OK if the "copy_m" page is encrypted,
1975 * because we're not moving it nor handling its
1976 * contents.
1977 */
1978 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1979 PAGE_ASSERT_WAIT(copy_m, interruptible);
1980
1981 vm_object_unlock(copy_object);
1982 wait_result = thread_block(THREAD_CONTINUE_NULL);
1983 vm_object_deallocate(copy_object);
1984
1985 goto backoff;
1986 } else {
1987 vm_object_unlock(copy_object);
1988 vm_object_deallocate(copy_object);
1989 thread_interrupt_level(interruptible_state);
1990
1991 return (VM_FAULT_RETRY);
1992 }
1993 }
1994 }
1995 else if (!PAGED_OUT(copy_object, copy_offset)) {
1996 /*
1997 * If PAGED_OUT is TRUE, then the page used to exist
1998 * in the copy-object, and has already been paged out.
1999 * We don't need to repeat this. If PAGED_OUT is
2000 * FALSE, then either we don't know (!pager_created,
2001 * for example) or it hasn't been paged out.
2002 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2003 * We must copy the page to the copy object.
2004 */
2005
2006 if (vm_backing_store_low) {
2007 /*
2008 * we are protecting the system from
2009 * backing store exhaustion, so
2010 * sleep unless we are privileged.
2011 */
2012 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2013 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2014
2015 RELEASE_PAGE(m);
2016 VM_OBJ_RES_DECR(copy_object);
2017 vm_object_lock_assert_exclusive(copy_object);
2018 copy_object->ref_count--;
2019 assert(copy_object->ref_count > 0);
2020
2021 vm_object_unlock(copy_object);
2022 vm_fault_cleanup(object, first_m);
2023 thread_block(THREAD_CONTINUE_NULL);
2024 thread_interrupt_level(interruptible_state);
2025
2026 return (VM_FAULT_RETRY);
2027 }
2028 }
2029 /*
2030 * Allocate a page for the copy
2031 */
2032 copy_m = vm_page_alloc(copy_object, copy_offset);
2033
2034 if (copy_m == VM_PAGE_NULL) {
2035 RELEASE_PAGE(m);
2036
2037 VM_OBJ_RES_DECR(copy_object);
2038 vm_object_lock_assert_exclusive(copy_object);
2039 copy_object->ref_count--;
2040 assert(copy_object->ref_count > 0);
2041
2042 vm_object_unlock(copy_object);
2043 vm_fault_cleanup(object, first_m);
2044 thread_interrupt_level(interruptible_state);
2045
2046 return (VM_FAULT_MEMORY_SHORTAGE);
2047 }
2048 /*
2049 * Must copy page into copy-object.
2050 */
2051 vm_page_copy(m, copy_m);
2052
2053 /*
2054 * If the old page was in use by any users
2055 * of the copy-object, it must be removed
2056 * from all pmaps. (We can't know which
2057 * pmaps use it.)
2058 */
2059 if (m->pmapped)
2060 pmap_disconnect(m->phys_page);
2061
2062 /*
2063 * If there's no pager (or it's known not to have this page),
2064 * just keep the dirty copy.  For internal objects, query the pager
2065 * first.  Otherwise, immediately page this one out using the "initialize" option.
2066 */
2067 if ((!copy_object->pager_created)
2068 #if MACH_PAGEMAP
2069 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2070 #endif
2071 ) {
2072
2073 vm_page_lockspin_queues();
2074 assert(!m->cleaning);
2075 vm_page_activate(copy_m);
2076 vm_page_unlock_queues();
2077
2078 SET_PAGE_DIRTY(copy_m, TRUE);
2079 PAGE_WAKEUP_DONE(copy_m);
2080
2081 } else if (copy_object->internal) {
2082 /*
2083 * For internal objects check with the pager to see
2084 * if the page already exists in the backing store.
2085 * If yes, then we can drop the copy page. If not,
2086 * then we'll activate it, mark it dirty and keep it
2087 * around.
2088 */
2089
2090 kern_return_t kr = KERN_SUCCESS;
2091
2092 memory_object_t copy_pager = copy_object->pager;
2093 assert(copy_pager != MEMORY_OBJECT_NULL);
2094 vm_object_paging_begin(copy_object);
2095
2096 vm_object_unlock(copy_object);
2097
2098 kr = memory_object_data_request(
2099 copy_pager,
2100 copy_offset + copy_object->paging_offset,
2101 0, /* Only query the pager. */
2102 VM_PROT_READ,
2103 NULL);
2104
2105 vm_object_lock(copy_object);
2106
2107 vm_object_paging_end(copy_object);
2108
2109 /*
2110 * Since we dropped the copy_object's lock,
2111 * check whether we'll have to deallocate
2112 * the hard way.
2113 */
2114 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2115 vm_object_unlock(copy_object);
2116 vm_object_deallocate(copy_object);
2117 vm_object_lock(object);
2118
2119 continue;
2120 }
2121 if (kr == KERN_SUCCESS) {
2122 /*
2123 * The pager has the page. We don't want to overwrite
2124 * that page by sending this one out to the backing store.
2125 * So we drop the copy page.
2126 */
2127 VM_PAGE_FREE(copy_m);
2128
2129 } else {
2130 /*
2131 * The pager doesn't have the page. We'll keep this one
2132 * around in the copy object. It might get sent out to
2133 * the backing store under memory pressure.
2134 */
2135 vm_page_lockspin_queues();
2136 assert(!m->cleaning);
2137 vm_page_activate(copy_m);
2138 vm_page_unlock_queues();
2139
2140 SET_PAGE_DIRTY(copy_m, TRUE);
2141 PAGE_WAKEUP_DONE(copy_m);
2142 }
2143 } else {
2144
2145 assert(copy_m->busy == TRUE);
2146 assert(!m->cleaning);
2147
2148 /*
2149 * dirty is protected by the object lock
2150 */
2151 SET_PAGE_DIRTY(copy_m, TRUE);
2152
2153 /*
2154 * The page is already ready for pageout:
2155 * not on pageout queues and busy.
2156 * Unlock everything except the
2157 * copy_object itself.
2158 */
2159 vm_object_unlock(object);
2160
2161 /*
2162 * Write the page to the copy-object,
2163 * flushing it from the kernel.
2164 */
2165 vm_pageout_initialize_page(copy_m);
2166
2167 /*
2168 * Since the pageout may have
2169 * temporarily dropped the
2170 * copy_object's lock, we
2171 * check whether we'll have
2172 * to deallocate the hard way.
2173 */
2174 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2175 vm_object_unlock(copy_object);
2176 vm_object_deallocate(copy_object);
2177 vm_object_lock(object);
2178
2179 continue;
2180 }
2181 /*
2182 * Pick back up the old object's
2183 * lock. [It is safe to do so,
2184 * since it must be deeper in the
2185 * object tree.]
2186 */
2187 vm_object_lock(object);
2188 }
2189
2190 /*
2191 * Because we're pushing a page upward
2192 * in the object tree, we must restart
2193 * any faults that are waiting here.
2194 * [Note that this is an expansion of
2195 * PAGE_WAKEUP that uses the THREAD_RESTART
2196 * wait result]. Can't turn off the page's
2197 * busy bit because we're not done with it.
2198 */
2199 if (m->wanted) {
2200 m->wanted = FALSE;
2201 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2202 }
2203 }
2204 /*
2205 * The reference count on copy_object must be
2206 * at least 2: one for our extra reference,
2207 * and at least one from the outside world
2208 * (we checked that when we last locked
2209 * copy_object).
2210 */
2211 vm_object_lock_assert_exclusive(copy_object);
2212 copy_object->ref_count--;
2213 assert(copy_object->ref_count > 0);
2214
2215 VM_OBJ_RES_DECR(copy_object);
2216 vm_object_unlock(copy_object);
2217
2218 break;
2219 }
2220
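/*
 * done: hand back the page we settled on (and the placeholder page
 * still held in the top-level object, if any) to the caller.
 */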
2221 done:
2222 *result_page = m;
2223 *top_page = first_m;
2224
2225 XPR(XPR_VM_FAULT,
2226 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2227 object, offset, m, first_m, 0);
2228
2229 if (m != VM_PAGE_NULL) {
2230 retval = VM_FAULT_SUCCESS;
2231 if (my_fault == DBG_PAGEIN_FAULT) {
2232
2233 VM_STAT_INCR(pageins);
2234 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2235 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2236 current_task()->pageins++;
2237
2238 if (m->object->internal) {
2239 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2240 my_fault = DBG_PAGEIND_FAULT;
2241 } else {
2242 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2243 my_fault = DBG_PAGEINV_FAULT;
2244 }
2245
2246 /*
2247 * evaluate access pattern and update state
2248 * vm_fault_deactivate_behind depends on the
2249 * state being up to date
2250 */
2251 vm_fault_is_sequential(object, offset, fault_info->behavior);
2252
2253 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2254 }
2255 if (type_of_fault)
2256 *type_of_fault = my_fault;
2257 } else {
2258 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2259 assert(first_m == VM_PAGE_NULL);
2260 assert(object == first_object);
2261 }
2262
2263 thread_interrupt_level(interruptible_state);
2264
2265 #if TRACEFAULTPAGE
2266 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2267 #endif
2268 return retval;
2269
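/*
 * backoff: we blocked above waiting for a busy page in the copy
 * object; tell the caller whether to retry the fault or give up.
 */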
2270 backoff:
2271 thread_interrupt_level(interruptible_state);
2272
2273 if (wait_result == THREAD_INTERRUPTED)
2274 return (VM_FAULT_INTERRUPTED);
2275 return (VM_FAULT_RETRY);
2276
2277 #undef RELEASE_PAGE
2278 }
2279
2280
2281
2282 /*
2283 * CODE SIGNING:
2284 * When soft faulting a page, we have to validate the page if:
2285 * 1. the page is being mapped in user space
2286 * 2. the page hasn't already been found to be "tainted"
2287 * 3. the page belongs to a code-signed object
2288 * 4. the page has not been validated yet or has been mapped for write.
2289 */
2290 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
2291 ((pmap) != kernel_pmap /*1*/ && \
2292 !(page)->cs_tainted /*2*/ && \
2293 (page)->object->code_signed /*3*/ && \
2294 (!(page)->cs_validated || (page)->wpmapped /*4*/))
2295
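/*
 * Illustrative use of the check above (a sketch only -- the real call
 * sites are vm_fault_enter() below and the fast path in vm_fault()):
 *
 *	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m))
 *		vm_page_validate_cs(m);
 */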
2296
2297 /*
2298 * page queue lock must NOT be held
2299 * m->object must be locked
2300 *
2301 * NOTE: m->object could be locked "shared" only if we are called
2302 * from vm_fault() as part of a soft fault. If so, we must be
2303 * careful not to modify the VM object in any way that is not
2304 * legal under a shared lock...
2305 */
2306 unsigned long cs_enter_tainted_rejected = 0;
2307 unsigned long cs_enter_tainted_accepted = 0;
2308 kern_return_t
2309 vm_fault_enter(vm_page_t m,
2310 pmap_t pmap,
2311 vm_map_offset_t vaddr,
2312 vm_prot_t prot,
2313 vm_prot_t fault_type,
2314 boolean_t wired,
2315 boolean_t change_wiring,
2316 boolean_t no_cache,
2317 boolean_t cs_bypass,
2318 boolean_t *need_retry,
2319 int *type_of_fault)
2320 {
2321 kern_return_t kr, pe_result;
2322 boolean_t previously_pmapped = m->pmapped;
2323 boolean_t must_disconnect = 0;
2324 boolean_t map_is_switched, map_is_switch_protected;
2325
2326 vm_object_lock_assert_held(m->object);
2327 #if DEBUG
2328 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2329 #endif /* DEBUG */
2330
2331 if (m->phys_page == vm_page_guard_addr) {
2332 assert(m->fictitious);
2333 return KERN_SUCCESS;
2334 }
2335
2336 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2337
2338 vm_object_lock_assert_exclusive(m->object);
2339
2340 } else if ((fault_type & VM_PROT_WRITE) == 0) {
2341 /*
2342 * This is not a "write" fault, so we
2343 * might not have taken the object lock
2344 * exclusively and we might not be able
2345 * to update the "wpmapped" bit in
2346 * vm_fault_enter().
2347 * Let's just grant read access to
2348 * the page for now and we'll
2349 * soft-fault again if we need write
2350 * access later...
2351 */
2352 prot &= ~VM_PROT_WRITE;
2353 }
2354 if (m->pmapped == FALSE) {
2355
2356 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2357 /*
2358 * found it in the cache, but this
2359 * is the first fault-in of the page (m->pmapped == FALSE)
2360 * so it must have come in as part of
2361 * a cluster... account 1 pagein against it
2362 */
2363 VM_STAT_INCR(pageins);
2364 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2365
2366 if (m->object->internal) {
2367 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2368 *type_of_fault = DBG_PAGEIND_FAULT;
2369 } else {
2370 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2371 *type_of_fault = DBG_PAGEINV_FAULT;
2372 }
2373
2374 current_task()->pageins++;
2375 }
2376 VM_PAGE_CONSUME_CLUSTERED(m);
2377
2378 }
2379
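/*
 * account this as an address-space fault for DTrace unless it's a
 * COW fault (those fire their own cow_fault probe); faults against
 * the kernel pmap also fire kernel_asflt
 */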
2380 if (*type_of_fault != DBG_COW_FAULT) {
2381 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2382
2383 if (pmap == kernel_pmap) {
2384 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2385 }
2386 }
2387
2388 /* Validate code signature if necessary. */
2389 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2390 vm_object_lock_assert_exclusive(m->object);
2391
2392 if (m->cs_validated) {
2393 vm_cs_revalidates++;
2394 }
2395
2396 /* VM map is locked, so 1 ref will remain on VM object -
2397 * so no harm if vm_page_validate_cs drops the object lock */
2398 vm_page_validate_cs(m);
2399 }
2400
2401 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
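/* Note: with the protection check commented out above, any cs_validated
 * page is treated as immutable here, regardless of the requested protections. */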
2402
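/*
 * map_is_switched: the fault is against a pmap other than the current
 * task's, but it is the map the current thread is running on (i.e. the
 * thread has temporarily switched maps).
 */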
2403 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2404 (pmap == vm_map_pmap(current_thread()->map)));
2405 map_is_switch_protected = current_thread()->map->switch_protect;
2406
2407 /* If the map is switched, and is switch-protected, we must protect
2408 * some pages from being write-faulted: immutable pages because by
2409 * definition they may not be written, and executable pages because that
2410 * would provide a way to inject unsigned code.
2411 * If the page is immutable, we can simply return. However, we can't
2412 * immediately determine whether a page is executable anywhere. But
2413 * we can disconnect it everywhere and remove the executable protection
2414 * from the current map. We do that below right before we do the
2415 * PMAP_ENTER.
2416 */
2417 if(!cs_enforcement_disable && map_is_switched &&
2418 map_is_switch_protected && page_immutable(m, prot) &&
2419 (prot & VM_PROT_WRITE))
2420 {
2421 return KERN_CODESIGN_ERROR;
2422 }
2423
2424 /* A page could be tainted, or pose a risk of being tainted later.
2425 * Check whether the receiving process wants it, and make it feel
2426 * the consequences (that happens in cs_invalid_page()).
2427 * For CS Enforcement, two other conditions will
2428 * cause that page to be tainted as well:
2429 * - pmapping an unsigned page executable - this means unsigned code;
2430 * - writeable mapping of a validated page - the content of that page
2431 * can be changed without the kernel noticing, therefore unsigned
2432 * code can be created
2433 */
2434 if (m->cs_tainted ||
2435 (( !cs_enforcement_disable && !cs_bypass ) &&
2436 (/* The page is unsigned and wants to be executable */
2437 (!m->cs_validated && (prot & VM_PROT_EXECUTE)) ||
2438 /* The page should be immutable, but is in danger of being modified
2439 * This is the case where we want policy from the code directory -
2440 * is the page immutable or not? For now we have to assume that
2441 * code pages will be immutable, data pages not.
2442 * We'll assume a page is a code page if it has a code directory
2443 * and we fault for execution.
2444 * That is good enough since if we faulted the code page for
2445 * writing in another map before, it is wpmapped; if we fault
2446 * it for writing in this map later, it will also be faulted for executing
2447 * at the same time; and if we fault for writing in another map
2448 * later, we will disconnect it from this pmap so we'll notice
2449 * the change.
2450 */
2451 (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2452 ))
2453 )
2454 {
2455 /* We will have a tainted page. Have to handle the special case
2456 * of a switched map now. If the map is not switched, standard
2457 * procedure applies - call cs_invalid_page().
2458 * If the map is switched, the real owner is invalid already.
2459 * There is no point in invalidating the switching process since
2460 * it will not be executing from the map. So we don't call
2461 * cs_invalid_page() in that case. */
2462 boolean_t reject_page;
2463 if(map_is_switched) {
2464 assert(pmap==vm_map_pmap(current_thread()->map));
2465 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2466 reject_page = FALSE;
2467 } else {
2468 reject_page = cs_invalid_page((addr64_t) vaddr);
2469 }
2470
2471 if (reject_page) {
2472 /* reject the tainted page: abort the page fault */
2473 kr = KERN_CODESIGN_ERROR;
2474 cs_enter_tainted_rejected++;
2475 } else {
2476 /* proceed with the tainted page */
2477 kr = KERN_SUCCESS;
2478 /* Page might have been tainted before or not; now it
2479 * definitively is. If the page wasn't tainted, we must
2480 * disconnect it from all pmaps later. */
2481 must_disconnect = !m->cs_tainted;
2482 m->cs_tainted = TRUE;
2483 cs_enter_tainted_accepted++;
2484 }
2485 if (cs_debug || kr != KERN_SUCCESS) {
2486 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2487 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2488 (long long)vaddr, m, m->object, m->offset);
2489 }
2490
2491 } else {
2492 /* proceed with the valid page */
2493 kr = KERN_SUCCESS;
2494 }
2495
2496 /* If we have a KERN_SUCCESS from the previous checks, we either have
2497 * a good page, or a tainted page that has been accepted by the process.
2498 * In both cases the page will be entered into the pmap.
2499 * If the page is writeable, we need to disconnect it from other pmaps
2500 * now so those processes can take note.
2501 */
2502 if (kr == KERN_SUCCESS) {
2503 /*
2504 * NOTE: we may only hold the vm_object lock SHARED
2505 * at this point, but the update of pmapped is ok
2506 * since this is the ONLY bit updated behind the SHARED
2507 * lock... however, we need to figure out how to do an atomic
2508 * update on a bit field to make this less fragile... right
2509 * now I don't know how to coerce 'C' to give me the offset info
2510 * that's needed for an AtomicCompareAndSwap
2511 */
2512 m->pmapped = TRUE;
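/*
 * if this page belongs to a region that still needs to be "slid"
 * (shared-region sliding), apply the slide before the pmap_enter;
 * a failure here aborts the fault
 */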
2513 if(vm_page_is_slideable(m)) {
2514 boolean_t was_busy = m->busy;
2515 m->busy = TRUE;
2516 kr = vm_page_slide(m, 0);
2517 assert(m->busy);
2518 if(!was_busy) {
2519 PAGE_WAKEUP_DONE(m);
2520 }
2521 if (kr != KERN_SUCCESS) {
2522 /*
2523 * This page has not been slid correctly,
2524 * do not do the pmap_enter() !
2525 * Let vm_fault_enter() return the error
2526 * so the caller can fail the fault.
2527 */
2528 goto after_the_pmap_enter;
2529 }
2530 }
2531
2532 if (fault_type & VM_PROT_WRITE) {
2533
2534 if (m->wpmapped == FALSE) {
2535 vm_object_lock_assert_exclusive(m->object);
2536
2537 m->wpmapped = TRUE;
2538 }
2539 if (must_disconnect) {
2540 /*
2541 * We can only get here
2542 * because of the CSE logic
2543 */
2544 assert(cs_enforcement_disable == FALSE);
2545 pmap_disconnect(m->phys_page);
2546 /*
2547 * If we are faulting for a write, we can clear
2548 * the execute bit - that will ensure the page is
2549 * checked again before being executable, which
2550 * protects against a map switch.
2551 * This only happens the first time the page
2552 * gets tainted, so we won't get stuck here
2553 * to make an already writeable page executable.
2554 */
2555 if (!cs_bypass){
2556 prot &= ~VM_PROT_EXECUTE;
2557 }
2558 }
2559 }
2560
2561 /* Prevent a deadlock by not
2562 * holding the object lock if we need to wait for a page in
2563 * pmap_enter() - <rdar://problem/7138958> */
2564 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
2565 wired, PMAP_OPTIONS_NOWAIT, pe_result);
2566
2567 if(pe_result == KERN_RESOURCE_SHORTAGE) {
2568
2569 if (need_retry) {
2570 /*
2571 * this will be non-null in the case where we hold the lock
2572 * on the top-object in this chain... we can't just drop
2573 * the lock on the object we're inserting the page into
2574 * and recall the PMAP_ENTER since we can still cause
2575 * a deadlock if one of the critical paths tries to
2576 * acquire the lock on the top-object and we're blocked
2577 * in PMAP_ENTER waiting for memory... our only recourse
2578 * is to deal with it at a higher level where we can
2579 * drop both locks.
2580 */
2581 *need_retry = TRUE;
2582 vm_pmap_enter_retried++;
2583 goto after_the_pmap_enter;
2584 }
2585 /* The nonblocking version of pmap_enter did not succeed,
2586 * and we don't need to drop other locks and retry
2587 * at the level above us, so
2588 * use the blocking version instead.  This requires marking
2589 * the page busy and unlocking the object. */
2590 boolean_t was_busy = m->busy;
2591 m->busy = TRUE;
2592 vm_object_unlock(m->object);
2593
2594 PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
2595
2596 /* Take the object lock again. */
2597 vm_object_lock(m->object);
2598
2599 /* If the page was busy, someone else will wake it up.
2600 * Otherwise, we have to do it now. */
2601 assert(m->busy);
2602 if(!was_busy) {
2603 PAGE_WAKEUP_DONE(m);
2604 }
2605 vm_pmap_enter_blocked++;
2606 }
2607 }
2608
2609 after_the_pmap_enter:
2610 /*
2611 * Hold queues lock to manipulate
2612 * the page queues. Change wiring
2613 * case is obvious.
2614 */
2615 if (change_wiring) {
2616 vm_page_lockspin_queues();
2617
2618 if (wired) {
2619 if (kr == KERN_SUCCESS) {
2620 vm_page_wire(m);
2621 }
2622 } else {
2623 vm_page_unwire(m, TRUE);
2624 }
2625 vm_page_unlock_queues();
2626
2627 } else {
2628 if (kr != KERN_SUCCESS) {
2629 vm_page_lockspin_queues();
2630 vm_page_deactivate(m);
2631 vm_page_unlock_queues();
2632 } else {
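/*
 * the page isn't wired or throttled and either isn't on the
 * active/inactive queues, sits on the "cleaned" queue, or is being
 * mapped no_cache... decide which queue it should live on
 */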
2633 if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
2634
2635 if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2636 struct vpl *lq;
2637 uint32_t lid;
2638
2639 /*
2640 * we got a local queue to stuff this new page on...
2641 * it's safe to manipulate local and local_id at this point
2642 * since we're behind an exclusive object lock and the
2643 * page is not on any global queue.
2644 *
2645 * we'll use the current cpu number to select the queue
2646 * note that we don't need to disable preemption... we're
2647 * going to be behind the local queue's lock to do the real
2648 * work
2649 */
2650 lid = cpu_number();
2651
2652 lq = &vm_page_local_q[lid].vpl_un.vpl;
2653
2654 VPL_LOCK(&lq->vpl_lock);
2655
2656 queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2657 m->local = TRUE;
2658 m->local_id = lid;
2659 lq->vpl_count++;
2660
2661 VPL_UNLOCK(&lq->vpl_lock);
2662
2663 if (lq->vpl_count > vm_page_local_q_soft_limit) {
2664 /*
2665 * we're beyond the soft limit for the local queue
2666 * vm_page_reactivate_local will 'try' to take
2667 * the global page queue lock... if it can't that's
2668 * ok... we'll let the queue continue to grow up
2669 * to the hard limit... at that point we'll wait
2670 * for the lock... once we've got the lock, we'll
2671 * transfer all of the pages from the local queue
2672 * to the global active queue
2673 */
2674 vm_page_reactivate_local(lid, FALSE, FALSE);
2675 }
2676 return kr;
2677 }
2678
2679 vm_page_lockspin_queues();
2680 /*
2681 * test again now that we hold the page queue lock
2682 */
2683 if (!VM_PAGE_WIRED(m)) {
2684 if (m->clean_queue) {
2685 VM_PAGE_QUEUES_REMOVE(m);
2686
2687 vm_pageout_cleaned_reactivated++;
2688 vm_pageout_cleaned_fault_reactivated++;
2689 }
2690
2691 if ((!m->active && !m->inactive) || no_cache) {
2692 /*
2693 * If this is a no_cache mapping and the page has never been
2694 * mapped before or was previously a no_cache page, then we
2695 * want to leave pages in the speculative state so that they
2696 * can be readily recycled if free memory runs low. Otherwise
2697 * the page is activated as normal.
2698 */
2699
2700 if (no_cache && (!previously_pmapped || m->no_cache)) {
2701 m->no_cache = TRUE;
2702
2703 if (!m->speculative)
2704 vm_page_speculate(m, FALSE);
2705
2706 } else if (!m->active && !m->inactive) {
2707
2708 vm_page_activate(m);
2709 }
2710 }
2711 }
2712 vm_page_unlock_queues();
2713 }
2714 }
2715 }
2716 return kr;
2717 }
2718
2719
2720 /*
2721 * Routine: vm_fault
2722 * Purpose:
2723 * Handle page faults, including pseudo-faults
2724 * used to change the wiring status of pages.
2725 * Returns:
2726 * Explicit continuations have been removed.
2727 * Implementation:
2728 * vm_fault and vm_fault_page save mucho state
2729 * in the moral equivalent of a closure. The state
2730 * structure is allocated when first entering vm_fault
2731 * and deallocated when leaving vm_fault.
2732 */
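/*
 * Illustrative call (a sketch only, not an actual call site): a trap
 * handler resolving a user-space fault would do something along the
 * lines of
 *
 *	kr = vm_fault(map, vm_map_trunc_page(fault_addr), fault_type,
 *		      FALSE, THREAD_ABORTSAFE, NULL, 0);
 *
 * passing a NULL caller_pmap so the pmap is taken from the map itself.
 */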
2733
2734 extern int _map_enter_debug;
2735
2736 unsigned long vm_fault_collapse_total = 0;
2737 unsigned long vm_fault_collapse_skipped = 0;
2738
2739 kern_return_t
2740 vm_fault(
2741 vm_map_t map,
2742 vm_map_offset_t vaddr,
2743 vm_prot_t fault_type,
2744 boolean_t change_wiring,
2745 int interruptible,
2746 pmap_t caller_pmap,
2747 vm_map_offset_t caller_pmap_addr)
2748 {
2749 vm_map_version_t version; /* Map version for verification */
2750 boolean_t wired; /* Should mapping be wired down? */
2751 vm_object_t object; /* Top-level object */
2752 vm_object_offset_t offset; /* Top-level offset */
2753 vm_prot_t prot; /* Protection for mapping */
2754 vm_object_t old_copy_object; /* Saved copy object */
2755 vm_page_t result_page; /* Result of vm_fault_page */
2756 vm_page_t top_page; /* Placeholder page */
2757 kern_return_t kr;
2758
2759 vm_page_t m; /* Fast access to result_page */
2760 kern_return_t error_code;
2761 vm_object_t cur_object;
2762 vm_object_offset_t cur_offset;
2763 vm_page_t cur_m;
2764 vm_object_t new_object;
2765 int type_of_fault;
2766 pmap_t pmap;
2767 boolean_t interruptible_state;
2768 vm_map_t real_map = map;
2769 vm_map_t original_map = map;
2770 vm_prot_t original_fault_type;
2771 struct vm_object_fault_info fault_info;
2772 boolean_t need_collapse = FALSE;
2773 boolean_t need_retry = FALSE;
2774 int object_lock_type = 0;
2775 int cur_object_lock_type;
2776 vm_object_t top_object = VM_OBJECT_NULL;
2777 int throttle_delay;
2778
2779
2780 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2781 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2782 (int)((uint64_t)vaddr >> 32),
2783 (int)vaddr,
2784 (map == kernel_map),
2785 0,
2786 0);
2787
2788 if (get_preemption_level() != 0) {
2789 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2790 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2791 (int)((uint64_t)vaddr >> 32),
2792 (int)vaddr,
2793 KERN_FAILURE,
2794 0,
2795 0);
2796
2797 return (KERN_FAILURE);
2798 }
2799
2800 interruptible_state = thread_interrupt_level(interruptible);
2801
2802 VM_STAT_INCR(faults);
2803 current_task()->faults++;
2804 original_fault_type = fault_type;
2805
2806 if (fault_type & VM_PROT_WRITE)
2807 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2808 else
2809 object_lock_type = OBJECT_LOCK_SHARED;
2810
2811 cur_object_lock_type = OBJECT_LOCK_SHARED;
2812
2813 RetryFault:
2814 /*
2815 * assume we will hit a page in the cache
2816 * otherwise, explicitly override with
2817 * the real fault type once we determine it
2818 */
2819 type_of_fault = DBG_CACHE_HIT_FAULT;
2820
2821 /*
2822 * Find the backing store object and offset into
2823 * it to begin the search.
2824 */
2825 fault_type = original_fault_type;
2826 map = original_map;
2827 vm_map_lock_read(map);
2828
2829 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2830 object_lock_type, &version,
2831 &object, &offset, &prot, &wired,
2832 &fault_info,
2833 &real_map);
2834
2835 if (kr != KERN_SUCCESS) {
2836 vm_map_unlock_read(map);
2837 goto done;
2838 }
2839 pmap = real_map->pmap;
2840 fault_info.interruptible = interruptible;
2841 fault_info.stealth = FALSE;
2842 fault_info.io_sync = FALSE;
2843 fault_info.mark_zf_absent = FALSE;
2844 fault_info.batch_pmap_op = FALSE;
2845
2846 /*
2847 * If the page is wired, we must fault for the current protection
2848 * value, to avoid further faults.
2849 */
2850 if (wired) {
2851 fault_type = prot | VM_PROT_WRITE;
2852 /*
2853 * since we're treating this fault as a 'write',
2854 * we must hold the top object lock exclusively
2855 */
2856 if (object_lock_type == OBJECT_LOCK_SHARED) {
2857
2858 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2859
2860 if (vm_object_lock_upgrade(object) == FALSE) {
2861 /*
2862 * couldn't upgrade, so explicitly
2863 * take the lock exclusively
2864 */
2865 vm_object_lock(object);
2866 }
2867 }
2868 }
2869
2870 #if VM_FAULT_CLASSIFY
2871 /*
2872 * Temporary data gathering code
2873 */
2874 vm_fault_classify(object, offset, fault_type);
2875 #endif
2876 /*
2877 * Fast fault code. The basic idea is to do as much as
2878 * possible while holding the map lock and object locks.
2879 * Busy pages are not used until the object lock has to
2880 * be dropped to do something (copy, zero fill, pmap enter).
2881 * Similarly, paging references aren't acquired until that
2882 * point, and object references aren't used.
2883 *
2884 * If we can figure out what to do
2885 * (zero fill, copy on write, pmap enter) while holding
2886 * the locks, then it gets done. Otherwise, we give up,
2887 * and use the original fault path (which doesn't hold
2888 * the map lock, and relies on busy pages).
2889 * The give up cases include:
2890 * - Have to talk to pager.
2891 * - Page is busy, absent or in error.
2892 * - Pager has locked out desired access.
2893 * - Fault needs to be restarted.
2894 * - Have to push page into copy object.
2895 *
2896 * The code is an infinite loop that moves one level down
2897 * the shadow chain each time. cur_object and cur_offset
2898 * refer to the current object being examined. object and offset
2899 * are the original object from the map. The loop is at the
2900 * top level if and only if object and cur_object are the same.
2901 *
2902 * Invariants: Map lock is held throughout. Lock is held on
2903 * original object and cur_object (if different) when
2904 * continuing or exiting loop.
2905 *
2906 */
2907
2908
2909 /*
2910 * If this page is to be inserted in a copy delay object
2911 * for writing, and if the object has a copy, then the
2912 * copy delay strategy is implemented in the slow fault path.
2913 */
2914 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2915 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2916 goto handle_copy_delay;
2917
2918 cur_object = object;
2919 cur_offset = offset;
2920
2921 while (TRUE) {
2922 if (!cur_object->pager_created &&
2923 cur_object->phys_contiguous) /* superpage */
2924 break;
2925
2926 if (cur_object->blocked_access) {
2927 /*
2928 * Access to this VM object has been blocked.
2929 * Let the slow path handle it.
2930 */
2931 break;
2932 }
2933
2934 m = vm_page_lookup(cur_object, cur_offset);
2935
2936 if (m != VM_PAGE_NULL) {
2937 if (m->busy) {
2938 wait_result_t result;
2939
2940 /*
2941 * in order to do the PAGE_ASSERT_WAIT, we must
2942 * have object that 'm' belongs to locked exclusively
2943 */
2944 if (object != cur_object) {
2945 vm_object_unlock(object);
2946
2947 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2948
2949 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2950
2951 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2952 /*
2953 * couldn't upgrade so go do a full retry
2954 * immediately since we've already dropped
2955 * the top object lock associated with this page
2956 * and the current one got dropped due to the
2957 * failed upgrade... the state is no longer valid
2958 */
2959 vm_map_unlock_read(map);
2960 if (real_map != map)
2961 vm_map_unlock(real_map);
2962
2963 goto RetryFault;
2964 }
2965 }
2966 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2967
2968 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2969
2970 if (vm_object_lock_upgrade(object) == FALSE) {
2971 /*
2972 * couldn't upgrade, so explicitly take the lock
2973 * exclusively and go relookup the page since we
2974 * will have dropped the object lock and
2975 * a different thread could have inserted
2976 * a page at this offset
2977 * no need for a full retry since we're
2978 * at the top level of the object chain
2979 */
2980 vm_object_lock(object);
2981
2982 continue;
2983 }
2984 }
2985 vm_map_unlock_read(map);
2986 if (real_map != map)
2987 vm_map_unlock(real_map);
2988
2989 result = PAGE_ASSERT_WAIT(m, interruptible);
2990
2991 vm_object_unlock(cur_object);
2992
2993 if (result == THREAD_WAITING) {
2994 result = thread_block(THREAD_CONTINUE_NULL);
2995
2996 counter(c_vm_fault_page_block_busy_kernel++);
2997 }
2998 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2999 goto RetryFault;
3000
3001 kr = KERN_ABORTED;
3002 goto done;
3003 }
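/*
 * the page is headed to (or in) the laundry... steal it back from
 * the pageout machinery so this fault can proceed; that needs the
 * exclusive lock on the page's object, hence the upgrades below
 */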
3004 if (m->laundry) {
3005 if (object != cur_object) {
3006 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3007 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3008
3009 vm_object_unlock(object);
3010 vm_object_unlock(cur_object);
3011
3012 vm_map_unlock_read(map);
3013 if (real_map != map)
3014 vm_map_unlock(real_map);
3015
3016 goto RetryFault;
3017 }
3018
3019 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3020
3021 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3022
3023 if (vm_object_lock_upgrade(object) == FALSE) {
3024 /*
3025 * couldn't upgrade, so explicitly take the lock
3026 * exclusively and go relookup the page since we
3027 * will have dropped the object lock and
3028 * a different thread could have inserted
3029 * a page at this offset
3030 * no need for a full retry since we're
3031 * at the top level of the object chain
3032 */
3033 vm_object_lock(object);
3034
3035 continue;
3036 }
3037 }
3038 m->pageout = FALSE;
3039
3040 vm_pageout_steal_laundry(m, FALSE);
3041 }
3042
3043 if (m->phys_page == vm_page_guard_addr) {
3044 /*
3045 * Guard page: let the slow path deal with it
3046 */
3047 break;
3048 }
3049 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3050 /*
3051 * Unusual case... let the slow path deal with it
3052 */
3053 break;
3054 }
3055 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3056 if (object != cur_object)
3057 vm_object_unlock(object);
3058 vm_map_unlock_read(map);
3059 if (real_map != map)
3060 vm_map_unlock(real_map);
3061 vm_object_unlock(cur_object);
3062 kr = KERN_MEMORY_ERROR;
3063 goto done;
3064 }
3065
3066 if (m->encrypted) {
3067 /*
3068 * ENCRYPTED SWAP:
3069 * We've soft-faulted (because it's not in the page
3070 * table) on an encrypted page.
3071 * Keep the page "busy" so that no one messes with
3072 * it during the decryption.
3073 * Release the extra locks we're holding, keep only
3074 * the page's VM object lock.
3075 *
3076 * in order to set 'busy' on 'm', we must
3077 * have object that 'm' belongs to locked exclusively
3078 */
3079 if (object != cur_object) {
3080 vm_object_unlock(object);
3081
3082 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3083
3084 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3085
3086 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3087 /*
3088 * couldn't upgrade so go do a full retry
3089 * immediately since we've already dropped
3090 * the top object lock associated with this page
3091 * and the current one got dropped due to the
3092 * failed upgrade... the state is no longer valid
3093 */
3094 vm_map_unlock_read(map);
3095 if (real_map != map)
3096 vm_map_unlock(real_map);
3097
3098 goto RetryFault;
3099 }
3100 }
3101 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3102
3103 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3104
3105 if (vm_object_lock_upgrade(object) == FALSE) {
3106 /*
3107 * couldn't upgrade, so explicitly take the lock
3108 * exclusively and go relookup the page since we
3109 * will have dropped the object lock and
3110 * a different thread could have inserted
3111 * a page at this offset
3112 * no need for a full retry since we're
3113 * at the top level of the object chain
3114 */
3115 vm_object_lock(object);
3116
3117 continue;
3118 }
3119 }
3120 m->busy = TRUE;
3121
3122 vm_map_unlock_read(map);
3123 if (real_map != map)
3124 vm_map_unlock(real_map);
3125
3126 vm_page_decrypt(m, 0);
3127
3128 assert(m->busy);
3129 PAGE_WAKEUP_DONE(m);
3130
3131 vm_object_unlock(cur_object);
3132 /*
3133 * Retry from the top, in case anything
3134 * changed while we were decrypting...
3135 */
3136 goto RetryFault;
3137 }
3138 ASSERT_PAGE_DECRYPTED(m);
3139
3140 if(vm_page_is_slideable(m)) {
3141 /*
3142 * We might need to slide this page, and so,
3143 * we want to hold the VM object exclusively.
3144 */
3145 if (object != cur_object) {
3146 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3147 vm_object_unlock(object);
3148 vm_object_unlock(cur_object);
3149
3150 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3151
3152 vm_map_unlock_read(map);
3153 if (real_map != map)
3154 vm_map_unlock(real_map);
3155
3156 goto RetryFault;
3157 }
3158 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3159
3160 vm_object_unlock(object);
3161 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3162 vm_map_unlock_read(map);
3163 goto RetryFault;
3164 }
3165 }
3166
3167 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3168 upgrade_for_validation:
3169 /*
3170 * We might need to validate this page
3171 * against its code signature, so we
3172 * want to hold the VM object exclusively.
3173 */
3174 if (object != cur_object) {
3175 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3176 vm_object_unlock(object);
3177 vm_object_unlock(cur_object);
3178
3179 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3180
3181 vm_map_unlock_read(map);
3182 if (real_map != map)
3183 vm_map_unlock(real_map);
3184
3185 goto RetryFault;
3186 }
3187
3188 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3189
3190 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3191
3192 if (vm_object_lock_upgrade(object) == FALSE) {
3193 /*
3194 * couldn't upgrade, so explicitly take the lock
3195 * exclusively and go relookup the page since we
3196 * will have dropped the object lock and
3197 * a different thread could have inserted
3198 * a page at this offset
3199 * no need for a full retry since we're
3200 * at the top level of the object chain
3201 */
3202 vm_object_lock(object);
3203
3204 continue;
3205 }
3206 }
3207 }
3208 /*
3209 * Two cases of map in faults:
3210 * - At top level w/o copy object.
3211 * - Read fault anywhere.
3212 * --> must disallow write.
3213 */
3214
3215 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3216
3217 goto FastPmapEnter;
3218 }
3219
3220 if ((fault_type & VM_PROT_WRITE) == 0) {
3221
3222 if (object != cur_object) {
3223 /*
3224 * We still need to hold the top object
3225 * lock here to prevent a race between
3226 * a read fault (taking only "shared"
3227 * locks) and a write fault (taking
3228 * an "exclusive" lock on the top
3229 * object).
3230 * Otherwise, as soon as we release the
3231 * top lock, the write fault could
3232 * proceed and actually complete before
3233 * the read fault, and the copied page's
3234 * translation could then be overwritten
3235 * by the read fault's translation for
3236 * the original page.
3237 *
3238 * Let's just record what the top object
3239 * is and we'll release it later.
3240 */
3241 top_object = object;
3242
3243 /*
3244 * switch to the object that has the new page
3245 */
3246 object = cur_object;
3247 object_lock_type = cur_object_lock_type;
3248 }
3249 FastPmapEnter:
3250 /*
3251 * prepare for the pmap_enter...
3252 * object and map are both locked
3253 * m contains valid data
3254 * object == m->object
3255 * cur_object == NULL or it's been unlocked
3256 * no paging references on either object or cur_object
3257 */
3258 if (caller_pmap) {
3259 kr = vm_fault_enter(m,
3260 caller_pmap,
3261 caller_pmap_addr,
3262 prot,
3263 fault_type,
3264 wired,
3265 change_wiring,
3266 fault_info.no_cache,
3267 fault_info.cs_bypass,
3268 (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3269 &type_of_fault);
3270 } else {
3271 kr = vm_fault_enter(m,
3272 pmap,
3273 vaddr,
3274 prot,
3275 fault_type,
3276 wired,
3277 change_wiring,
3278 fault_info.no_cache,
3279 fault_info.cs_bypass,
3280 (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3281 &type_of_fault);
3282 }
3283
3284 if (top_object != VM_OBJECT_NULL) {
3285 /*
3286 * It's safe to drop the top object
3287 * now that we've done our
3288 * vm_fault_enter(). Any other fault
3289 * in progress for that virtual
3290 * address will either find our page
3291 * and translation or put in a new page
3292 * and translation.
3293 */
3294 vm_object_unlock(top_object);
3295 top_object = VM_OBJECT_NULL;
3296 }
3297
3298 if (need_collapse == TRUE)
3299 vm_object_collapse(object, offset, TRUE);
3300
3301 if (need_retry == FALSE &&
3302 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3303 /*
3304 * evaluate access pattern and update state
3305 * vm_fault_deactivate_behind depends on the
3306 * state being up to date
3307 */
3308 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3309
3310 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3311 }
3312 /*
3313 * That's it, clean up and return.
3314 */
3315 if (m->busy)
3316 PAGE_WAKEUP_DONE(m);
3317
3318 vm_object_unlock(object);
3319
3320 vm_map_unlock_read(map);
3321 if (real_map != map)
3322 vm_map_unlock(real_map);
3323
3324 if (need_retry == TRUE) {
3325 /*
3326 * vm_fault_enter couldn't complete the PMAP_ENTER...
3327 * at this point we don't hold any locks so it's safe
3328 * to ask the pmap layer to expand the page table to
3329 * accommodate this mapping... once expanded, we'll
3330 * re-drive the fault which should result in vm_fault_enter
3331 * being able to successfully enter the mapping this time around
3332 */
3333 (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER);
3334
3335 need_retry = FALSE;
3336 goto RetryFault;
3337 }
3338 goto done;
3339 }
3340 /*
3341 * COPY ON WRITE FAULT
3342 */
3343 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3344
3345 if ((throttle_delay = vm_page_throttled())) {
3346 /*
3347 * drop all of our locks...
3348 * wait until the free queue is
3349 * pumped back up and then
3350 * redrive the fault
3351 */
3352 if (object != cur_object)
3353 vm_object_unlock(cur_object);
3354 vm_object_unlock(object);
3355 vm_map_unlock_read(map);
3356 if (real_map != map)
3357 vm_map_unlock(real_map);
3358
3359 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3360
3361 delay(throttle_delay);
3362
3363 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3364 THREAD_UNINT :
3365 THREAD_ABORTSAFE))
3366 goto RetryFault;
3367 kr = KERN_ABORTED;
3368 goto done;
3369 }
3370 /*
3371 * If objects match, then
3372 * object->copy must not be NULL (else control
3373 * would be in the previous code block), and we
3374 * have a potential push into the copy object
3375 * which we can't cope with here.
3376 */
3377 if (cur_object == object) {
3378 /*
3379 * must take the slow path to
3380 * deal with the copy push
3381 */
3382 break;
3383 }
3384
3385 /*
3386 * This is now a shadow based copy on write
3387 * fault -- it requires a copy up the shadow
3388 * chain.
3389 */
3390
3391 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3392 VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3393 goto upgrade_for_validation;
3394 }
3395
3396 /*
3397 * Allocate a page in the original top level
3398 * object. Give up if allocate fails. Also
3399 * need to remember current page, as it's the
3400 * source of the copy.
3401 *
3402 * at this point we hold locks on both
3403 * object and cur_object... no need to take
3404 * paging refs or mark pages BUSY since
3405 * we don't drop either object lock until
3406 * the page has been copied and inserted
3407 */
3408 cur_m = m;
3409 m = vm_page_grab();
3410
3411 if (m == VM_PAGE_NULL) {
3412 /*
3413 * no free page currently available...
3414 * must take the slow path
3415 */
3416 break;
3417 }
3418 /*
3419 * Now do the copy (no need to mark the source page busy here)...
3420 *
3421 * NOTE: This code holds the map lock across
3422 * the page copy.
3423 */
3424 vm_page_copy(cur_m, m);
3425 vm_page_insert(m, object, offset);
3426 SET_PAGE_DIRTY(m, FALSE);
3427
3428 /*
3429 * Now cope with the source page and object
3430 */
3431 if (object->ref_count > 1 && cur_m->pmapped)
3432 pmap_disconnect(cur_m->phys_page);
3433
3434 need_collapse = TRUE;
3435
3436 if (!cur_object->internal &&
3437 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3438 /*
3439 * The object from which we've just
3440 * copied a page is most probably backed
3441 * by a vnode. We don't want to waste too
3442 * much time trying to collapse the VM objects
3443 * and create a bottleneck when several tasks
3444 * map the same file.
3445 */
3446 if (cur_object->copy == object) {
3447 /*
3448 * Shared mapping or no COW yet.
3449 * We can never collapse a copy
3450 * object into its backing object.
3451 */
3452 need_collapse = FALSE;
3453 } else if (cur_object->copy == object->shadow &&
3454 object->shadow->resident_page_count == 0) {
3455 /*
3456 * Shared mapping after a COW occurred.
3457 */
3458 need_collapse = FALSE;
3459 }
3460 }
3461 vm_object_unlock(cur_object);
3462
3463 if (need_collapse == FALSE)
3464 vm_fault_collapse_skipped++;
3465 vm_fault_collapse_total++;
3466
3467 type_of_fault = DBG_COW_FAULT;
3468 VM_STAT_INCR(cow_faults);
3469 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3470 current_task()->cow_faults++;
3471
3472 goto FastPmapEnter;
3473
3474 } else {
3475 /*
3476 * No page at cur_object, cur_offset... m == NULL
3477 */
3478 if (cur_object->pager_created) {
3479 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3480 /*
3481 * May have to talk to a pager...
3482 * take the slow path.
3483 */
3484 break;
3485 }
3486 /*
3487 * existence map present and indicates
3488 * that the pager doesn't have this page
3489 */
3490 }
3491 if (cur_object->shadow == VM_OBJECT_NULL) {
3492 /*
3493 * Zero fill fault. Page gets
3494 * inserted into the original object.
3495 */
3496 if (cur_object->shadow_severed ||
3497 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3498 {
3499 if (object != cur_object)
3500 vm_object_unlock(cur_object);
3501 vm_object_unlock(object);
3502
3503 vm_map_unlock_read(map);
3504 if (real_map != map)
3505 vm_map_unlock(real_map);
3506
3507 kr = KERN_MEMORY_ERROR;
3508 goto done;
3509 }
3510 if ((throttle_delay = vm_page_throttled())) {
3511 /*
3512 * drop all of our locks...
3513 * wait until the free queue is
3514 * pumped back up and then
3515 * redrive the fault
3516 */
3517 if (object != cur_object)
3518 vm_object_unlock(cur_object);
3519 vm_object_unlock(object);
3520 vm_map_unlock_read(map);
3521 if (real_map != map)
3522 vm_map_unlock(real_map);
3523
3524 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3525
3526 delay(throttle_delay);
3527
3528 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3529 THREAD_UNINT :
3530 THREAD_ABORTSAFE))
3531 goto RetryFault;
3532 kr = KERN_ABORTED;
3533 goto done;
3534 }
3535 if (vm_backing_store_low) {
3536 /*
3537 * we are protecting the system from
3538 * backing store exhaustion...
3539 * must take the slow path if we're
3540 * not privileged
3541 */
3542 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3543 break;
3544 }
3545 if (cur_object != object) {
3546 vm_object_unlock(cur_object);
3547
3548 cur_object = object;
3549 }
3550 if (object_lock_type == OBJECT_LOCK_SHARED) {
3551
3552 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3553
3554 if (vm_object_lock_upgrade(object) == FALSE) {
3555 /*
3556 * couldn't upgrade so do a full retry on the fault
3557 * since we dropped the object lock which
3558 * could allow another thread to insert
3559 * a page at this offset
3560 */
3561 vm_map_unlock_read(map);
3562 if (real_map != map)
3563 vm_map_unlock(real_map);
3564
3565 goto RetryFault;
3566 }
3567 }
3568 m = vm_page_alloc(object, offset);
3569
3570 if (m == VM_PAGE_NULL) {
3571 /*
3572 * no free page currently available...
3573 * must take the slow path
3574 */
3575 break;
3576 }
3577
3578 /*
3579 * Now zero fill page...
3580 * the page is probably going to
3581 * be written soon, so don't bother
3582 * to clear the modified bit
3583 *
3584 * NOTE: This code holds the map
3585 * lock across the zero fill.
3586 */
3587 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
3588
3589 goto FastPmapEnter;
3590 }
3591 /*
3592 * On to the next level in the shadow chain
3593 */
3594 cur_offset += cur_object->vo_shadow_offset;
3595 new_object = cur_object->shadow;
3596
3597 /*
3598 * take the new_object's lock with the indicated state
3599 */
3600 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3601 vm_object_lock_shared(new_object);
3602 else
3603 vm_object_lock(new_object);
3604
3605 if (cur_object != object)
3606 vm_object_unlock(cur_object);
3607
3608 cur_object = new_object;
3609
3610 continue;
3611 }
3612 }
3613 /*
3614 * Cleanup from fast fault failure. Drop any object
3615 * lock other than original and drop map lock.
3616 */
3617 if (object != cur_object)
3618 vm_object_unlock(cur_object);
3619
3620 /*
3621 * must own the object lock exclusively at this point
3622 */
3623 if (object_lock_type == OBJECT_LOCK_SHARED) {
3624 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3625
3626 if (vm_object_lock_upgrade(object) == FALSE) {
3627 /*
3628 * couldn't upgrade, so explicitly
3629 * take the lock exclusively
3630 * no need to retry the fault at this
3631 * point since "vm_fault_page" will
3632 * completely re-evaluate the state
3633 */
3634 vm_object_lock(object);
3635 }
3636 }
3637
3638 handle_copy_delay:
3639 vm_map_unlock_read(map);
3640 if (real_map != map)
3641 vm_map_unlock(real_map);
3642
3643 /*
3644 * Make a reference to this object to
3645 * prevent its disposal while we are messing with
3646 * it. Once we have the reference, the map is free
3647 * to be diddled. Since objects reference their
3648 * shadows (and copies), they will stay around as well.
3649 */
3650 vm_object_reference_locked(object);
3651 vm_object_paging_begin(object);
3652
3653 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
3654
3655 error_code = 0;
3656
3657 kr = vm_fault_page(object, offset, fault_type,
3658 (change_wiring && !wired),
3659 &prot, &result_page, &top_page,
3660 &type_of_fault,
3661 &error_code, map->no_zero_fill,
3662 FALSE, &fault_info);
3663
3664 /*
3665 * if kr != VM_FAULT_SUCCESS, then the paging reference
3666 * has been dropped and the object unlocked... the ref_count
3667 * is still held
3668 *
3669 * if kr == VM_FAULT_SUCCESS, then the paging reference
3670 * is still held along with the ref_count on the original object
3671 *
3672 * the object is returned locked with a paging reference
3673 *
3674 * if top_page != NULL, then it's BUSY and the
3675 * object it belongs to has a paging reference
3676 * but is returned unlocked
3677 */
3678 if (kr != VM_FAULT_SUCCESS &&
3679 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
3680 /*
3681 * we didn't succeed, lose the object reference immediately.
3682 */
3683 vm_object_deallocate(object);
3684
3685 /*
3686 * See why we failed, and take corrective action.
3687 */
3688 switch (kr) {
3689 case VM_FAULT_MEMORY_SHORTAGE:
3690 if (vm_page_wait((change_wiring) ?
3691 THREAD_UNINT :
3692 THREAD_ABORTSAFE))
3693 goto RetryFault;
3694 /*
3695 * fall thru
3696 */
3697 case VM_FAULT_INTERRUPTED:
3698 kr = KERN_ABORTED;
3699 goto done;
3700 case VM_FAULT_RETRY:
3701 goto RetryFault;
3702 case VM_FAULT_MEMORY_ERROR:
3703 if (error_code)
3704 kr = error_code;
3705 else
3706 kr = KERN_MEMORY_ERROR;
3707 goto done;
3708 default:
3709 panic("vm_fault: unexpected error 0x%x from "
3710 "vm_fault_page()\n", kr);
3711 }
3712 }
3713 m = result_page;
3714
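/*
 * when merely unwiring (change_wiring && !wired) vm_fault_page returns
 * no placeholder page; otherwise top_page is non-NULL exactly when the
 * result page came from an object further down the shadow chain
 */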
3715 if (m != VM_PAGE_NULL) {
3716 assert((change_wiring && !wired) ?
3717 (top_page == VM_PAGE_NULL) :
3718 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3719 }
3720
3721 /*
3722 * What to do with the resulting page from vm_fault_page
3723 * if it doesn't get entered into the physical map:
3724 */
3725 #define RELEASE_PAGE(m) \
3726 MACRO_BEGIN \
3727 PAGE_WAKEUP_DONE(m); \
3728 if (!m->active && !m->inactive && !m->throttled) { \
3729 vm_page_lockspin_queues(); \
3730 if (!m->active && !m->inactive && !m->throttled) \
3731 vm_page_activate(m); \
3732 vm_page_unlock_queues(); \
3733 } \
3734 MACRO_END
3735
3736 /*
3737 * We must verify that the maps have not changed
3738 * since our last lookup.
3739 */
3740 if (m != VM_PAGE_NULL) {
3741 old_copy_object = m->object->copy;
3742 vm_object_unlock(m->object);
3743 } else {
3744 old_copy_object = VM_OBJECT_NULL;
3745 vm_object_unlock(object);
3746 }
3747
3748 /*
3749 * no object locks are held at this point
3750 */
3751 if ((map != original_map) || !vm_map_verify(map, &version)) {
3752 vm_object_t retry_object;
3753 vm_object_offset_t retry_offset;
3754 vm_prot_t retry_prot;
3755
3756 /*
3757 * To avoid trying to write_lock the map while another
3758 * thread has it read_locked (in vm_map_pageable), we
3759 * do not try for write permission. If the page is
3760 * still writable, we will get write permission. If it
3761 * is not, or has been marked needs_copy, we enter the
3762 * mapping without write permission, and will merely
3763 * take another fault.
3764 */
3765 map = original_map;
3766 vm_map_lock_read(map);
3767
3768 kr = vm_map_lookup_locked(&map, vaddr,
3769 fault_type & ~VM_PROT_WRITE,
3770 OBJECT_LOCK_EXCLUSIVE, &version,
3771 &retry_object, &retry_offset, &retry_prot,
3772 &wired,
3773 &fault_info,
3774 &real_map);
3775 pmap = real_map->pmap;
3776
3777 if (kr != KERN_SUCCESS) {
3778 vm_map_unlock_read(map);
3779
3780 if (m != VM_PAGE_NULL) {
3781 /*
3782 * retake the lock so that
3783 * we can drop the paging reference
3784 * in vm_fault_cleanup and do the
3785 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3786 */
3787 vm_object_lock(m->object);
3788
3789 RELEASE_PAGE(m);
3790
3791 vm_fault_cleanup(m->object, top_page);
3792 } else {
3793 /*
3794 * retake the lock so that
3795 * we can drop the paging reference
3796 * in vm_fault_cleanup
3797 */
3798 vm_object_lock(object);
3799
3800 vm_fault_cleanup(object, top_page);
3801 }
3802 vm_object_deallocate(object);
3803
3804 goto done;
3805 }
3806 vm_object_unlock(retry_object);
3807
3808 if ((retry_object != object) || (retry_offset != offset)) {
3809
3810 vm_map_unlock_read(map);
3811 if (real_map != map)
3812 vm_map_unlock(real_map);
3813
3814 if (m != VM_PAGE_NULL) {
3815 /*
3816 * retake the lock so that
3817 * we can drop the paging reference
3818 * in vm_fault_cleanup and do the
3819 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3820 */
3821 vm_object_lock(m->object);
3822
3823 RELEASE_PAGE(m);
3824
3825 vm_fault_cleanup(m->object, top_page);
3826 } else {
3827 /*
3828 * retake the lock so that
3829 * we can drop the paging reference
3830 * in vm_fault_cleanup
3831 */
3832 vm_object_lock(object);
3833
3834 vm_fault_cleanup(object, top_page);
3835 }
3836 vm_object_deallocate(object);
3837
3838 goto RetryFault;
3839 }
3840 /*
3841 * Check whether the protection has changed or the object
3842 * has been copied while we left the map unlocked.
3843 */
3844 prot &= retry_prot;
3845 }
3846 if (m != VM_PAGE_NULL) {
3847 vm_object_lock(m->object);
3848
3849 if (m->object->copy != old_copy_object) {
3850 /*
3851 * The copy object changed while the top-level object
3852 * was unlocked, so take away write permission.
3853 */
3854 prot &= ~VM_PROT_WRITE;
3855 }
3856 } else
3857 vm_object_lock(object);
3858
3859 /*
3860 * If we want to wire down this page, but no longer have
3861 * adequate permissions, we must start all over.
3862 */
3863 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3864
3865 vm_map_verify_done(map, &version);
3866 if (real_map != map)
3867 vm_map_unlock(real_map);
3868
3869 if (m != VM_PAGE_NULL) {
3870 RELEASE_PAGE(m);
3871
3872 vm_fault_cleanup(m->object, top_page);
3873 } else
3874 vm_fault_cleanup(object, top_page);
3875
3876 vm_object_deallocate(object);
3877
3878 goto RetryFault;
3879 }
3880 if (m != VM_PAGE_NULL) {
3881 /*
3882 * Put this page into the physical map.
3883 * We had to do the unlock above because pmap_enter
3884 * may cause other faults. The page may be on
3885 * the pageout queues. If the pageout daemon comes
3886 * across the page, it will remove it from the queues.
3887 */
3888 if (caller_pmap) {
3889 kr = vm_fault_enter(m,
3890 caller_pmap,
3891 caller_pmap_addr,
3892 prot,
3893 fault_type,
3894 wired,
3895 change_wiring,
3896 fault_info.no_cache,
3897 fault_info.cs_bypass,
3898 NULL,
3899 &type_of_fault);
3900 } else {
3901 kr = vm_fault_enter(m,
3902 pmap,
3903 vaddr,
3904 prot,
3905 fault_type,
3906 wired,
3907 change_wiring,
3908 fault_info.no_cache,
3909 fault_info.cs_bypass,
3910 NULL,
3911 &type_of_fault);
3912 }
3913 if (kr != KERN_SUCCESS) {
3914 /* abort this page fault */
3915 vm_map_verify_done(map, &version);
3916 if (real_map != map)
3917 vm_map_unlock(real_map);
3918 PAGE_WAKEUP_DONE(m);
3919 vm_fault_cleanup(m->object, top_page);
3920 vm_object_deallocate(object);
3921 goto done;
3922 }
3923 } else {
3924
3925 vm_map_entry_t entry;
3926 vm_map_offset_t laddr;
3927 vm_map_offset_t ldelta, hdelta;
3928
3929 /*
3930 * do a pmap block mapping from the physical address
3931 * in the object
3932 */
3933
3934 #ifdef ppc
3935 /* While we do not worry about execution protection in */
3936 /* general, certain pages may have instruction execution */
3937 /* disallowed. We will check here, and if not allowed */
3938 /* to execute, we return with a protection failure. */
3939
3940 if ((fault_type & VM_PROT_EXECUTE) &&
3941 (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
3942
3943 vm_map_verify_done(map, &version);
3944
3945 if (real_map != map)
3946 vm_map_unlock(real_map);
3947
3948 vm_fault_cleanup(object, top_page);
3949 vm_object_deallocate(object);
3950
3951 kr = KERN_PROTECTION_FAILURE;
3952 goto done;
3953 }
3954 #endif /* ppc */
3955
3956 if (real_map != map)
3957 vm_map_unlock(real_map);
3958
3959 if (original_map != map) {
3960 vm_map_unlock_read(map);
3961 vm_map_lock_read(original_map);
3962 map = original_map;
3963 }
3964 real_map = map;
3965
3966 laddr = vaddr;
3967 hdelta = 0xFFFFF000;
3968 ldelta = 0xFFFFF000;
3969
3970 while (vm_map_lookup_entry(map, laddr, &entry)) {
3971 if (ldelta > (laddr - entry->vme_start))
3972 ldelta = laddr - entry->vme_start;
3973 if (hdelta > (entry->vme_end - laddr))
3974 hdelta = entry->vme_end - laddr;
3975 if (entry->is_sub_map) {
3976
3977 laddr = (laddr - entry->vme_start)
3978 + entry->offset;
3979 vm_map_lock_read(entry->object.sub_map);
3980
3981 if (map != real_map)
3982 vm_map_unlock_read(map);
3983 if (entry->use_pmap) {
3984 vm_map_unlock_read(real_map);
3985 real_map = entry->object.sub_map;
3986 }
3987 map = entry->object.sub_map;
3988
3989 } else {
3990 break;
3991 }
3992 }
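 /*
  * Worked example (illustrative numbers only): for a plain, non-submap
  * entry spanning [0x10000000, 0x11000000) and vaddr == 0x10400000, the
  * loop above leaves ldelta == 0x400000 and hdelta == 0xc00000, so the
  * block mapping set up below covers [vaddr - ldelta, vaddr + hdelta),
  * i.e. the whole entry; the 0xFFFFF000 initial values simply cap the
  * span when the entry is larger than that.
  */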
3993
3994 if (vm_map_lookup_entry(map, laddr, &entry) &&
3995 (entry->object.vm_object != NULL) &&
3996 (entry->object.vm_object == object)) {
3997
3998 int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
3999 if (caller_pmap) {
4000 /*
4001 * Set up a block mapped area
4002 */
4003 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4004 pmap_map_block(caller_pmap,
4005 (addr64_t)(caller_pmap_addr - ldelta),
4006 (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4007 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4008 (uint32_t)((ldelta + hdelta) >> 12), prot,
4009 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4010 } else {
4011 /*
4012 * Set up a block mapped area
4013 */
4014 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4015 pmap_map_block(real_map->pmap,
4016 (addr64_t)(vaddr - ldelta),
4017 (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4018 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4019 (uint32_t)((ldelta + hdelta) >> 12), prot,
4020 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4021 }
4022 }
4023 }
4024
4025 /*
4026 * Unlock everything, and return
4027 */
4028 vm_map_verify_done(map, &version);
4029 if (real_map != map)
4030 vm_map_unlock(real_map);
4031
4032 if (m != VM_PAGE_NULL) {
4033 PAGE_WAKEUP_DONE(m);
4034
4035 vm_fault_cleanup(m->object, top_page);
4036 } else
4037 vm_fault_cleanup(object, top_page);
4038
4039 vm_object_deallocate(object);
4040
4041 #undef RELEASE_PAGE
4042
4043 kr = KERN_SUCCESS;
4044 done:
4045 thread_interrupt_level(interruptible_state);
4046
4047 throttle_lowpri_io(TRUE);
4048
4049 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4050 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4051 (int)((uint64_t)vaddr >> 32),
4052 (int)vaddr,
4053 kr,
4054 type_of_fault,
4055 0);
4056
4057 return (kr);
4058 }
4059
4060 /*
4061 * vm_fault_wire:
4062 *
4063 * Wire down a range of virtual addresses in a map.
4064 */
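 /*
  * Caller sketch (illustrative, not lifted from an actual call site):
  * a wiring path that has already marked the entry "in_transition" and
  * chosen the pmap to wire into would do something like
  *
  *	rc = vm_fault_wire(map, entry, map->pmap, entry->vme_start);
  *	if (rc != KERN_SUCCESS)
  *		... undo the wiring bookkeeping for this entry ...
  *
  * Using the map's own pmap and the entry's start address is only an
  * assumption for the sketch; callers may pass a different
  * pmap/pmap_addr pair (e.g. when submaps are involved).
  */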
4065 kern_return_t
4066 vm_fault_wire(
4067 vm_map_t map,
4068 vm_map_entry_t entry,
4069 pmap_t pmap,
4070 vm_map_offset_t pmap_addr)
4071 {
4072
4073 register vm_map_offset_t va;
4074 register vm_map_offset_t end_addr = entry->vme_end;
4075 register kern_return_t rc;
4076
4077 assert(entry->in_transition);
4078
4079 if ((entry->object.vm_object != NULL) &&
4080 !entry->is_sub_map &&
4081 entry->object.vm_object->phys_contiguous) {
4082 return KERN_SUCCESS;
4083 }
4084
4085 /*
4086 * Inform the physical mapping system that the
4087 * range of addresses may not fault, so that
4088 * page tables and such can be locked down as well.
4089 */
4090
4091 pmap_pageable(pmap, pmap_addr,
4092 pmap_addr + (end_addr - entry->vme_start), FALSE);
4093
4094 /*
4095 * We simulate a fault to get the page and enter it
4096 * in the physical map.
4097 */
4098
4099 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4100 if ((rc = vm_fault_wire_fast(
4101 map, va, entry, pmap,
4102 pmap_addr + (va - entry->vme_start)
4103 )) != KERN_SUCCESS) {
4104 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
4105 (pmap == kernel_pmap) ?
4106 THREAD_UNINT : THREAD_ABORTSAFE,
4107 pmap, pmap_addr + (va - entry->vme_start));
4108 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4109 }
4110
4111 if (rc != KERN_SUCCESS) {
4112 struct vm_map_entry tmp_entry = *entry;
4113
4114 /* unwire wired pages */
4115 tmp_entry.vme_end = va;
4116 vm_fault_unwire(map,
4117 &tmp_entry, FALSE, pmap, pmap_addr);
4118
4119 return rc;
4120 }
4121 }
4122 return KERN_SUCCESS;
4123 }
4124
4125 /*
4126 * vm_fault_unwire:
4127 *
4128 * Unwire a range of virtual addresses in a map.
4129 */
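 /*
  * The "deallocate" flag selects between merely unwiring the pages
  * (deallocate == FALSE) and disconnecting and freeing them outright
  * (deallocate == TRUE), as can be seen in the loop below.  A caller
  * undoing a plain wire would typically look like (sketch only):
  *
  *	vm_fault_unwire(map, entry, FALSE, map->pmap, entry->vme_start);
  */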
4130 void
4131 vm_fault_unwire(
4132 vm_map_t map,
4133 vm_map_entry_t entry,
4134 boolean_t deallocate,
4135 pmap_t pmap,
4136 vm_map_offset_t pmap_addr)
4137 {
4138 register vm_map_offset_t va;
4139 register vm_map_offset_t end_addr = entry->vme_end;
4140 vm_object_t object;
4141 struct vm_object_fault_info fault_info;
4142
4143 object = (entry->is_sub_map)
4144 ? VM_OBJECT_NULL : entry->object.vm_object;
4145
4146 /*
4147 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4148 * do anything since such memory is wired by default. So we don't have
4149 * anything to undo here.
4150 */
4151
4152 if (object != VM_OBJECT_NULL && object->phys_contiguous)
4153 return;
4154
4155 fault_info.interruptible = THREAD_UNINT;
4156 fault_info.behavior = entry->behavior;
4157 fault_info.user_tag = entry->alias;
4158 fault_info.lo_offset = entry->offset;
4159 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4160 fault_info.no_cache = entry->no_cache;
4161 fault_info.stealth = TRUE;
4162 fault_info.io_sync = FALSE;
4163 fault_info.cs_bypass = FALSE;
4164 fault_info.mark_zf_absent = FALSE;
4165 fault_info.batch_pmap_op = FALSE;
4166
4167 /*
4168 * Since the pages are wired down, we must be able to
4169 * get their mappings from the physical map system.
4170 */
4171
4172 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4173
4174 if (object == VM_OBJECT_NULL) {
4175 if (pmap) {
4176 pmap_change_wiring(pmap,
4177 pmap_addr + (va - entry->vme_start), FALSE);
4178 }
4179 (void) vm_fault(map, va, VM_PROT_NONE,
4180 TRUE, THREAD_UNINT, pmap, pmap_addr);
4181 } else {
4182 vm_prot_t prot;
4183 vm_page_t result_page;
4184 vm_page_t top_page;
4185 vm_object_t result_object;
4186 vm_fault_return_t result;
4187
4188 if (end_addr - va > (vm_size_t) -1) {
4189 /* 32-bit overflow */
4190 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4191 } else {
4192 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4193 assert(fault_info.cluster_size == end_addr - va);
4194 }
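 /*
  * Note: (vm_size_t)(0 - PAGE_SIZE) relies on unsigned wrap-around and
  * yields the largest page-aligned value a vm_size_t can hold (e.g.
  * 0xFFFFF000 when vm_size_t is 32 bits), i.e. "as large a cluster as
  * can be expressed" when the remaining range does not fit.
  */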
4195
4196 do {
4197 prot = VM_PROT_NONE;
4198
4199 vm_object_lock(object);
4200 vm_object_paging_begin(object);
4201 XPR(XPR_VM_FAULT,
4202 "vm_fault_unwire -> vm_fault_page\n",
4203 0,0,0,0,0);
4204 result = vm_fault_page(
4205 object,
4206 entry->offset + (va - entry->vme_start),
4207 VM_PROT_NONE, TRUE,
4208 &prot, &result_page, &top_page,
4209 (int *)0,
4210 NULL, map->no_zero_fill,
4211 FALSE, &fault_info);
4212 } while (result == VM_FAULT_RETRY);
4213
4214 /*
4215 * If this was a mapping to a file on a device that has been forcibly
4216 * unmounted, then we won't get a page back from vm_fault_page(). Just
4217 * move on to the next one in case the remaining pages are mapped from
4218 * different objects. During a forced unmount, the object is terminated
4219 * so the alive flag will be false if this happens. A forced unmount will
4220 * occur when an external disk is unplugged before the user does an
4221 * eject, so we don't want to panic in that situation.
4222 */
4223
4224 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4225 continue;
4226
4227 if (result != VM_FAULT_SUCCESS)
4228 panic("vm_fault_unwire: failure");
4229
4230 result_object = result_page->object;
4231
4232 if (deallocate) {
4233 assert(result_page->phys_page !=
4234 vm_page_fictitious_addr);
4235 pmap_disconnect(result_page->phys_page);
4236 VM_PAGE_FREE(result_page);
4237 } else {
4238 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4239 pmap_change_wiring(pmap,
4240 pmap_addr + (va - entry->vme_start), FALSE);
4241
4242
4243 if (VM_PAGE_WIRED(result_page)) {
4244 vm_page_lockspin_queues();
4245 vm_page_unwire(result_page, TRUE);
4246 vm_page_unlock_queues();
4247 }
4248 if(entry->zero_wired_pages) {
4249 pmap_zero_page(result_page->phys_page);
4250 entry->zero_wired_pages = FALSE;
4251 }
4252
4253 PAGE_WAKEUP_DONE(result_page);
4254 }
4255 vm_fault_cleanup(result_object, top_page);
4256 }
4257 }
4258
4259 /*
4260 * Inform the physical mapping system that the range
4261 * of addresses may fault, so that page tables and
4262 * such may be unwired themselves.
4263 */
4264
4265 pmap_pageable(pmap, pmap_addr,
4266 pmap_addr + (end_addr - entry->vme_start), TRUE);
4267
4268 }
4269
4270 /*
4271 * vm_fault_wire_fast:
4272 *
4273 * Handle the common case of a wire-down page fault at the given address.
4274 * If successful, the page is inserted into the associated physical map.
4275 * The map entry is passed in to avoid the overhead of a map lookup.
4276 *
4277 * NOTE: the given address should be truncated to the
4278 * proper page address.
4279 *
4280 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
4281 * a standard error specifying why the fault is fatal is returned.
4282 *
4283 * The map in question must be referenced, and remains so.
4284 * Caller has a read lock on the map.
4285 *
4286 * This is a stripped version of vm_fault() for wiring pages. Anything
4287 * other than the common case will return KERN_FAILURE, and the caller
4288 * is expected to call vm_fault().
4289 */
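 /*
  * The expected calling pattern is the one used by vm_fault_wire()
  * above, sketched here:
  *
  *	if ((rc = vm_fault_wire_fast(map, va, entry, pmap, pmap_addr))
  *	    != KERN_SUCCESS)
  *		rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
  *			      interruptible, pmap, pmap_addr);
  *
  * i.e. KERN_FAILURE from the fast path is not an error, merely a
  * request to take the general path (the real loop above also picks
  * the interruptibility based on the pmap).
  */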
4290 kern_return_t
4291 vm_fault_wire_fast(
4292 __unused vm_map_t map,
4293 vm_map_offset_t va,
4294 vm_map_entry_t entry,
4295 pmap_t pmap,
4296 vm_map_offset_t pmap_addr)
4297 {
4298 vm_object_t object;
4299 vm_object_offset_t offset;
4300 register vm_page_t m;
4301 vm_prot_t prot;
4302 thread_t thread = current_thread();
4303 int type_of_fault;
4304 kern_return_t kr;
4305
4306 VM_STAT_INCR(faults);
4307
4308 if (thread != THREAD_NULL && thread->task != TASK_NULL)
4309 thread->task->faults++;
4310
4311 /*
4312 * Recovery actions
4313 */
4314
4315 #undef RELEASE_PAGE
4316 #define RELEASE_PAGE(m) { \
4317 PAGE_WAKEUP_DONE(m); \
4318 vm_page_lockspin_queues(); \
4319 vm_page_unwire(m, TRUE); \
4320 vm_page_unlock_queues(); \
4321 }
4322
4323
4324 #undef UNLOCK_THINGS
4325 #define UNLOCK_THINGS { \
4326 vm_object_paging_end(object); \
4327 vm_object_unlock(object); \
4328 }
4329
4330 #undef UNLOCK_AND_DEALLOCATE
4331 #define UNLOCK_AND_DEALLOCATE { \
4332 UNLOCK_THINGS; \
4333 vm_object_deallocate(object); \
4334 }
4335 /*
4336 * Give up and have caller do things the hard way.
4337 */
4338
4339 #define GIVE_UP { \
4340 UNLOCK_AND_DEALLOCATE; \
4341 return(KERN_FAILURE); \
4342 }
4343
4344
4345 /*
4346 * If this entry is not directly to a vm_object, bail out.
4347 */
4348 if (entry->is_sub_map)
4349 return(KERN_FAILURE);
4350
4351 /*
4352 * Find the backing store object and offset into it.
4353 */
4354
4355 object = entry->object.vm_object;
4356 offset = (va - entry->vme_start) + entry->offset;
4357 prot = entry->protection;
4358
4359 /*
4360 * Make a reference to this object to prevent its
4361 * disposal while we are messing with it.
4362 */
4363
4364 vm_object_lock(object);
4365 vm_object_reference_locked(object);
4366 vm_object_paging_begin(object);
4367
4368 /*
4369 * INVARIANTS (through entire routine):
4370 *
4371 * 1) At all times, we must either have the object
4372 * lock or a busy page in some object to prevent
4373 * some other thread from trying to bring in
4374 * the same page.
4375 *
4376 * 2) Once we have a busy page, we must remove it from
4377 * the pageout queues, so that the pageout daemon
4378 * will not grab it away.
4379 *
4380 */
4381
4382 /*
4383 * Look for page in top-level object. If it's not there or
4384 * there's something going on, give up.
4385 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4386 * decrypt the page before wiring it down.
4387 */
4388 m = vm_page_lookup(object, offset);
4389 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4390 (m->unusual && ( m->error || m->restart || m->absent))) {
4391
4392 GIVE_UP;
4393 }
4394 ASSERT_PAGE_DECRYPTED(m);
4395
4396 if (m->fictitious &&
4397 m->phys_page == vm_page_guard_addr) {
4398 /*
4399 * Guard pages are fictitious pages and are never
4400 * entered into a pmap, so let's say it's been wired...
4401 */
4402 kr = KERN_SUCCESS;
4403 goto done;
4404 }
4405
4406 /*
4407 * Wire the page down now. All bail outs beyond this
4408 * point must unwire the page.
4409 */
4410
4411 vm_page_lockspin_queues();
4412 vm_page_wire(m);
4413 vm_page_unlock_queues();
4414
4415 /*
4416 * Mark page busy for other threads.
4417 */
4418 assert(!m->busy);
4419 m->busy = TRUE;
4420 assert(!m->absent);
4421
4422 /*
4423 * Give up if the page is being written and there's a copy object
4424 */
4425 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4426 RELEASE_PAGE(m);
4427 GIVE_UP;
4428 }
4429
4430 /*
4431 * Put this page into the physical map.
4432 */
4433 type_of_fault = DBG_CACHE_HIT_FAULT;
4434 kr = vm_fault_enter(m,
4435 pmap,
4436 pmap_addr,
4437 prot,
4438 prot,
4439 TRUE,
4440 FALSE,
4441 FALSE,
4442 FALSE,
4443 NULL,
4444 &type_of_fault);
4445
4446 done:
4447 /*
4448 * Unlock everything, and return
4449 */
4450
4451 PAGE_WAKEUP_DONE(m);
4452 UNLOCK_AND_DEALLOCATE;
4453
4454 return kr;
4455
4456 }
4457
4458 /*
4459 * Routine: vm_fault_copy_cleanup
4460 * Purpose:
4461 * Release a page used by vm_fault_copy.
4462 */
4463
4464 void
4465 vm_fault_copy_cleanup(
4466 vm_page_t page,
4467 vm_page_t top_page)
4468 {
4469 vm_object_t object = page->object;
4470
4471 vm_object_lock(object);
4472 PAGE_WAKEUP_DONE(page);
4473 if (!page->active && !page->inactive && !page->throttled) {
4474 vm_page_lockspin_queues();
4475 if (!page->active && !page->inactive && !page->throttled)
4476 vm_page_activate(page);
4477 vm_page_unlock_queues();
4478 }
4479 vm_fault_cleanup(object, top_page);
4480 }
4481
4482 void
4483 vm_fault_copy_dst_cleanup(
4484 vm_page_t page)
4485 {
4486 vm_object_t object;
4487
4488 if (page != VM_PAGE_NULL) {
4489 object = page->object;
4490 vm_object_lock(object);
4491 vm_page_lockspin_queues();
4492 vm_page_unwire(page, TRUE);
4493 vm_page_unlock_queues();
4494 vm_object_paging_end(object);
4495 vm_object_unlock(object);
4496 }
4497 }
4498
4499 /*
4500 * Routine: vm_fault_copy
4501 *
4502 * Purpose:
4503 * Copy pages from one virtual memory object to another --
4504 * neither the source nor destination pages need be resident.
4505 *
4506 * Before actually copying a page, the version associated with
4507 * the destination address map will be verified.
4508 *
4509 * In/out conditions:
4510 * The caller must hold a reference, but not a lock, to
4511 * each of the source and destination objects and to the
4512 * destination map.
4513 *
4514 * Results:
4515 * Returns KERN_SUCCESS if no errors were encountered in
4516 * reading or writing the data. Returns KERN_INTERRUPTED if
4517 * the operation was interrupted (only possible if the
4518 * "interruptible" argument is asserted). Other return values
4519 * indicate a permanent error in copying the data.
4520 *
4521 * The actual amount of data copied will be returned in the
4522 * "copy_size" argument. In the event that the destination map
4523 * verification failed, this amount may be less than the amount
4524 * requested.
4525 */
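 /*
  * Caller sketch (illustrative only): a typical user performs a
  * vm_map_lookup() on dst_map to obtain "version" and the destination
  * object/offset, drops the map locks, and then copies:
  *
  *	copy_size = amount_to_copy;
  *	kr = vm_fault_copy(src_object, src_offset, &copy_size,
  *			   dst_object, dst_offset, dst_map,
  *			   &version, THREAD_UNINT);
  *	// on return, copy_size holds how much was actually copied;
  *	// it can be short if the map version check failed.
  */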
4526 kern_return_t
4527 vm_fault_copy(
4528 vm_object_t src_object,
4529 vm_object_offset_t src_offset,
4530 vm_map_size_t *copy_size, /* INOUT */
4531 vm_object_t dst_object,
4532 vm_object_offset_t dst_offset,
4533 vm_map_t dst_map,
4534 vm_map_version_t *dst_version,
4535 int interruptible)
4536 {
4537 vm_page_t result_page;
4538
4539 vm_page_t src_page;
4540 vm_page_t src_top_page;
4541 vm_prot_t src_prot;
4542
4543 vm_page_t dst_page;
4544 vm_page_t dst_top_page;
4545 vm_prot_t dst_prot;
4546
4547 vm_map_size_t amount_left;
4548 vm_object_t old_copy_object;
4549 kern_return_t error = 0;
4550 vm_fault_return_t result;
4551
4552 vm_map_size_t part_size;
4553 struct vm_object_fault_info fault_info_src;
4554 struct vm_object_fault_info fault_info_dst;
4555
4556 /*
4557 * In order not to confuse the clustered pageins, align
4558 * the different offsets on a page boundary.
4559 */
4560
4561 #define RETURN(x) \
4562 MACRO_BEGIN \
4563 *copy_size -= amount_left; \
4564 MACRO_RETURN(x); \
4565 MACRO_END
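 /*
  * RETURN(x) reports progress as well as status: the INOUT "copy_size"
  * is reduced by whatever is still left uncopied before x is returned,
  * so on exit it holds the number of bytes actually copied.
  */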
4566
4567 amount_left = *copy_size;
4568
4569 fault_info_src.interruptible = interruptible;
4570 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4571 fault_info_src.user_tag = 0;
4572 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4573 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4574 fault_info_src.no_cache = FALSE;
4575 fault_info_src.stealth = TRUE;
4576 fault_info_src.io_sync = FALSE;
4577 fault_info_src.cs_bypass = FALSE;
4578 fault_info_src.mark_zf_absent = FALSE;
4579 fault_info_src.batch_pmap_op = FALSE;
4580
4581 fault_info_dst.interruptible = interruptible;
4582 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4583 fault_info_dst.user_tag = 0;
4584 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4585 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4586 fault_info_dst.no_cache = FALSE;
4587 fault_info_dst.stealth = TRUE;
4588 fault_info_dst.io_sync = FALSE;
4589 fault_info_dst.cs_bypass = FALSE;
4590 fault_info_dst.mark_zf_absent = FALSE;
4591 fault_info_dst.batch_pmap_op = FALSE;
4592
4593 do { /* while (amount_left > 0) */
4594 /*
4595 * There may be a deadlock if both source and destination
4596 * pages are the same. To avoid this deadlock, the copy must
4597 * start by getting the destination page in order to apply
4598 * COW semantics if any.
4599 */
4600
4601 RetryDestinationFault: ;
4602
4603 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4604
4605 vm_object_lock(dst_object);
4606 vm_object_paging_begin(dst_object);
4607
4608 if (amount_left > (vm_size_t) -1) {
4609 /* 32-bit overflow */
4610 fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4611 } else {
4612 fault_info_dst.cluster_size = (vm_size_t) amount_left;
4613 assert(fault_info_dst.cluster_size == amount_left);
4614 }
4615
4616 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
4617 result = vm_fault_page(dst_object,
4618 vm_object_trunc_page(dst_offset),
4619 VM_PROT_WRITE|VM_PROT_READ,
4620 FALSE,
4621 &dst_prot, &dst_page, &dst_top_page,
4622 (int *)0,
4623 &error,
4624 dst_map->no_zero_fill,
4625 FALSE, &fault_info_dst);
4626 switch (result) {
4627 case VM_FAULT_SUCCESS:
4628 break;
4629 case VM_FAULT_RETRY:
4630 goto RetryDestinationFault;
4631 case VM_FAULT_MEMORY_SHORTAGE:
4632 if (vm_page_wait(interruptible))
4633 goto RetryDestinationFault;
4634 /* fall thru */
4635 case VM_FAULT_INTERRUPTED:
4636 RETURN(MACH_SEND_INTERRUPTED);
4637 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4638 /* success but no VM page: fail the copy */
4639 vm_object_paging_end(dst_object);
4640 vm_object_unlock(dst_object);
4641 /*FALLTHROUGH*/
4642 case VM_FAULT_MEMORY_ERROR:
4643 if (error)
4644 return (error);
4645 else
4646 return(KERN_MEMORY_ERROR);
4647 default:
4648 panic("vm_fault_copy: unexpected error 0x%x from "
4649 "vm_fault_page()\n", result);
4650 }
4651 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4652
4653 old_copy_object = dst_page->object->copy;
4654
4655 /*
4656 * There exists the possibility that the source and
4657 * destination page are the same. But we can't
4658 * easily determine that now. If they are the
4659 * same, the call to vm_fault_page() for the
4660 * destination page will deadlock. To prevent this we
4661 * wire the page so we can drop busy without having
4662 * the page daemon steal the page. We clean up the
4663 * top page but keep the paging reference on the object
4664 * holding the dest page so it doesn't go away.
4665 */
4666
4667 vm_page_lockspin_queues();
4668 vm_page_wire(dst_page);
4669 vm_page_unlock_queues();
4670 PAGE_WAKEUP_DONE(dst_page);
4671 vm_object_unlock(dst_page->object);
4672
4673 if (dst_top_page != VM_PAGE_NULL) {
4674 vm_object_lock(dst_object);
4675 VM_PAGE_FREE(dst_top_page);
4676 vm_object_paging_end(dst_object);
4677 vm_object_unlock(dst_object);
4678 }
4679
4680 RetrySourceFault: ;
4681
4682 if (src_object == VM_OBJECT_NULL) {
4683 /*
4684 * No source object. We will just
4685 * zero-fill the page in dst_object.
4686 */
4687 src_page = VM_PAGE_NULL;
4688 result_page = VM_PAGE_NULL;
4689 } else {
4690 vm_object_lock(src_object);
4691 src_page = vm_page_lookup(src_object,
4692 vm_object_trunc_page(src_offset));
4693 if (src_page == dst_page) {
4694 src_prot = dst_prot;
4695 result_page = VM_PAGE_NULL;
4696 } else {
4697 src_prot = VM_PROT_READ;
4698 vm_object_paging_begin(src_object);
4699
4700 if (amount_left > (vm_size_t) -1) {
4701 /* 32-bit overflow */
4702 fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4703 } else {
4704 fault_info_src.cluster_size = (vm_size_t) amount_left;
4705 assert(fault_info_src.cluster_size == amount_left);
4706 }
4707
4708 XPR(XPR_VM_FAULT,
4709 "vm_fault_copy(2) -> vm_fault_page\n",
4710 0,0,0,0,0);
4711 result = vm_fault_page(
4712 src_object,
4713 vm_object_trunc_page(src_offset),
4714 VM_PROT_READ, FALSE,
4715 &src_prot,
4716 &result_page, &src_top_page,
4717 (int *)0, &error, FALSE,
4718 FALSE, &fault_info_src);
4719
4720 switch (result) {
4721 case VM_FAULT_SUCCESS:
4722 break;
4723 case VM_FAULT_RETRY:
4724 goto RetrySourceFault;
4725 case VM_FAULT_MEMORY_SHORTAGE:
4726 if (vm_page_wait(interruptible))
4727 goto RetrySourceFault;
4728 /* fall thru */
4729 case VM_FAULT_INTERRUPTED:
4730 vm_fault_copy_dst_cleanup(dst_page);
4731 RETURN(MACH_SEND_INTERRUPTED);
4732 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4733 /* success but no VM page: fail */
4734 vm_object_paging_end(src_object);
4735 vm_object_unlock(src_object);
4736 /*FALLTHROUGH*/
4737 case VM_FAULT_MEMORY_ERROR:
4738 vm_fault_copy_dst_cleanup(dst_page);
4739 if (error)
4740 return (error);
4741 else
4742 return(KERN_MEMORY_ERROR);
4743 default:
4744 panic("vm_fault_copy(2): unexpected "
4745 "error 0x%x from "
4746 "vm_fault_page()\n", result);
4747 }
4748
4749
4750 assert((src_top_page == VM_PAGE_NULL) ==
4751 (result_page->object == src_object));
4752 }
4753 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4754 vm_object_unlock(result_page->object);
4755 }
4756
4757 if (!vm_map_verify(dst_map, dst_version)) {
4758 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4759 vm_fault_copy_cleanup(result_page, src_top_page);
4760 vm_fault_copy_dst_cleanup(dst_page);
4761 break;
4762 }
4763
4764 vm_object_lock(dst_page->object);
4765
4766 if (dst_page->object->copy != old_copy_object) {
4767 vm_object_unlock(dst_page->object);
4768 vm_map_verify_done(dst_map, dst_version);
4769 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4770 vm_fault_copy_cleanup(result_page, src_top_page);
4771 vm_fault_copy_dst_cleanup(dst_page);
4772 break;
4773 }
4774 vm_object_unlock(dst_page->object);
4775
4776 /*
4777 * Copy the page, and note that it is dirty
4778 * immediately.
4779 */
4780
4781 if (!page_aligned(src_offset) ||
4782 !page_aligned(dst_offset) ||
4783 !page_aligned(amount_left)) {
4784
4785 vm_object_offset_t src_po,
4786 dst_po;
4787
4788 src_po = src_offset - vm_object_trunc_page(src_offset);
4789 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4790
4791 if (dst_po > src_po) {
4792 part_size = PAGE_SIZE - dst_po;
4793 } else {
4794 part_size = PAGE_SIZE - src_po;
4795 }
4796 if (part_size > (amount_left)){
4797 part_size = amount_left;
4798 }
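 /*
  * Worked example (4K pages, illustrative offsets): src_offset ending
  * in 0x200 and dst_offset ending in 0x800 give src_po == 0x200 and
  * dst_po == 0x800; since dst_po > src_po, part_size starts out as
  * PAGE_SIZE - dst_po == 0x800 bytes, and is then clipped to
  * amount_left if less than that remains to be copied.
  */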
4799
4800 if (result_page == VM_PAGE_NULL) {
4801 assert((vm_offset_t) dst_po == dst_po);
4802 assert((vm_size_t) part_size == part_size);
4803 vm_page_part_zero_fill(dst_page,
4804 (vm_offset_t) dst_po,
4805 (vm_size_t) part_size);
4806 } else {
4807 assert((vm_offset_t) src_po == src_po);
4808 assert((vm_offset_t) dst_po == dst_po);
4809 assert((vm_size_t) part_size == part_size);
4810 vm_page_part_copy(result_page,
4811 (vm_offset_t) src_po,
4812 dst_page,
4813 (vm_offset_t) dst_po,
4814 (vm_size_t)part_size);
4815 if(!dst_page->dirty){
4816 vm_object_lock(dst_object);
4817 SET_PAGE_DIRTY(dst_page, TRUE);
4818 vm_object_unlock(dst_page->object);
4819 }
4820
4821 }
4822 } else {
4823 part_size = PAGE_SIZE;
4824
4825 if (result_page == VM_PAGE_NULL)
4826 vm_page_zero_fill(dst_page);
4827 else{
4828 vm_object_lock(result_page->object);
4829 vm_page_copy(result_page, dst_page);
4830 vm_object_unlock(result_page->object);
4831
4832 if(!dst_page->dirty){
4833 vm_object_lock(dst_object);
4834 SET_PAGE_DIRTY(dst_page, TRUE);
4835 vm_object_unlock(dst_page->object);
4836 }
4837 }
4838
4839 }
4840
4841 /*
4842 * Unlock everything, and return
4843 */
4844
4845 vm_map_verify_done(dst_map, dst_version);
4846
4847 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4848 vm_fault_copy_cleanup(result_page, src_top_page);
4849 vm_fault_copy_dst_cleanup(dst_page);
4850
4851 amount_left -= part_size;
4852 src_offset += part_size;
4853 dst_offset += part_size;
4854 } while (amount_left > 0);
4855
4856 RETURN(KERN_SUCCESS);
4857 #undef RETURN
4858
4859 /*NOTREACHED*/
4860 }
4861
4862 #if VM_FAULT_CLASSIFY
4863 /*
4864 * Temporary statistics gathering support.
4865 */
4866
4867 /*
4868 * Statistics arrays:
4869 */
4870 #define VM_FAULT_TYPES_MAX 5
4871 #define VM_FAULT_LEVEL_MAX 8
4872
4873 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4874
4875 #define VM_FAULT_TYPE_ZERO_FILL 0
4876 #define VM_FAULT_TYPE_MAP_IN 1
4877 #define VM_FAULT_TYPE_PAGER 2
4878 #define VM_FAULT_TYPE_COPY 3
4879 #define VM_FAULT_TYPE_OTHER 4
4880
4881
4882 void
4883 vm_fault_classify(vm_object_t object,
4884 vm_object_offset_t offset,
4885 vm_prot_t fault_type)
4886 {
4887 int type, level = 0;
4888 vm_page_t m;
4889
4890 while (TRUE) {
4891 m = vm_page_lookup(object, offset);
4892 if (m != VM_PAGE_NULL) {
4893 if (m->busy || m->error || m->restart || m->absent) {
4894 type = VM_FAULT_TYPE_OTHER;
4895 break;
4896 }
4897 if (((fault_type & VM_PROT_WRITE) == 0) ||
4898 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4899 type = VM_FAULT_TYPE_MAP_IN;
4900 break;
4901 }
4902 type = VM_FAULT_TYPE_COPY;
4903 break;
4904 }
4905 else {
4906 if (object->pager_created) {
4907 type = VM_FAULT_TYPE_PAGER;
4908 break;
4909 }
4910 if (object->shadow == VM_OBJECT_NULL) {
4911 type = VM_FAULT_TYPE_ZERO_FILL;
4912 break;
4913 }
4914
4915 offset += object->vo_shadow_offset;
4916 object = object->shadow;
4917 level++;
4918 continue;
4919 }
4920 }
4921
4922 if (level > VM_FAULT_LEVEL_MAX)
4923 level = VM_FAULT_LEVEL_MAX;
4924
4925 vm_fault_stats[type][level] += 1;
4926
4927 return;
4928 }
4929
4930 /* cleanup routine to call from debugger */
4931
4932 void
4933 vm_fault_classify_init(void)
4934 {
4935 int type, level;
4936
4937 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4938 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4939 vm_fault_stats[type][level] = 0;
4940 }
4941 }
4942
4943 return;
4944 }
4945 #endif /* VM_FAULT_CLASSIFY */
4946
4947
4948 extern int cs_validation;
4949
4950 void
4951 vm_page_validate_cs_mapped(
4952 vm_page_t page,
4953 const void *kaddr)
4954 {
4955 vm_object_t object;
4956 vm_object_offset_t offset;
4957 kern_return_t kr;
4958 memory_object_t pager;
4959 void *blobs;
4960 boolean_t validated, tainted;
4961
4962 assert(page->busy);
4963 vm_object_lock_assert_exclusive(page->object);
4964
4965 if (!cs_validation) {
4966 return;
4967 }
4968
4969 if (page->wpmapped && !page->cs_tainted) {
4970 /*
4971 * This page was mapped for "write" access sometime in the
4972 * past and could still be modifiable in the future.
4973 * Consider it tainted.
4974 * [ If the page was already found to be "tainted", no
4975 * need to re-validate. ]
4976 */
4977 page->cs_validated = TRUE;
4978 page->cs_tainted = TRUE;
4979 if (cs_debug) {
4980 printf("CODESIGNING: vm_page_validate_cs: "
4981 "page %p obj %p off 0x%llx "
4982 "was modified\n",
4983 page, page->object, page->offset);
4984 }
4985 vm_cs_validated_dirtied++;
4986 }
4987
4988 if (page->cs_validated) {
4989 return;
4990 }
4991
4992 vm_cs_validates++;
4993
4994 object = page->object;
4995 assert(object->code_signed);
4996 offset = page->offset;
4997
4998 if (!object->alive || object->terminating || object->pager == NULL) {
4999 /*
5000 * The object is terminating and we don't have its pager
5001 * so we can't validate the data...
5002 */
5003 return;
5004 }
5005 /*
5006 * Since we get here to validate a page that was brought in by
5007 * the pager, we know that this pager is all set up and ready
5008 * by now.
5009 */
5010 assert(!object->internal);
5011 assert(object->pager != NULL);
5012 assert(object->pager_ready);
5013
5014 pager = object->pager;
5015 assert(object->paging_in_progress);
5016 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5017 if (kr != KERN_SUCCESS) {
5018 blobs = NULL;
5019 }
5020
5021 /* verify the SHA1 hash for this page */
5022 validated = cs_validate_page(blobs,
5023 pager,
5024 offset + object->paging_offset,
5025 (const void *)kaddr,
5026 &tainted);
5027
5028 page->cs_validated = validated;
5029 if (validated) {
5030 page->cs_tainted = tainted;
5031 }
5032 }
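 /*
  * Caller sketch (assumed, not copied from a real call site): code that
  * already holds the object lock exclusively, owns the busy bit and has
  * the page mapped at "kaddr" in the kernel map could validate it with
  *
  *	vm_page_validate_cs_mapped(page, (const void *)kaddr);
  *	if (page->cs_validated && !page->cs_tainted)
  *		... treat the page as matching its code signature ...
  *
  * Callers without such a mapping should use vm_page_validate_cs()
  * below, which sets one up temporarily.
  */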
5033
5034 void
5035 vm_page_validate_cs(
5036 vm_page_t page)
5037 {
5038 vm_object_t object;
5039 vm_object_offset_t offset;
5040 vm_map_offset_t koffset;
5041 vm_map_size_t ksize;
5042 vm_offset_t kaddr;
5043 kern_return_t kr;
5044 boolean_t busy_page;
5045
5046 vm_object_lock_assert_held(page->object);
5047
5048 if (!cs_validation) {
5049 return;
5050 }
5051
5052 if (page->wpmapped && !page->cs_tainted) {
5053 vm_object_lock_assert_exclusive(page->object);
5054
5055 /*
5056 * This page was mapped for "write" access sometime in the
5057 * past and could still be modifiable in the future.
5058 * Consider it tainted.
5059 * [ If the page was already found to be "tainted", no
5060 * need to re-validate. ]
5061 */
5062 page->cs_validated = TRUE;
5063 page->cs_tainted = TRUE;
5064 if (cs_debug) {
5065 printf("CODESIGNING: vm_page_validate_cs: "
5066 "page %p obj %p off 0x%llx "
5067 "was modified\n",
5068 page, page->object, page->offset);
5069 }
5070 vm_cs_validated_dirtied++;
5071 }
5072
5073 if (page->cs_validated) {
5074 return;
5075 }
5076
5077 #if CHECK_CS_VALIDATION_BITMAP
5078 if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5079 page->cs_validated = TRUE;
5080 page->cs_tainted = FALSE;
5081 vm_cs_bitmap_validated++;
5082 return;
5083 }
5084 #endif
5085 vm_object_lock_assert_exclusive(page->object);
5086
5087 object = page->object;
5088 assert(object->code_signed);
5089 offset = page->offset;
5090
5091 busy_page = page->busy;
5092 if (!busy_page) {
5093 /* keep page busy while we map (and unlock) the VM object */
5094 page->busy = TRUE;
5095 }
5096
5097 /*
5098 * Take a paging reference on the VM object
5099 * to protect it from collapse or bypass,
5100 * and keep it from disappearing too.
5101 */
5102 vm_object_paging_begin(object);
5103
5104 /* map the page in the kernel address space */
5105 koffset = 0;
5106 ksize = PAGE_SIZE_64;
5107 kr = vm_paging_map_object(&koffset,
5108 page,
5109 object,
5110 offset,
5111 &ksize,
5112 VM_PROT_READ,
5113 FALSE); /* can't unlock object ! */
5114 if (kr != KERN_SUCCESS) {
5115 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5116 }
5117 kaddr = CAST_DOWN(vm_offset_t, koffset);
5118
5119 /* validate the mapped page */
5120 vm_page_validate_cs_mapped(page, (const void *) kaddr);
5121
5122 #if CHECK_CS_VALIDATION_BITMAP
5123 if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5124 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5125 }
5126 #endif
5127 assert(page->busy);
5128 assert(object == page->object);
5129 vm_object_lock_assert_exclusive(object);
5130
5131 if (!busy_page) {
5132 PAGE_WAKEUP_DONE(page);
5133 }
5134 if (koffset != 0) {
5135 /* unmap the map from the kernel address space */
5136 vm_paging_unmap_object(object, koffset, koffset + ksize);
5137 koffset = 0;
5138 ksize = 0;
5139 kaddr = 0;
5140 }
5141 vm_object_paging_end(object);
5142 }