1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <mach_kdb.h>
68 #include <libkern/OSAtomic.h>
69
70 #include <mach/mach_types.h>
71 #include <mach/kern_return.h>
72 #include <mach/message.h> /* for error codes */
73 #include <mach/vm_param.h>
74 #include <mach/vm_behavior.h>
75 #include <mach/memory_object.h>
76 /* For memory_object_data_{request,unlock} */
77 #include <mach/sdt.h>
78
79 #include <kern/kern_types.h>
80 #include <kern/host_statistics.h>
81 #include <kern/counters.h>
82 #include <kern/task.h>
83 #include <kern/thread.h>
84 #include <kern/sched_prim.h>
85 #include <kern/host.h>
86 #include <kern/xpr.h>
87 #include <kern/mach_param.h>
88 #include <kern/macro_help.h>
89 #include <kern/zalloc.h>
90 #include <kern/misc_protos.h>
91
92 #include <ppc/proc_reg.h>
93
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_kern.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h>
102 #include <vm/vm_external.h>
103 #include <vm/memory_object.h>
104 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105
106 #include <sys/kdebug.h>
107
108 #define VM_FAULT_CLASSIFY 0
109
110 /* Zero-filled pages are marked "m->zero_fill" and put on the
111 * special zero-fill inactive queue only if they belong to
112 * an object at least this big.
113 */
114 #define VM_ZF_OBJECT_SIZE_THRESHOLD (0x200000)
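/* (0x200000 bytes == 2MB) */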
115
116 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
118 int vm_object_pagein_throttle = 16;
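/*
 * vm_object_pagein_throttle caps the number of pageins that may be in
 * progress against a single external (non-internal) object; once
 * object->paging_in_progress exceeds it, additional faulting threads
 * block in vm_fault_page() until some of those pageins complete.
 */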
119
120 extern int cs_debug;
121
122 #if MACH_KDB
123 extern struct db_watchpoint *db_watchpoint_list;
124 #endif /* MACH_KDB */
125
126
127 /* Forward declarations of internal routines. */
128 extern kern_return_t vm_fault_wire_fast(
129 vm_map_t map,
130 vm_map_offset_t va,
131 vm_map_entry_t entry,
132 pmap_t pmap,
133 vm_map_offset_t pmap_addr);
134
135 extern void vm_fault_continue(void);
136
137 extern void vm_fault_copy_cleanup(
138 vm_page_t page,
139 vm_page_t top_page);
140
141 extern void vm_fault_copy_dst_cleanup(
142 vm_page_t page);
143
144 #if VM_FAULT_CLASSIFY
145 extern void vm_fault_classify(vm_object_t object,
146 vm_object_offset_t offset,
147 vm_prot_t fault_type);
148
149 extern void vm_fault_classify_init(void);
150 #endif
151
152
153 unsigned long vm_cs_validates = 0;
154 unsigned long vm_cs_revalidates = 0;
155 unsigned long vm_cs_query_modified = 0;
156 unsigned long vm_cs_validated_dirtied = 0;
157
158 /*
159 * Routine: vm_fault_init
160 * Purpose:
161 * Initialize our private data structures.
162 */
163 void
164 vm_fault_init(void)
165 {
166 }
167
168 /*
169 * Routine: vm_fault_cleanup
170 * Purpose:
171 * Clean up the result of vm_fault_page.
172 * Results:
173 * The paging reference for "object" is released.
174 * "object" is unlocked.
175 * If "top_page" is not null, "top_page" is
176 * freed and the paging reference for the object
177 * containing it is released.
178 *
179 * In/out conditions:
180 * "object" must be locked.
181 */
182 void
183 vm_fault_cleanup(
184 register vm_object_t object,
185 register vm_page_t top_page)
186 {
187 vm_object_paging_end(object);
188 vm_object_unlock(object);
189
190 if (top_page != VM_PAGE_NULL) {
191 object = top_page->object;
192
193 vm_object_lock(object);
194 VM_PAGE_FREE(top_page);
195 vm_object_paging_end(object);
196 vm_object_unlock(object);
197 }
198 }
199
200 #if MACH_CLUSTER_STATS
201 #define MAXCLUSTERPAGES 16
202 struct {
203 unsigned long pages_in_cluster;
204 unsigned long pages_at_higher_offsets;
205 unsigned long pages_at_lower_offsets;
206 } cluster_stats_in[MAXCLUSTERPAGES];
207 #define CLUSTER_STAT(clause) clause
208 #define CLUSTER_STAT_HIGHER(x) \
209 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
210 #define CLUSTER_STAT_LOWER(x) \
211 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
212 #define CLUSTER_STAT_CLUSTER(x) \
213 ((cluster_stats_in[(x)].pages_in_cluster)++)
214 #else /* MACH_CLUSTER_STATS */
215 #define CLUSTER_STAT(clause)
216 #endif /* MACH_CLUSTER_STATS */
217
218 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
219
220
221 boolean_t vm_page_deactivate_behind = TRUE;
222 /*
223 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
224 */
225 int vm_default_ahead = 0;
226 int vm_default_behind = MAX_UPL_TRANSFER;
227
228 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
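/*
 * MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024 bytes == 1GB) clamps the per-object
 * sequential-access counter maintained by vm_fault_is_sequential() below.
 */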
229
230 /*
231 * vm_fault_is_sequential
232 *
233 * Determine if sequential access is in progress
234 * in accordance with the behavior specified.
235 * Update state to indicate current access pattern.
236 *
237 * object must have at least the shared lock held
238 */
239 static
240 void
241 vm_fault_is_sequential(
242 vm_object_t object,
243 vm_object_offset_t offset,
244 vm_behavior_t behavior)
245 {
246 vm_object_offset_t last_alloc;
247 int sequential;
248 int orig_sequential;
249
250 last_alloc = object->last_alloc;
251 sequential = object->sequential;
252 orig_sequential = sequential;
253
254 switch (behavior) {
255 case VM_BEHAVIOR_RANDOM:
256 /*
257 * reset indicator of sequential behavior
258 */
259 sequential = 0;
260 break;
261
262 case VM_BEHAVIOR_SEQUENTIAL:
263 if (offset && last_alloc == offset - PAGE_SIZE_64) {
264 /*
265 * advance indicator of sequential behavior
266 */
267 if (sequential < MAX_SEQUENTIAL_RUN)
268 sequential += PAGE_SIZE;
269 } else {
270 /*
271 * reset indicator of sequential behavior
272 */
273 sequential = 0;
274 }
275 break;
276
277 case VM_BEHAVIOR_RSEQNTL:
278 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
279 /*
280 * advance indicator of sequential behavior
281 */
282 if (sequential > -MAX_SEQUENTIAL_RUN)
283 sequential -= PAGE_SIZE;
284 } else {
285 /*
286 * reset indicator of sequential behavior
287 */
288 sequential = 0;
289 }
290 break;
291
292 case VM_BEHAVIOR_DEFAULT:
293 default:
294 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
295 /*
296 * advance indicator of sequential behavior
297 */
298 if (sequential < 0)
299 sequential = 0;
300 if (sequential < MAX_SEQUENTIAL_RUN)
301 sequential += PAGE_SIZE;
302
303 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
304 /*
305 * advance indicator of sequential behavior
306 */
307 if (sequential > 0)
308 sequential = 0;
309 if (sequential > -MAX_SEQUENTIAL_RUN)
310 sequential -= PAGE_SIZE;
311 } else {
312 /*
313 * reset indicator of sequential behavior
314 */
315 sequential = 0;
316 }
317 break;
318 }
319 if (sequential != orig_sequential) {
320 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
321 /*
322 * if someone else has already updated object->sequential
323 * don't bother trying to update it or object->last_alloc
324 */
325 return;
326 }
327 }
328 /*
329 * I'd like to do this with an OSCompareAndSwap64, but that
330 * doesn't exist for PPC... however, it shouldn't matter
331 * that much... last_alloc is maintained so that we can determine
332 * if a sequential access pattern is taking place... if only
333 * one thread is banging on this object, no problem with the unprotected
334 * update... if 2 or more threads are banging away, we run the risk of
335 * someone seeing a mangled update... however, in the face of multiple
336 * accesses, no sequential access pattern can develop anyway, so we
337 * haven't lost any real info.
338 */
339 object->last_alloc = offset;
340 }
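/*
 * Illustrative summary: under VM_BEHAVIOR_DEFAULT, faults at ascending
 * page-adjacent offsets (o, o + PAGE_SIZE, o + 2 * PAGE_SIZE, ...) grow
 * object->sequential by PAGE_SIZE per fault up to MAX_SEQUENTIAL_RUN,
 * descending page-adjacent faults shrink it toward -MAX_SEQUENTIAL_RUN,
 * and any non-adjacent fault resets it to 0.
 */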
341
342
343 /*
344 * vm_fault_deactivate_behind
345 *
346 * Determine if sequential access is in progress
347 * in accordance with the behavior specified. If
348 * so, compute a potential page to deactivate and
349 * deactivate it.
350 *
351 * object must be locked.
352 *
353 * return TRUE if we actually deactivate a page
354 */
355 static
356 boolean_t
357 vm_fault_deactivate_behind(
358 vm_object_t object,
359 vm_object_offset_t offset,
360 vm_behavior_t behavior)
361 {
362 vm_page_t m = NULL;
363 int sequential_run;
364 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
365
366 #if TRACEFAULTPAGE
367 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
368 #endif
369
370 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
371 /*
372 * Do not deactivate pages from the kernel object: they
372 * are not intended to become pageable...
373 * or we've disabled the deactivate-behind mechanism.
375 */
376 return FALSE;
377 }
378 if ((sequential_run = object->sequential)) {
379 if (sequential_run < 0) {
380 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
381 sequential_run = 0 - sequential_run;
382 } else {
383 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
384 }
385 }
386 switch (behavior) {
387 case VM_BEHAVIOR_RANDOM:
388 break;
389 case VM_BEHAVIOR_SEQUENTIAL:
390 if (sequential_run >= (int)PAGE_SIZE)
391 m = vm_page_lookup(object, offset - PAGE_SIZE_64);
392 break;
393 case VM_BEHAVIOR_RSEQNTL:
394 if (sequential_run >= (int)PAGE_SIZE)
395 m = vm_page_lookup(object, offset + PAGE_SIZE_64);
396 break;
397 case VM_BEHAVIOR_DEFAULT:
398 default:
399 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
400
401 /*
402 * determine if the run of sequential access has been
403 * long enough on an object with default access behavior
404 * to consider it for deactivation
405 */
406 if ((uint64_t)sequential_run >= behind) {
407 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
408 if (offset >= behind)
409 m = vm_page_lookup(object, offset - behind);
410 } else {
411 if (offset < -behind)
412 m = vm_page_lookup(object, offset + behind);
413 }
414 }
415 break;
416 }
417 }
418 if (m) {
419 if (!m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
420 pmap_clear_reference(m->phys_page);
421 m->deactivated = TRUE;
422 #if TRACEFAULTPAGE
423 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
424 #endif
425 return TRUE;
426 }
427 }
428 return FALSE;
429 }
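/*
 * Illustrative summary: for explicit VM_BEHAVIOR_SEQUENTIAL / RSEQNTL the
 * page immediately behind (or ahead of) the faulting offset is targeted as
 * soon as a one-page run is recorded; for VM_BEHAVIOR_DEFAULT the run must
 * reach vm_default_behind pages before the page vm_default_behind pages
 * back (or ahead, for reverse runs) is looked up and, if present and
 * eligible, has its reference bit cleared and is marked m->deactivated.
 */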
430
431
432 /*
433 * check for various conditions that would
434 * prevent us from creating a ZF page...
435 * cleanup is based on being called from vm_fault_page
436 *
437 * object must be locked
438 * object == m->object
439 */
440 static vm_fault_return_t
441 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
442 {
443 if (object->shadow_severed) {
444 /*
445 * the shadow chain was severed
446 * just have to return an error at this point
447 */
448 if (m != VM_PAGE_NULL)
449 VM_PAGE_FREE(m);
450 vm_fault_cleanup(object, first_m);
451
452 thread_interrupt_level(interruptible_state);
453
454 return (VM_FAULT_MEMORY_ERROR);
455 }
456 if (vm_backing_store_low) {
457 /*
458 * are we protecting the system from
459 * backing store exhaustion? If so,
460 * sleep unless we are privileged.
461 */
462 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
463
464 if (m != VM_PAGE_NULL)
465 VM_PAGE_FREE(m);
466 vm_fault_cleanup(object, first_m);
467
468 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
469
470 thread_block(THREAD_CONTINUE_NULL);
471 thread_interrupt_level(interruptible_state);
472
473 return (VM_FAULT_RETRY);
474 }
475 }
476 if (VM_PAGE_ZFILL_THROTTLED()) {
477 /*
478 * we're throttling zero-fills...
479 * treat this as if we couldn't grab a page
480 */
481 if (m != VM_PAGE_NULL)
482 VM_PAGE_FREE(m);
483 vm_fault_cleanup(object, first_m);
484
485 thread_interrupt_level(interruptible_state);
486
487 return (VM_FAULT_MEMORY_SHORTAGE);
488 }
489 return (VM_FAULT_SUCCESS);
490 }
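/*
 * Summary of vm_fault_check() results: VM_FAULT_MEMORY_ERROR if the shadow
 * chain was severed, VM_FAULT_RETRY if we slept because backing store is low
 * and the caller is not privileged, VM_FAULT_MEMORY_SHORTAGE if zero-fills
 * are currently throttled, and VM_FAULT_SUCCESS otherwise.  In every
 * non-success case the page and "first_m" have already been cleaned up and
 * the interrupt level restored.
 */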
491
492
493 /*
494 * do the work to zero fill a page and
495 * inject it into the correct paging queue
496 *
497 * m->object must be locked
498 * page queue lock must NOT be held
499 */
500 static int
501 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
502 {
503 int my_fault = DBG_ZERO_FILL_FAULT;
504
505 /*
506 * This is a zero-fill page fault...
507 *
508 * Checking the page lock is a waste of
509 * time; this page was absent, so
510 * it can't be page locked by a pager.
511 *
512 * we also consider it undefined
513 * with respect to instruction
514 * execution. i.e. it is the responsibility
515 * of higher layers to call for an instruction
516 * sync after changing the contents and before
517 * sending a program into this area. We
518 * choose this approach for performance
519 */
520 m->pmapped = TRUE;
521
522 m->cs_validated = FALSE;
523 m->cs_tainted = FALSE;
524
525 if (no_zero_fill == TRUE)
526 my_fault = DBG_NZF_PAGE_FAULT;
527 else {
528 vm_page_zero_fill(m);
529
530 VM_STAT_INCR(zero_fill_count);
531 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
532 }
533 assert(!m->laundry);
534 assert(m->object != kernel_object);
535 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
536
537 if (!IP_VALID(memory_manager_default) &&
538 (m->object->purgable == VM_PURGABLE_DENY ||
539 m->object->purgable == VM_PURGABLE_NONVOLATILE)) {
540 vm_page_lock_queues();
541
542 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
543 m->throttled = TRUE;
544 vm_page_throttled_count++;
545
546 vm_page_unlock_queues();
547 } else {
548 if (m->object->size > VM_ZF_OBJECT_SIZE_THRESHOLD) {
549 m->zero_fill = TRUE;
550 OSAddAtomic(1, (SInt32 *)&vm_zf_count);
551 }
552 }
553 return (my_fault);
554 }
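/*
 * Summary of vm_fault_zero_page(): returns DBG_NZF_PAGE_FAULT when the
 * caller asked to skip the actual zeroing, DBG_ZERO_FILL_FAULT otherwise.
 * With no default pager and a denied or non-volatile purgable object the
 * page is placed on the throttled queue; otherwise, zero-filled pages in
 * objects larger than VM_ZF_OBJECT_SIZE_THRESHOLD are tagged zero_fill and
 * counted in vm_zf_count.
 */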
555
556
557 /*
558 * Routine: vm_fault_page
559 * Purpose:
560 * Find the resident page for the virtual memory
561 * specified by the given virtual memory object
562 * and offset.
563 * Additional arguments:
564 * The required permissions for the page are given
565 * in "fault_type". Desired permissions are included
566 * in "protection".
567 * fault_info is passed along to determine pagein cluster
568 * limits... it contains the expected reference pattern,
569 * cluster size if available, etc...
570 *
571 * If the desired page is known to be resident (for
572 * example, because it was previously wired down), asserting
573 * the "unwiring" parameter will speed the search.
574 *
575 * If the operation can be interrupted (by thread_abort
576 * or thread_terminate), then the "interruptible"
577 * parameter should be asserted.
578 *
579 * Results:
580 * The page containing the proper data is returned
581 * in "result_page".
582 *
583 * In/out conditions:
584 * The source object must be locked and referenced,
585 * and must donate one paging reference. The reference
586 * is not affected. The paging reference and lock are
587 * consumed.
588 *
589 * If the call succeeds, the object in which "result_page"
590 * resides is left locked and holding a paging reference.
591 * If this is not the original object, a busy page in the
592 * original object is returned in "top_page", to prevent other
593 * callers from pursuing this same data, along with a paging
594 * reference for the original object. The "top_page" should
595 * be destroyed when this guarantee is no longer required.
596 * The "result_page" is also left busy. It is not removed
597 * from the pageout queues.
598 */
599
600 vm_fault_return_t
601 vm_fault_page(
602 /* Arguments: */
603 vm_object_t first_object, /* Object to begin search */
604 vm_object_offset_t first_offset, /* Offset into object */
605 vm_prot_t fault_type, /* What access is requested */
606 boolean_t must_be_resident,/* Must page be resident? */
607 /* Modifies in place: */
608 vm_prot_t *protection, /* Protection for mapping */
609 /* Returns: */
610 vm_page_t *result_page, /* Page found, if successful */
611 vm_page_t *top_page, /* Page in top object, if
612 * not result_page. */
613 int *type_of_fault, /* if non-null, fill in with type of fault
614 * COW, zero-fill, etc... returned in trace point */
615 /* More arguments: */
616 kern_return_t *error_code, /* code if page is in error */
617 boolean_t no_zero_fill, /* don't zero fill absent pages */
618 #if MACH_PAGEMAP
619 boolean_t data_supply, /* treat as data_supply if
620 * it is a write fault and a full
621 * page is provided */
622 #else
623 __unused boolean_t data_supply,
624 #endif
625 vm_object_fault_info_t fault_info)
626 {
627 vm_page_t m;
628 vm_object_t object;
629 vm_object_offset_t offset;
630 vm_page_t first_m;
631 vm_object_t next_object;
632 vm_object_t copy_object;
633 boolean_t look_for_page;
634 vm_prot_t access_required = fault_type;
635 vm_prot_t wants_copy_flag;
636 CLUSTER_STAT(int pages_at_higher_offsets;)
637 CLUSTER_STAT(int pages_at_lower_offsets;)
638 kern_return_t wait_result;
639 boolean_t interruptible_state;
640 vm_fault_return_t error;
641 int my_fault;
642 uint32_t try_failed_count;
643 int interruptible; /* how may the fault be interrupted? */
644 memory_object_t pager;
645
646 /*
647 * MACH page map - an optional optimization where a bit map is maintained
648 * by the VM subsystem for internal objects to indicate which pages of
649 * the object currently reside on backing store. This existence map
650 * duplicates information maintained by the vnode pager. It is
651 * created at the time of the first pageout against the object, i.e.
651 * at the same time the pager for the object is created. The optimization
653 * is designed to eliminate pager interaction overhead, if it is
654 * 'known' that the page does not exist on backing store.
655 *
656 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
657 * either marked as paged out in the existence map for the object or no
658 * existence map exists for the object. MUST_ASK_PAGER() is one of the
659 * criteria in the decision to invoke the pager. It is also used as one
660 * of the criteria to terminate the scan for adjacent pages in a clustered
661 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
662 * permanent objects. Note also that if the pager for an internal object
663 * has not been created, the pager is not invoked regardless of the value
664 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
665 * for which a pager has been created.
666 *
667 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
668 * is marked as paged out in the existence map for the object.
669 * PAGED_OUT() is used to determine if a page has already been pushed
670 * into a copy object in order to avoid a redundant page out operation.
671 */
672 #if MACH_PAGEMAP
673 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
674 != VM_EXTERNAL_STATE_ABSENT)
675 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
676 == VM_EXTERNAL_STATE_EXISTS)
677 #else
678 #define MUST_ASK_PAGER(o, f) (TRUE)
679 #define PAGED_OUT(o, f) (FALSE)
680 #endif
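/*
 * Note: when MACH_PAGEMAP is not configured there is no existence map, so
 * MUST_ASK_PAGER() is always TRUE (the pager is consulted whenever one
 * exists) and PAGED_OUT() is always FALSE (a copy-object push is never
 * skipped on that basis).
 */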
681
682 /*
683 * Recovery actions
684 */
685 #define PREPARE_RELEASE_PAGE(m) \
686 MACRO_BEGIN \
687 vm_page_lock_queues(); \
688 MACRO_END
689
690 #define DO_RELEASE_PAGE(m) \
691 MACRO_BEGIN \
692 PAGE_WAKEUP_DONE(m); \
693 if (!m->active && !m->inactive && !m->throttled)\
694 vm_page_activate(m); \
695 vm_page_unlock_queues(); \
696 MACRO_END
697
698 #define RELEASE_PAGE(m) \
699 MACRO_BEGIN \
700 PREPARE_RELEASE_PAGE(m); \
701 DO_RELEASE_PAGE(m); \
702 MACRO_END
703
704 #if TRACEFAULTPAGE
705 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
706 #endif
707
708
709 #if MACH_KDB
710 /*
711 * If there are watchpoints set, then
712 * we don't want to give away write permission
713 * on a read fault. Make the task write fault,
714 * so that the watchpoint code notices the access.
715 */
716 if (db_watchpoint_list) {
717 /*
718 * If we aren't asking for write permission,
719 * then don't give it away. We're using write
720 * faults to set the dirty bit.
721 */
722 if (!(fault_type & VM_PROT_WRITE))
723 *protection &= ~VM_PROT_WRITE;
724 }
725 #endif /* MACH_KDB */
726
727 interruptible = fault_info->interruptible;
728 interruptible_state = thread_interrupt_level(interruptible);
729
730 /*
731 * INVARIANTS (through entire routine):
732 *
733 * 1) At all times, we must either have the object
734 * lock or a busy page in some object to prevent
735 * some other thread from trying to bring in
736 * the same page.
737 *
738 * Note that we cannot hold any locks during the
739 * pager access or when waiting for memory, so
740 * we use a busy page then.
741 *
742 * 2) To prevent another thread from racing us down the
743 * shadow chain and entering a new page in the top
744 * object before we do, we must keep a busy page in
745 * the top object while following the shadow chain.
746 *
747 * 3) We must increment paging_in_progress on any object
748 * for which we have a busy page before dropping
749 * the object lock
750 *
751 * 4) We leave busy pages on the pageout queues.
752 * If the pageout daemon comes across a busy page,
753 * it will remove the page from the pageout queues.
754 */
755
756 object = first_object;
757 offset = first_offset;
758 first_m = VM_PAGE_NULL;
759 access_required = fault_type;
760
761
762 XPR(XPR_VM_FAULT,
763 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
764 (integer_t)object, offset, fault_type, *protection, 0);
765
766 /*
767 * default type of fault
768 */
769 my_fault = DBG_CACHE_HIT_FAULT;
770
771 while (TRUE) {
772 #if TRACEFAULTPAGE
773 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
774 #endif
775 if (!object->alive) {
776 /*
777 * object is no longer valid
778 * clean up and return error
779 */
780 vm_fault_cleanup(object, first_m);
781 thread_interrupt_level(interruptible_state);
782
783 return (VM_FAULT_MEMORY_ERROR);
784 }
785
786 /*
787 * See whether the page at 'offset' is resident
788 */
789 m = vm_page_lookup(object, offset);
790 #if TRACEFAULTPAGE
791 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
792 #endif
793 if (m != VM_PAGE_NULL) {
794
795 if (m->busy) {
796 /*
797 * The page is being brought in,
798 * wait for it and then retry.
799 *
800 * A possible optimization: if the page
801 * is known to be resident, we can ignore
802 * pages that are absent (regardless of
803 * whether they're busy).
804 */
805 #if TRACEFAULTPAGE
806 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
807 #endif
808 wait_result = PAGE_SLEEP(object, m, interruptible);
809 XPR(XPR_VM_FAULT,
810 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
811 (integer_t)object, offset,
812 (integer_t)m, 0, 0);
813 counter(c_vm_fault_page_block_busy_kernel++);
814
815 if (wait_result != THREAD_AWAKENED) {
816 vm_fault_cleanup(object, first_m);
817 thread_interrupt_level(interruptible_state);
818
819 if (wait_result == THREAD_RESTART)
820 return (VM_FAULT_RETRY);
821 else
822 return (VM_FAULT_INTERRUPTED);
823 }
824 continue;
825 }
826
827 if (m->phys_page == vm_page_guard_addr) {
828 /*
829 * Guard page: off limits !
830 */
831 if (fault_type == VM_PROT_NONE) {
832 /*
833 * The fault is not requesting any
834 * access to the guard page, so it must
835 * be just to wire or unwire it.
836 * Let's pretend it succeeded...
837 */
838 m->busy = TRUE;
839 *result_page = m;
840 assert(first_m == VM_PAGE_NULL);
841 *top_page = first_m;
842 if (type_of_fault)
843 *type_of_fault = DBG_GUARD_FAULT;
844 return VM_FAULT_SUCCESS;
845 } else {
846 /*
847 * The fault requests access to the
848 * guard page: let's deny that !
849 */
850 vm_fault_cleanup(object, first_m);
851 thread_interrupt_level(interruptible_state);
852 return VM_FAULT_MEMORY_ERROR;
853 }
854 }
855
856 if (m->error) {
857 /*
858 * The page is in error, give up now.
859 */
860 #if TRACEFAULTPAGE
861 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
862 #endif
863 if (error_code)
864 *error_code = KERN_MEMORY_ERROR;
865 VM_PAGE_FREE(m);
866
867 vm_fault_cleanup(object, first_m);
868 thread_interrupt_level(interruptible_state);
869
870 return (VM_FAULT_MEMORY_ERROR);
871 }
872 if (m->restart) {
873 /*
874 * The pager wants us to restart
875 * at the top of the chain,
876 * typically because it has moved the
877 * page to another pager, then do so.
878 */
879 #if TRACEFAULTPAGE
880 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
881 #endif
882 VM_PAGE_FREE(m);
883
884 vm_fault_cleanup(object, first_m);
885 thread_interrupt_level(interruptible_state);
886
887 return (VM_FAULT_RETRY);
888 }
889 if (m->absent) {
890 /*
891 * The page isn't busy, but is absent,
892 * therefore it's deemed "unavailable".
893 *
894 * Remove the non-existent page (unless it's
895 * in the top object) and move on down to the
896 * next object (if there is one).
897 */
898 #if TRACEFAULTPAGE
899 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
900 #endif
901 next_object = object->shadow;
902
903 if (next_object == VM_OBJECT_NULL) {
904 /*
905 * Absent page at bottom of shadow
906 * chain; zero fill the page we left
907 * busy in the first object, and free
908 * the absent page.
909 */
910 assert(!must_be_resident);
911
912 /*
913 * check for any conditions that prevent
914 * us from creating a new zero-fill page
915 * vm_fault_check will do all of the
916 * fault cleanup in the case of an error condition
917 * including resetting the thread_interrupt_level
918 */
919 error = vm_fault_check(object, m, first_m, interruptible_state);
920
921 if (error != VM_FAULT_SUCCESS)
922 return (error);
923
924 XPR(XPR_VM_FAULT,
925 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
926 (integer_t)object, offset,
927 (integer_t)m,
928 (integer_t)first_object, 0);
929
930 if (object != first_object) {
931 /*
932 * free the absent page we just found
933 */
934 VM_PAGE_FREE(m);
935
936 /*
937 * drop reference and lock on current object
938 */
939 vm_object_paging_end(object);
940 vm_object_unlock(object);
941
942 /*
943 * grab the original page we
944 * 'soldered' in place and
945 * retake lock on 'first_object'
946 */
947 m = first_m;
948 first_m = VM_PAGE_NULL;
949
950 object = first_object;
951 offset = first_offset;
952
953 vm_object_lock(object);
954 } else {
955 /*
956 * we're going to use the absent page we just found
957 * so convert it to a 'busy' page
958 */
959 m->absent = FALSE;
960 m->busy = TRUE;
961 }
962 /*
963 * zero-fill the page and put it on
964 * the correct paging queue
965 */
966 my_fault = vm_fault_zero_page(m, no_zero_fill);
967
968 break;
969 } else {
970 if (must_be_resident)
971 vm_object_paging_end(object);
972 else if (object != first_object) {
973 vm_object_paging_end(object);
974 VM_PAGE_FREE(m);
975 } else {
976 first_m = m;
977 m->absent = FALSE;
978 m->busy = TRUE;
979
980 vm_page_lockspin_queues();
981 VM_PAGE_QUEUES_REMOVE(m);
982 vm_page_unlock_queues();
983 }
984 XPR(XPR_VM_FAULT,
985 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
986 (integer_t)object, offset,
987 (integer_t)next_object,
988 offset+object->shadow_offset,0);
989
990 offset += object->shadow_offset;
991 fault_info->lo_offset += object->shadow_offset;
992 fault_info->hi_offset += object->shadow_offset;
993 access_required = VM_PROT_READ;
994
995 vm_object_lock(next_object);
996 vm_object_unlock(object);
997 object = next_object;
998 vm_object_paging_begin(object);
999
1000 /*
1001 * reset to default type of fault
1002 */
1003 my_fault = DBG_CACHE_HIT_FAULT;
1004
1005 continue;
1006 }
1007 }
1008 if ((m->cleaning)
1009 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1010 && (fault_type & VM_PROT_WRITE)) {
1011 /*
1012 * This is a copy-on-write fault that will
1013 * cause us to revoke access to this page, but
1014 * this page is in the process of being cleaned
1015 * in a clustered pageout. We must wait until
1016 * the cleaning operation completes before
1017 * revoking access to the original page,
1018 * otherwise we might attempt to remove a
1019 * wired mapping.
1020 */
1021 #if TRACEFAULTPAGE
1022 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1023 #endif
1024 XPR(XPR_VM_FAULT,
1025 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1026 (integer_t)object, offset,
1027 (integer_t)m, 0, 0);
1028 /*
1029 * take an extra ref so that object won't die
1030 */
1031 vm_object_reference_locked(object);
1032
1033 vm_fault_cleanup(object, first_m);
1034
1035 counter(c_vm_fault_page_block_backoff_kernel++);
1036 vm_object_lock(object);
1037 assert(object->ref_count > 0);
1038
1039 m = vm_page_lookup(object, offset);
1040
1041 if (m != VM_PAGE_NULL && m->cleaning) {
1042 PAGE_ASSERT_WAIT(m, interruptible);
1043
1044 vm_object_unlock(object);
1045 wait_result = thread_block(THREAD_CONTINUE_NULL);
1046 vm_object_deallocate(object);
1047
1048 goto backoff;
1049 } else {
1050 vm_object_unlock(object);
1051
1052 vm_object_deallocate(object);
1053 thread_interrupt_level(interruptible_state);
1054
1055 return (VM_FAULT_RETRY);
1056 }
1057 }
1058 if (type_of_fault == NULL && m->speculative) {
1059 /*
1060 * If we were passed a non-NULL pointer for
1061 * "type_of_fault", than we came from
1062 * vm_fault... we'll let it deal with
1063 * this condition, since it
1064 * needs to see m->speculative to correctly
1065 * account the pageins, otherwise...
1066 * take it off the speculative queue, we'll
1067 * let the caller of vm_fault_page deal
1068 * with getting it onto the correct queue
1069 */
1070 vm_page_lockspin_queues();
1071 VM_PAGE_QUEUES_REMOVE(m);
1072 vm_page_unlock_queues();
1073 }
1074
1075 if (m->encrypted) {
1076 /*
1077 * ENCRYPTED SWAP:
1078 * the user needs access to a page that we
1079 * encrypted before paging it out.
1080 * Decrypt the page now.
1081 * Keep it busy to prevent anyone from
1082 * accessing it during the decryption.
1083 */
1084 m->busy = TRUE;
1085 vm_page_decrypt(m, 0);
1086 assert(object == m->object);
1087 assert(m->busy);
1088 PAGE_WAKEUP_DONE(m);
1089
1090 /*
1091 * Retry from the top, in case
1092 * something changed while we were
1093 * decrypting.
1094 */
1095 continue;
1096 }
1097 ASSERT_PAGE_DECRYPTED(m);
1098
1099 if (m->object->code_signed) {
1100 /*
1101 * CODE SIGNING:
1102 * We just paged in a page from a signed
1103 * memory object but we don't need to
1104 * validate it now. We'll validate it if and
1105 * when it gets mapped into a user address
1106 * space for the first time or when the page
1107 * gets copied to another object as a result
1108 * of a copy-on-write.
1109 */
1110 }
1111
1112 /*
1113 * We mark the page busy and leave it on
1114 * the pageout queues. If the pageout
1115 * daemon comes across it, then it will
1116 * remove the page from the queue, but not the object
1117 */
1118 #if TRACEFAULTPAGE
1119 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1120 #endif
1121 XPR(XPR_VM_FAULT,
1122 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1123 (integer_t)object, offset, (integer_t)m, 0, 0);
1124 assert(!m->busy);
1125 assert(!m->absent);
1126
1127 m->busy = TRUE;
1128 break;
1129 }
1130
1131
1132 /*
1133 * we get here when there is no page present in the object at
1134 * the offset we're interested in... we'll allocate a page
1135 * at this point if the pager associated with
1136 * this object can provide the data or we're the top object...
1137 * object is locked; m == NULL
1138 */
1139 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1140
1141 #if TRACEFAULTPAGE
1142 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1143 #endif
1144 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1145 /*
1146 * Allocate a new page for this object/offset pair
1147 */
1148 m = vm_page_grab();
1149 #if TRACEFAULTPAGE
1150 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1151 #endif
1152 if (m == VM_PAGE_NULL) {
1153
1154 vm_fault_cleanup(object, first_m);
1155 thread_interrupt_level(interruptible_state);
1156
1157 return (VM_FAULT_MEMORY_SHORTAGE);
1158 }
1159 vm_page_insert(m, object, offset);
1160 }
1161 if (look_for_page && !must_be_resident) {
1162 kern_return_t rc;
1163
1164 /*
1165 * If the memory manager is not ready, we
1166 * cannot make requests.
1167 */
1168 if (!object->pager_ready) {
1169 #if TRACEFAULTPAGE
1170 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1171 #endif
1172 if (m != VM_PAGE_NULL)
1173 VM_PAGE_FREE(m);
1174
1175 XPR(XPR_VM_FAULT,
1176 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1177 (integer_t)object, offset, 0, 0, 0);
1178
1179 /*
1180 * take an extra ref so object won't die
1181 */
1182 vm_object_reference_locked(object);
1183 vm_fault_cleanup(object, first_m);
1184 counter(c_vm_fault_page_block_backoff_kernel++);
1185
1186 vm_object_lock(object);
1187 assert(object->ref_count > 0);
1188
1189 if (!object->pager_ready) {
1190 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1191
1192 vm_object_unlock(object);
1193 if (wait_result == THREAD_WAITING)
1194 wait_result = thread_block(THREAD_CONTINUE_NULL);
1195 vm_object_deallocate(object);
1196
1197 goto backoff;
1198 } else {
1199 vm_object_unlock(object);
1200 vm_object_deallocate(object);
1201 thread_interrupt_level(interruptible_state);
1202
1203 return (VM_FAULT_RETRY);
1204 }
1205 }
1206 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1207 /*
1208 * If there are too many outstanding page
1209 * requests pending on this external object, we
1210 * wait for them to be resolved now.
1211 */
1212 #if TRACEFAULTPAGE
1213 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1214 #endif
1215 if (m != VM_PAGE_NULL)
1216 VM_PAGE_FREE(m);
1217 /*
1218 * take an extra ref so object won't die
1219 */
1220 vm_object_reference_locked(object);
1221
1222 vm_fault_cleanup(object, first_m);
1223
1224 counter(c_vm_fault_page_block_backoff_kernel++);
1225
1226 vm_object_lock(object);
1227 assert(object->ref_count > 0);
1228
1229 if (object->paging_in_progress > vm_object_pagein_throttle) {
1230 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_IN_PROGRESS, interruptible);
1231
1232 vm_object_unlock(object);
1233 wait_result = thread_block(THREAD_CONTINUE_NULL);
1234 vm_object_deallocate(object);
1235
1236 goto backoff;
1237 } else {
1238 vm_object_unlock(object);
1239 vm_object_deallocate(object);
1240 thread_interrupt_level(interruptible_state);
1241
1242 return (VM_FAULT_RETRY);
1243 }
1244 }
1245 if (m != VM_PAGE_NULL) {
1246 /*
1247 * Indicate that the page is waiting for data
1248 * from the memory manager.
1249 */
1250 m->list_req_pending = TRUE;
1251 m->absent = TRUE;
1252 }
1253
1254 #if TRACEFAULTPAGE
1255 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1256 #endif
1257
1258 /*
1259 * It's possible someone called vm_object_destroy while we weren't
1260 * holding the object lock. If that has happened, then bail out
1261 * here.
1262 */
1263
1264 pager = object->pager;
1265
1266 if (pager == MEMORY_OBJECT_NULL) {
1267 vm_fault_cleanup(object, first_m);
1268 thread_interrupt_level(interruptible_state);
1269 return VM_FAULT_MEMORY_ERROR;
1270 }
1271
1272 /*
1273 * We have an absent page in place for the faulting offset,
1274 * so we can release the object lock.
1275 */
1276
1277 vm_object_unlock(object);
1278
1279 /*
1280 * If this object uses a copy_call strategy,
1281 * and we are interested in a copy of this object
1282 * (having gotten here only by following a
1283 * shadow chain), then tell the memory manager
1284 * via a flag added to the desired_access
1285 * parameter, so that it can detect a race
1286 * between our walking down the shadow chain
1287 * and its pushing pages up into a copy of
1288 * the object that it manages.
1289 */
1290 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1291 wants_copy_flag = VM_PROT_WANTS_COPY;
1292 else
1293 wants_copy_flag = VM_PROT_NONE;
1294
1295 XPR(XPR_VM_FAULT,
1296 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1297 (integer_t)object, offset, (integer_t)m,
1298 access_required | wants_copy_flag, 0);
1299
1300 /*
1301 * Call the memory manager to retrieve the data.
1302 */
1303 rc = memory_object_data_request(
1304 pager,
1305 offset + object->paging_offset,
1306 PAGE_SIZE,
1307 access_required | wants_copy_flag,
1308 (memory_object_fault_info_t)fault_info);
1309
1310 #if TRACEFAULTPAGE
1311 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1312 #endif
1313 vm_object_lock(object);
1314
1315 if (rc != KERN_SUCCESS) {
1316
1317 vm_fault_cleanup(object, first_m);
1318 thread_interrupt_level(interruptible_state);
1319
1320 return ((rc == MACH_SEND_INTERRUPTED) ?
1321 VM_FAULT_INTERRUPTED :
1322 VM_FAULT_MEMORY_ERROR);
1323 }
1324 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_mode & TH_MODE_ABORT)) {
1325
1326 vm_fault_cleanup(object, first_m);
1327 thread_interrupt_level(interruptible_state);
1328
1329 return (VM_FAULT_INTERRUPTED);
1330 }
1331 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1332 /*
1333 * No page here means that the object we
1334 * initially looked up was "physically
1335 * contiguous" (i.e. device memory). However,
1336 * with Virtual VRAM, the object might not
1337 * be backed by that device memory anymore,
1338 * so we're done here only if the object is
1339 * still "phys_contiguous".
1340 * Otherwise, if the object is no longer
1341 * "phys_contiguous", we need to retry the
1342 * page fault against the object's new backing
1343 * store (different memory object).
1344 */
1345 break;
1346 }
1347 /*
1348 * potentially a pagein fault
1349 * if we make it through the state checks
1350 * above, then we'll count it as such
1351 */
1352 my_fault = DBG_PAGEIN_FAULT;
1353
1354 /*
1355 * Retry with same object/offset, since new data may
1356 * be in a different page (i.e., m is meaningless at
1357 * this point).
1358 */
1359 continue;
1360 }
1361
1362 /*
1363 * We get here if the object has no pager, or an existence map
1364 * exists and indicates the page isn't present on the pager
1365 * or we're unwiring a page. If a pager exists, but there
1366 * is no existence map, then the m->absent case above handles
1367 * the ZF case when the pager can't provide the page
1368 */
1369 #if TRACEFAULTPAGE
1370 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1371 #endif
1372 if (object == first_object)
1373 first_m = m;
1374 else
1375 assert(m == VM_PAGE_NULL);
1376
1377 XPR(XPR_VM_FAULT,
1378 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1379 (integer_t)object, offset, (integer_t)m,
1380 (integer_t)object->shadow, 0);
1381
1382 next_object = object->shadow;
1383
1384 if (next_object == VM_OBJECT_NULL) {
1385 /*
1386 * we've hit the bottom of the shadow chain,
1387 * fill the page in the top object with zeros.
1388 */
1389 assert(!must_be_resident);
1390
1391 if (object != first_object) {
1392 vm_object_paging_end(object);
1393 vm_object_unlock(object);
1394
1395 object = first_object;
1396 offset = first_offset;
1397 vm_object_lock(object);
1398 }
1399 m = first_m;
1400 assert(m->object == object);
1401 first_m = VM_PAGE_NULL;
1402
1403 /*
1404 * check for any conditions that prevent
1405 * us from creating a new zero-fill page
1406 * vm_fault_check will do all of the
1407 * fault cleanup in the case of an error condition
1408 * including resetting the thread_interrupt_level
1409 */
1410 error = vm_fault_check(object, m, first_m, interruptible_state);
1411
1412 if (error != VM_FAULT_SUCCESS)
1413 return (error);
1414
1415 if (m == VM_PAGE_NULL) {
1416 m = vm_page_grab();
1417
1418 if (m == VM_PAGE_NULL) {
1419 vm_fault_cleanup(object, VM_PAGE_NULL);
1420 thread_interrupt_level(interruptible_state);
1421
1422 return (VM_FAULT_MEMORY_SHORTAGE);
1423 }
1424 vm_page_insert(m, object, offset);
1425 }
1426 my_fault = vm_fault_zero_page(m, no_zero_fill);
1427
1428 break;
1429
1430 } else {
1431 /*
1432 * Move on to the next object. Lock the next
1433 * object before unlocking the current one.
1434 */
1435 if ((object != first_object) || must_be_resident)
1436 vm_object_paging_end(object);
1437
1438 offset += object->shadow_offset;
1439 fault_info->lo_offset += object->shadow_offset;
1440 fault_info->hi_offset += object->shadow_offset;
1441 access_required = VM_PROT_READ;
1442
1443 vm_object_lock(next_object);
1444 vm_object_unlock(object);
1445
1446 object = next_object;
1447 vm_object_paging_begin(object);
1448 }
1449 }
1450
1451 /*
1452 * PAGE HAS BEEN FOUND.
1453 *
1454 * This page (m) is:
1455 * busy, so that we can play with it;
1456 * not absent, so that nobody else will fill it;
1457 * possibly eligible for pageout;
1458 *
1459 * The top-level page (first_m) is:
1460 * VM_PAGE_NULL if the page was found in the
1461 * top-level object;
1462 * busy, not absent, and ineligible for pageout.
1463 *
1464 * The current object (object) is locked. A paging
1465 * reference is held for the current and top-level
1466 * objects.
1467 */
1468
1469 #if TRACEFAULTPAGE
1470 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1471 #endif
1472 #if EXTRA_ASSERTIONS
1473 if (m != VM_PAGE_NULL) {
1474 assert(m->busy && !m->absent);
1475 assert((first_m == VM_PAGE_NULL) ||
1476 (first_m->busy && !first_m->absent &&
1477 !first_m->active && !first_m->inactive));
1478 }
1479 #endif /* EXTRA_ASSERTIONS */
1480
1481 /*
1482 * ENCRYPTED SWAP:
1483 * If we found a page, we must have decrypted it before we
1484 * get here...
1485 */
1486 if (m != VM_PAGE_NULL) {
1487 ASSERT_PAGE_DECRYPTED(m);
1488 }
1489
1490 XPR(XPR_VM_FAULT,
1491 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1492 (integer_t)object, offset, (integer_t)m,
1493 (integer_t)first_object, (integer_t)first_m);
1494
1495 /*
1496 * If the page is being written, but isn't
1497 * already owned by the top-level object,
1498 * we have to copy it into a new page owned
1499 * by the top-level object.
1500 */
1501 if ((object != first_object) && (m != VM_PAGE_NULL)) {
1502
1503 #if TRACEFAULTPAGE
1504 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1505 #endif
1506 if (fault_type & VM_PROT_WRITE) {
1507 vm_page_t copy_m;
1508
1509 /*
1510 * We only really need to copy if we
1511 * want to write it.
1512 */
1513 assert(!must_be_resident);
1514
1515 /*
1516 * are we protecting the system from
1517 * backing store exhaustion? If so,
1518 * sleep unless we are privileged.
1519 */
1520 if (vm_backing_store_low) {
1521 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1522
1523 RELEASE_PAGE(m);
1524 vm_fault_cleanup(object, first_m);
1525
1526 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1527
1528 thread_block(THREAD_CONTINUE_NULL);
1529 thread_interrupt_level(interruptible_state);
1530
1531 return (VM_FAULT_RETRY);
1532 }
1533 }
1534 /*
1535 * If we try to collapse first_object at this
1536 * point, we may deadlock when we try to get
1537 * the lock on an intermediate object (since we
1538 * have the bottom object locked). We can't
1539 * unlock the bottom object, because the page
1540 * we found may move (by collapse) if we do.
1541 *
1542 * Instead, we first copy the page. Then, when
1543 * we have no more use for the bottom object,
1544 * we unlock it and try to collapse.
1545 *
1546 * Note that we copy the page even if we didn't
1547 * need to... that's the breaks.
1548 */
1549
1550 /*
1551 * Allocate a page for the copy
1552 */
1553 copy_m = vm_page_grab();
1554
1555 if (copy_m == VM_PAGE_NULL) {
1556 RELEASE_PAGE(m);
1557
1558 vm_fault_cleanup(object, first_m);
1559 thread_interrupt_level(interruptible_state);
1560
1561 return (VM_FAULT_MEMORY_SHORTAGE);
1562 }
1563 XPR(XPR_VM_FAULT,
1564 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1565 (integer_t)object, offset,
1566 (integer_t)m, (integer_t)copy_m, 0);
1567
1568 vm_page_copy(m, copy_m);
1569
1570 /*
1571 * If another map is truly sharing this
1572 * page with us, we have to flush all
1573 * uses of the original page, since we
1574 * can't distinguish those which want the
1575 * original from those which need the
1576 * new copy.
1577 *
1578 * XXXO If we know that only one map has
1579 * access to this page, then we could
1580 * avoid the pmap_disconnect() call.
1581 */
1582 if (m->pmapped)
1583 pmap_disconnect(m->phys_page);
1584
1585 assert(!m->cleaning);
1586
1587 /*
1588 * We no longer need the old page or object.
1589 */
1590 PAGE_WAKEUP_DONE(m);
1591 vm_object_paging_end(object);
1592 vm_object_unlock(object);
1593
1594 my_fault = DBG_COW_FAULT;
1595 VM_STAT_INCR(cow_faults);
1596 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1597 current_task()->cow_faults++;
1598
1599 object = first_object;
1600 offset = first_offset;
1601
1602 vm_object_lock(object);
1603 /*
1604 * get rid of the place holder
1605 * page that we soldered in earlier
1606 */
1607 VM_PAGE_FREE(first_m);
1608 first_m = VM_PAGE_NULL;
1609
1610 /*
1611 * and replace it with the
1612 * page we just copied into
1613 */
1614 assert(copy_m->busy);
1615 vm_page_insert(copy_m, object, offset);
1616 copy_m->dirty = TRUE;
1617
1618 m = copy_m;
1619 /*
1620 * Now that we've gotten the copy out of the
1621 * way, let's try to collapse the top object.
1622 * But we have to play ugly games with
1623 * paging_in_progress to do that...
1624 */
1625 vm_object_paging_end(object);
1626 vm_object_collapse(object, offset, TRUE);
1627 vm_object_paging_begin(object);
1628
1629 } else
1630 *protection &= (~VM_PROT_WRITE);
1631 }
1632 /*
1633 * Now check whether the page needs to be pushed into the
1634 * copy object. The use of asymmetric copy on write for
1635 * shared temporary objects means that we may do two copies to
1636 * satisfy the fault; one above to get the page from a
1637 * shadowed object, and one here to push it into the copy.
1638 */
1639 try_failed_count = 0;
1640
1641 while ((copy_object = first_object->copy) != VM_OBJECT_NULL && (m != VM_PAGE_NULL)) {
1642 vm_object_offset_t copy_offset;
1643 vm_page_t copy_m;
1644
1645 #if TRACEFAULTPAGE
1646 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1647 #endif
1648 /*
1649 * If the page is being written, but hasn't been
1650 * copied to the copy-object, we have to copy it there.
1651 */
1652 if ((fault_type & VM_PROT_WRITE) == 0) {
1653 *protection &= ~VM_PROT_WRITE;
1654 break;
1655 }
1656
1657 /*
1658 * If the page was guaranteed to be resident,
1659 * we must have already performed the copy.
1660 */
1661 if (must_be_resident)
1662 break;
1663
1664 /*
1665 * Try to get the lock on the copy_object.
1666 */
1667 if (!vm_object_lock_try(copy_object)) {
1668
1669 vm_object_unlock(object);
1670 try_failed_count++;
1671
1672 mutex_pause(try_failed_count); /* wait a bit */
1673 vm_object_lock(object);
1674
1675 continue;
1676 }
1677 try_failed_count = 0;
1678
1679 /*
1680 * Make another reference to the copy-object,
1681 * to keep it from disappearing during the
1682 * copy.
1683 */
1684 vm_object_reference_locked(copy_object);
1685
1686 /*
1687 * Does the page exist in the copy?
1688 */
1689 copy_offset = first_offset - copy_object->shadow_offset;
1690
1691 if (copy_object->size <= copy_offset)
1692 /*
1693 * Copy object doesn't cover this page -- do nothing.
1694 */
1695 ;
1696 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1697 /*
1698 * Page currently exists in the copy object
1699 */
1700 if (copy_m->busy) {
1701 /*
1702 * If the page is being brought
1703 * in, wait for it and then retry.
1704 */
1705 RELEASE_PAGE(m);
1706
1707 /*
1708 * take an extra ref so object won't die
1709 */
1710 vm_object_reference_locked(copy_object);
1711 vm_object_unlock(copy_object);
1712 vm_fault_cleanup(object, first_m);
1713 counter(c_vm_fault_page_block_backoff_kernel++);
1714
1715 vm_object_lock(copy_object);
1716 assert(copy_object->ref_count > 0);
1717 VM_OBJ_RES_DECR(copy_object);
1718 vm_object_lock_assert_exclusive(copy_object);
1719 copy_object->ref_count--;
1720 assert(copy_object->ref_count > 0);
1721 copy_m = vm_page_lookup(copy_object, copy_offset);
1722 /*
1723 * ENCRYPTED SWAP:
1724 * it's OK if the "copy_m" page is encrypted,
1725 * because we're not moving it nor handling its
1726 * contents.
1727 */
1728 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1729 PAGE_ASSERT_WAIT(copy_m, interruptible);
1730
1731 vm_object_unlock(copy_object);
1732 wait_result = thread_block(THREAD_CONTINUE_NULL);
1733 vm_object_deallocate(copy_object);
1734
1735 goto backoff;
1736 } else {
1737 vm_object_unlock(copy_object);
1738 vm_object_deallocate(copy_object);
1739 thread_interrupt_level(interruptible_state);
1740
1741 return (VM_FAULT_RETRY);
1742 }
1743 }
1744 }
1745 else if (!PAGED_OUT(copy_object, copy_offset)) {
1746 /*
1747 * If PAGED_OUT is TRUE, then the page used to exist
1748 * in the copy-object, and has already been paged out.
1749 * We don't need to repeat this. If PAGED_OUT is
1750 * FALSE, then either we don't know (!pager_created,
1751 * for example) or it hasn't been paged out.
1752 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
1753 * We must copy the page to the copy object.
1754 */
1755
1756 if (vm_backing_store_low) {
1757 /*
1758 * are we protecting the system from
1759 * backing store exhaustion? If so,
1760 * sleep unless we are privileged.
1761 */
1762 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1763 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1764
1765 RELEASE_PAGE(m);
1766 VM_OBJ_RES_DECR(copy_object);
1767 vm_object_lock_assert_exclusive(copy_object);
1768 copy_object->ref_count--;
1769 assert(copy_object->ref_count > 0);
1770
1771 vm_object_unlock(copy_object);
1772 vm_fault_cleanup(object, first_m);
1773 thread_block(THREAD_CONTINUE_NULL);
1774 thread_interrupt_level(interruptible_state);
1775
1776 return (VM_FAULT_RETRY);
1777 }
1778 }
1779 /*
1780 * Allocate a page for the copy
1781 */
1782 copy_m = vm_page_alloc(copy_object, copy_offset);
1783
1784 if (copy_m == VM_PAGE_NULL) {
1785 RELEASE_PAGE(m);
1786
1787 VM_OBJ_RES_DECR(copy_object);
1788 vm_object_lock_assert_exclusive(copy_object);
1789 copy_object->ref_count--;
1790 assert(copy_object->ref_count > 0);
1791
1792 vm_object_unlock(copy_object);
1793 vm_fault_cleanup(object, first_m);
1794 thread_interrupt_level(interruptible_state);
1795
1796 return (VM_FAULT_MEMORY_SHORTAGE);
1797 }
1798 /*
1799 * Must copy page into copy-object.
1800 */
1801 vm_page_copy(m, copy_m);
1802
1803 /*
1804 * If the old page was in use by any users
1805 * of the copy-object, it must be removed
1806 * from all pmaps. (We can't know which
1807 * pmaps use it.)
1808 */
1809 if (m->pmapped)
1810 pmap_disconnect(m->phys_page);
1811
1812 /*
1813 * If there's a pager, then immediately
1814 * page out this page, using the "initialize"
1815 * option. Else, we use the copy.
1816 */
1817 if ((!copy_object->pager_created)
1818 #if MACH_PAGEMAP
1819 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
1820 #endif
1821 ) {
1822
1823 vm_page_lockspin_queues();
1824 assert(!m->cleaning);
1825 vm_page_activate(copy_m);
1826 vm_page_unlock_queues();
1827
1828 copy_m->dirty = TRUE;
1829 PAGE_WAKEUP_DONE(copy_m);
1830 }
1831 else {
1832 assert(copy_m->busy == TRUE);
1833 assert(!m->cleaning);
1834
1835 /*
1836 * dirty is protected by the object lock
1837 */
1838 copy_m->dirty = TRUE;
1839
1840 /*
1841 * The page is already ready for pageout:
1842 * not on pageout queues and busy.
1843 * Unlock everything except the
1844 * copy_object itself.
1845 */
1846 vm_object_unlock(object);
1847
1848 /*
1849 * Write the page to the copy-object,
1850 * flushing it from the kernel.
1851 */
1852 vm_pageout_initialize_page(copy_m);
1853
1854 /*
1855 * Since the pageout may have
1856 * temporarily dropped the
1857 * copy_object's lock, we
1858 * check whether we'll have
1859 * to deallocate the hard way.
1860 */
1861 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
1862 vm_object_unlock(copy_object);
1863 vm_object_deallocate(copy_object);
1864 vm_object_lock(object);
1865
1866 continue;
1867 }
1868 /*
1869 * Pick back up the old object's
1870 * lock. [It is safe to do so,
1871 * since it must be deeper in the
1872 * object tree.]
1873 */
1874 vm_object_lock(object);
1875 }
1876 /*
1877 * Because we're pushing a page upward
1878 * in the object tree, we must restart
1879 * any faults that are waiting here.
1880 * [Note that this is an expansion of
1881 * PAGE_WAKEUP that uses the THREAD_RESTART
1882 * wait result]. Can't turn off the page's
1883 * busy bit because we're not done with it.
1884 */
1885 if (m->wanted) {
1886 m->wanted = FALSE;
1887 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
1888 }
1889 }
1890 /*
1891 * The reference count on copy_object must be
1892 * at least 2: one for our extra reference,
1893 * and at least one from the outside world
1894 * (we checked that when we last locked
1895 * copy_object).
1896 */
1897 vm_object_lock_assert_exclusive(copy_object);
1898 copy_object->ref_count--;
1899 assert(copy_object->ref_count > 0);
1900
1901 VM_OBJ_RES_DECR(copy_object);
1902 vm_object_unlock(copy_object);
1903
1904 break;
1905 }
1906 *result_page = m;
1907 *top_page = first_m;
1908
1909 XPR(XPR_VM_FAULT,
1910 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
1911 (integer_t)object, offset, (integer_t)m, (integer_t)first_m, 0);
1912
1913 if (m != VM_PAGE_NULL) {
1914 if (my_fault == DBG_PAGEIN_FAULT) {
1915
1916 VM_STAT_INCR(pageins);
1917 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
1918 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1919 current_task()->pageins++;
1920
1921 if (m->object->internal) {
1922 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
1923 } else {
1924 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
1925 }
1926
1927 /*
1928 * evaluate access pattern and update state
1929 * vm_fault_deactivate_behind depends on the
1930 * state being up to date
1931 */
1932 vm_fault_is_sequential(object, offset, fault_info->behavior);
1933
1934 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
1935 }
1936 if (type_of_fault)
1937 *type_of_fault = my_fault;
1938 } else
1939 vm_object_unlock(object);
1940
1941 thread_interrupt_level(interruptible_state);
1942
1943 #if TRACEFAULTPAGE
1944 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
1945 #endif
1946 return (VM_FAULT_SUCCESS);
1947
1948 backoff:
1949 thread_interrupt_level(interruptible_state);
1950
1951 if (wait_result == THREAD_INTERRUPTED)
1952 return (VM_FAULT_INTERRUPTED);
1953 return (VM_FAULT_RETRY);
1954
1955 #undef RELEASE_PAGE
1956 }
1957
1958
1959
1960 /*
1961 * page queue lock must NOT be held
1962 * m->object must be locked
1963 *
1964 * NOTE: m->object could be locked "shared" only if we are called
1965 * from vm_fault() as part of a soft fault. If so, we must be
1966 * careful not to modify the VM object in any way that is not
1967 * legal under a shared lock...
1968 */
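/*
 * [Editor's sketch] A minimal illustration of the calling convention
 * stated above, assuming a caller that already holds the page's VM
 * object lock (shared is enough for a read-only soft fault) and does
 * NOT hold the page queue lock.  The names "obj", "page", "target_pmap"
 * and "fault_va" are illustrative only; they are not part of this file.
 */
#if 0	/* illustrative only -- not compiled */
	int		spot_fault = DBG_CACHE_HIT_FAULT;
	kern_return_t	enter_kr;

	vm_object_lock_shared(obj);		/* exclusive if VM_PROT_WRITE */
	/* ... locate "page" in obj and make sure it is not busy ... */
	enter_kr = vm_fault_enter(page,
				  target_pmap,
				  fault_va,
				  VM_PROT_READ,	/* prot */
				  FALSE,	/* wired */
				  FALSE,	/* change_wiring */
				  FALSE,	/* no_cache */
				  &spot_fault);
	vm_object_unlock(obj);
#endif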
1969 unsigned long cs_enter_tainted_rejected = 0;
1970 unsigned long cs_enter_tainted_accepted = 0;
1971 kern_return_t
1972 vm_fault_enter(vm_page_t m,
1973 pmap_t pmap,
1974 vm_map_offset_t vaddr,
1975 vm_prot_t prot,
1976 boolean_t wired,
1977 boolean_t change_wiring,
1978 boolean_t no_cache,
1979 int *type_of_fault)
1980 {
1981 unsigned int cache_attr;
1982 kern_return_t kr;
1983 boolean_t previously_pmapped = m->pmapped;
1984
1985 vm_object_lock_assert_held(m->object);
1986 #if DEBUG
1987 mutex_assert(&vm_page_queue_lock, MA_NOTOWNED);
1988 #endif /* DEBUG */
1989
1990 if (m->phys_page == vm_page_guard_addr) {
1991 assert(m->fictitious);
1992 return KERN_SUCCESS;
1993 }
1994
1995 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
1996
1997 if (m->object->code_signed && pmap != kernel_pmap &&
1998 (!m->cs_validated || m->wpmapped)) {
1999 vm_object_lock_assert_exclusive(m->object);
2000
2001 if (m->cs_validated && m->wpmapped) {
2002 vm_cs_revalidates++;
2003 }
2004
2005 /*
2006 * CODE SIGNING:
2007 * This page comes from a VM object backed by a signed
2008 * memory object. We are about to enter it into a process
2009 * address space, so we need to validate its signature.
2010 */
2011 /* VM map is locked, so 1 ref will remain on VM object */
2012 vm_page_validate_cs(m);
2013 }
2014
2015 if (m->pmapped == FALSE) {
2016 /*
2017 * This is the first time this page is being
2018 * mapped in an address space (pmapped == FALSE).
2019 *
2020 * Part of that page may still be in the data cache
2021 * and not flushed to memory. In case we end up
2022 * accessing that page via the instruction cache,
2023 * we need to ensure that the 2 caches are in sync.
2024 */
2025 pmap_sync_page_data_phys(m->phys_page);
2026
2027 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2028 /*
2029 * found it in the cache, but this
2030 * is the first fault-in of the page (m->pmapped == FALSE)
2031 * so it must have come in as part of
2032 * a cluster... account 1 pagein against it
2033 */
2034 VM_STAT_INCR(pageins);
2035 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2036
2037 if (m->object->internal) {
2038 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2039 } else {
2040 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2041 }
2042
2043 current_task()->pageins++;
2044
2045 *type_of_fault = DBG_PAGEIN_FAULT;
2046 }
2047 VM_PAGE_CONSUME_CLUSTERED(m);
2048
2049 } else if (cache_attr != VM_WIMG_DEFAULT)
2050 pmap_sync_page_attributes_phys(m->phys_page);
2051
2052 if (*type_of_fault != DBG_COW_FAULT) {
2053 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2054
2055 if (pmap == kernel_pmap) {
2056 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2057 }
2058 }
2059
2060 if (m->cs_tainted) {
2061 /*
2062 * CODE SIGNING:
2063 * This page has been tainted and can not be trusted.
2064 * Let's notify the current process and let it take any
2065 * necessary precautions before we enter the tainted page
2066 * into its address space.
2067 */
2068 if (cs_invalid_page()) {
2069 /* reject the tainted page: abort the page fault */
2070 kr = KERN_MEMORY_ERROR;
2071 cs_enter_tainted_rejected++;
2072 } else {
2073 /* proceed with the tainted page */
2074 kr = KERN_SUCCESS;
2075 cs_enter_tainted_accepted++;
2076 }
2077 if (cs_debug || kr != KERN_SUCCESS) {
2078 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2079 "page %p obj %p off 0x%llx *** TAINTED ***\n",
2080 (long long)vaddr, m, m->object, m->offset);
2081 }
2082 } else {
2083 /* proceed with the valid page */
2084 kr = KERN_SUCCESS;
2085 }
2086
2087 if (kr == KERN_SUCCESS) {
2088 /*
2089 * NOTE: we may only hold the vm_object lock SHARED
2090 * at this point, but the update of pmapped is ok
2091 * since this is the ONLY bit updated behind the SHARED
2092 * lock... however, we need to figure out how to do an atomic
2093 * update on a bit field to make this less fragile... right
2094 * now I don't know how to coerce 'C' to give me the offset info
2095 * that's needed for an AtomicCompareAndSwap
2096 */
2097 m->pmapped = TRUE;
2098 if (prot & VM_PROT_WRITE) {
2099 vm_object_lock_assert_exclusive(m->object);
2100 m->wpmapped = TRUE;
2101 }
2102
2103 PMAP_ENTER(pmap, vaddr, m, prot, cache_attr, wired);
2104 }
2105
2106 /*
2107 * Hold queues lock to manipulate
2108 * the page queues. Change wiring
2109 * case is obvious.
2110 */
2111 if (change_wiring) {
2112 vm_page_lockspin_queues();
2113
2114 if (wired) {
2115 if (kr == KERN_SUCCESS) {
2116 vm_page_wire(m);
2117 }
2118 } else {
2119 vm_page_unwire(m);
2120 }
2121 vm_page_unlock_queues();
2122
2123 } else {
2124 if (kr != KERN_SUCCESS) {
2125 vm_page_lock_queues();
2126 vm_page_deactivate(m);
2127 vm_page_unlock_queues();
2128 } else {
2129 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count && !m->throttled) {
2130 vm_page_lockspin_queues();
2131 /*
2132 * test again now that we hold the page queue lock
2133 */
2134 if (((!m->active && !m->inactive) || no_cache) && !m->wire_count) {
2135
2136 /*
2137 * If this is a no_cache mapping and the page has never been
2138 * mapped before or was previously a no_cache page, then we
2139 * want to leave pages in the speculative state so that they
2140 * can be readily recycled if free memory runs low. Otherwise
2141 * the page is activated as normal.
2142 */
2143
2144 if (no_cache && (!previously_pmapped || m->no_cache)) {
2145 m->no_cache = TRUE;
2146
2147 if (m->active || m->inactive)
2148 VM_PAGE_QUEUES_REMOVE(m);
2149
2150 if (!m->speculative)
2151 vm_page_speculate(m, TRUE);
2152
2153 } else if (!m->active && !m->inactive)
2154 vm_page_activate(m);
2155
2156 }
2157
2158 vm_page_unlock_queues();
2159 }
2160 }
2161 }
2162 return kr;
2163 }
2164
2165
2166 /*
2167 * Routine: vm_fault
2168 * Purpose:
2169 * Handle page faults, including pseudo-faults
2170 * used to change the wiring status of pages.
2171 * Returns:
2172 * Explicit continuations have been removed.
2173 * Implementation:
2174 * vm_fault and vm_fault_page save mucho state
2175 * in the moral equivalent of a closure. The state
2176 * structure is allocated when first entering vm_fault
2177 * and deallocated when leaving vm_fault.
2178 */
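/*
 * [Editor's sketch] A hedged example of how a machine-level trap
 * handler might drive vm_fault() for an ordinary (non-wiring) read
 * fault.  "fault_map" and "fault_addr" are hypothetical names,
 * vm_map_trunc_page() is assumed to be the usual page-truncation
 * macro, and passing a NULL caller_pmap means the pmap is taken from
 * the map that the lookup inside vm_fault() resolves.
 */
#if 0	/* illustrative only -- not compiled */
	kern_return_t	fault_kr;

	fault_kr = vm_fault(fault_map,
			    vm_map_trunc_page(fault_addr),	/* page-aligned address */
			    VM_PROT_READ,			/* access that faulted */
			    FALSE,				/* change_wiring */
			    THREAD_ABORTSAFE,			/* interruptible */
			    NULL,				/* caller_pmap */
			    0);					/* caller_pmap_addr */
	if (fault_kr != KERN_SUCCESS) {
		/* raise an exception for fault_addr */
	}
#endif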
2179
2180 extern int _map_enter_debug;
2181
2182 unsigned long vm_fault_collapse_total = 0;
2183 unsigned long vm_fault_collapse_skipped = 0;
2184
2185 kern_return_t
2186 vm_fault(
2187 vm_map_t map,
2188 vm_map_offset_t vaddr,
2189 vm_prot_t fault_type,
2190 boolean_t change_wiring,
2191 int interruptible,
2192 pmap_t caller_pmap,
2193 vm_map_offset_t caller_pmap_addr)
2194 {
2195 vm_map_version_t version; /* Map version for verification */
2196 boolean_t wired; /* Should mapping be wired down? */
2197 vm_object_t object; /* Top-level object */
2198 vm_object_offset_t offset; /* Top-level offset */
2199 vm_prot_t prot; /* Protection for mapping */
2200 vm_object_t old_copy_object; /* Saved copy object */
2201 vm_page_t result_page; /* Result of vm_fault_page */
2202 vm_page_t top_page; /* Placeholder page */
2203 kern_return_t kr;
2204
2205 vm_page_t m; /* Fast access to result_page */
2206 kern_return_t error_code;
2207 vm_object_t cur_object;
2208 vm_object_offset_t cur_offset;
2209 vm_page_t cur_m;
2210 vm_object_t new_object;
2211 int type_of_fault;
2212 pmap_t pmap;
2213 boolean_t interruptible_state;
2214 vm_map_t real_map = map;
2215 vm_map_t original_map = map;
2216 vm_prot_t original_fault_type;
2217 struct vm_object_fault_info fault_info;
2218 boolean_t need_collapse = FALSE;
2219 int object_lock_type = 0;
2220 int cur_object_lock_type;
2221
2222
2223 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2224 (int)((uint64_t)vaddr >> 32),
2225 (int)vaddr,
2226 0,
2227 0,
2228 0);
2229
2230 if (get_preemption_level() != 0) {
2231 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2232 (int)((uint64_t)vaddr >> 32),
2233 (int)vaddr,
2234 KERN_FAILURE,
2235 0,
2236 0);
2237
2238 return (KERN_FAILURE);
2239 }
2240 interruptible_state = thread_interrupt_level(interruptible);
2241
2242 VM_STAT_INCR(faults);
2243 current_task()->faults++;
2244 original_fault_type = fault_type;
2245
2246 if (fault_type & VM_PROT_WRITE)
2247 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2248 else
2249 object_lock_type = OBJECT_LOCK_SHARED;
2250
2251 cur_object_lock_type = OBJECT_LOCK_SHARED;
2252
2253 RetryFault:
2254 /*
2255 * assume we will hit a page in the cache;
2256 * otherwise, explicitly override with
2257 * the real fault type once we determine it
2258 */
2259 type_of_fault = DBG_CACHE_HIT_FAULT;
2260
2261 /*
2262 * Find the backing store object and offset into
2263 * it to begin the search.
2264 */
2265 fault_type = original_fault_type;
2266 map = original_map;
2267 vm_map_lock_read(map);
2268
2269 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2270 object_lock_type, &version,
2271 &object, &offset, &prot, &wired,
2272 &fault_info,
2273 &real_map);
2274
2275 if (kr != KERN_SUCCESS) {
2276 vm_map_unlock_read(map);
2277 goto done;
2278 }
2279 pmap = real_map->pmap;
2280 fault_info.interruptible = interruptible;
2281
2282 /*
2283 * If the page is wired, we must fault for the current protection
2284 * value, to avoid further faults.
2285 */
2286 if (wired) {
2287 fault_type = prot | VM_PROT_WRITE;
2288 /*
2289 * since we're treating this fault as a 'write'
2290 * we must hold the top object lock exclusively
2291 */
2292 if (object_lock_type == OBJECT_LOCK_SHARED) {
2293
2294 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2295
2296 if (vm_object_lock_upgrade(object) == FALSE) {
2297 /*
2298 * couldn't upgrade, so explicitly
2299 * take the lock exclusively
2300 */
2301 vm_object_lock(object);
2302 }
2303 }
2304 }
2305
2306 #if VM_FAULT_CLASSIFY
2307 /*
2308 * Temporary data gathering code
2309 */
2310 vm_fault_classify(object, offset, fault_type);
2311 #endif
2312 /*
2313 * Fast fault code. The basic idea is to do as much as
2314 * possible while holding the map lock and object locks.
2315 * Busy pages are not used until the object lock has to
2316 * be dropped to do something (copy, zero fill, pmap enter).
2317 * Similarly, paging references aren't acquired until that
2318 * point, and object references aren't used.
2319 *
2320 * If we can figure out what to do
2321 * (zero fill, copy on write, pmap enter) while holding
2322 * the locks, then it gets done. Otherwise, we give up,
2323 * and use the original fault path (which doesn't hold
2324 * the map lock, and relies on busy pages).
2325 * The give up cases include:
2326 * - Have to talk to pager.
2327 * - Page is busy, absent or in error.
2328 * - Pager has locked out desired access.
2329 * - Fault needs to be restarted.
2330 * - Have to push page into copy object.
2331 *
2332 * The code is an infinite loop that moves one level down
2333 * the shadow chain each time. cur_object and cur_offset
2334 * refer to the current object being examined. object and offset
2335 * are the original object from the map. The loop is at the
2336 * top level if and only if object and cur_object are the same.
2337 *
2338 * Invariants: Map lock is held throughout. Lock is held on
2339 * original object and cur_object (if different) when
2340 * continuing or exiting loop.
2341 *
2342 */
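/*
 * [Editor's sketch] The shadow-chain walk described above, condensed.
 * Lock upgrades, the copy-on-write push and the pager checks are
 * reduced to comments; this only shows the shape of the loop that
 * follows, not its full behavior.
 */
#if 0	/* illustrative only -- not compiled */
	cur_object = object;
	cur_offset = offset;

	for (;;) {
		m = vm_page_lookup(cur_object, cur_offset);

		if (m != VM_PAGE_NULL) {
			/*
			 * busy / guard / unusual / encrypted pages, and
			 * copy pushes at the top level, bail to the slow
			 * path; otherwise the page is entered via
			 * FastPmapEnter (copying it up into "object"
			 * first if this is a write below the top level).
			 */
			break;
		}
		if (cur_object->shadow == VM_OBJECT_NULL) {
			/*
			 * bottom of the chain: zero-fill into the top
			 * object, unless the pager must be consulted,
			 * in which case the slow path takes over.
			 */
			break;
		}
		/*
		 * descend one level down the shadow chain
		 * (if cur_object has a pager that may hold the page,
		 * the real code breaks to the slow path before this)
		 */
		cur_offset += cur_object->shadow_offset;
		cur_object = cur_object->shadow;
	}
#endif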
2343
2344
2345 /*
2346 * If this page is to be inserted in a copy delay object
2347 * for writing, and if the object has a copy, then the
2348 * copy delay strategy is implemented in the slow fault path.
2349 */
2350 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2351 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2352 goto handle_copy_delay;
2353
2354 cur_object = object;
2355 cur_offset = offset;
2356
2357 while (TRUE) {
2358 m = vm_page_lookup(cur_object, cur_offset);
2359
2360 if (m != VM_PAGE_NULL) {
2361 if (m->busy) {
2362 wait_result_t result;
2363
2364 /*
2365 * in order to do the PAGE_ASSERT_WAIT, we must
2366 * have object that 'm' belongs to locked exclusively
2367 */
2368 if (object != cur_object) {
2369 vm_object_unlock(object);
2370
2371 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2372
2373 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2374
2375 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2376 /*
2377 * couldn't upgrade so go do a full retry
2378 * immediately since we've already dropped
2379 * the top object lock associated with this page
2380 * and the current one got dropped due to the
2381 * failed upgrade... the state is no longer valid
2382 */
2383 vm_map_unlock_read(map);
2384 if (real_map != map)
2385 vm_map_unlock(real_map);
2386
2387 goto RetryFault;
2388 }
2389 }
2390 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2391
2392 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2393
2394 if (vm_object_lock_upgrade(object) == FALSE) {
2395 /*
2396 * couldn't upgrade, so explicitly take the lock
2397 * exclusively and go relookup the page since we
2398 * will have dropped the object lock and
2399 * a different thread could have inserted
2400 * a page at this offset
2401 * no need for a full retry since we're
2402 * at the top level of the object chain
2403 */
2404 vm_object_lock(object);
2405
2406 continue;
2407 }
2408 }
2409 vm_map_unlock_read(map);
2410 if (real_map != map)
2411 vm_map_unlock(real_map);
2412
2413 result = PAGE_ASSERT_WAIT(m, interruptible);
2414
2415 vm_object_unlock(cur_object);
2416
2417 if (result == THREAD_WAITING) {
2418 result = thread_block(THREAD_CONTINUE_NULL);
2419
2420 counter(c_vm_fault_page_block_busy_kernel++);
2421 }
2422 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2423 goto RetryFault;
2424
2425 kr = KERN_ABORTED;
2426 goto done;
2427 }
2428 if (m->phys_page == vm_page_guard_addr) {
2429 /*
2430 * Guard page: let the slow path deal with it
2431 */
2432 break;
2433 }
2434 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2435 /*
2436 * Unusual case... let the slow path deal with it
2437 */
2438 break;
2439 }
2440 if (m->encrypted) {
2441 /*
2442 * ENCRYPTED SWAP:
2443 * We've soft-faulted (because it's not in the page
2444 * table) on an encrypted page.
2445 * Keep the page "busy" so that no one messes with
2446 * it during the decryption.
2447 * Release the extra locks we're holding, keep only
2448 * the page's VM object lock.
2449 *
2450 * in order to set 'busy' on 'm', we must
2451 * have object that 'm' belongs to locked exclusively
2452 */
2453 if (object != cur_object) {
2454 vm_object_unlock(object);
2455
2456 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2457
2458 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2459
2460 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2461 /*
2462 * couldn't upgrade so go do a full retry
2463 * immediately since we've already dropped
2464 * the top object lock associated with this page
2465 * and the current one got dropped due to the
2466 * failed upgrade... the state is no longer valid
2467 */
2468 vm_map_unlock_read(map);
2469 if (real_map != map)
2470 vm_map_unlock(real_map);
2471
2472 goto RetryFault;
2473 }
2474 }
2475 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2476
2477 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2478
2479 if (vm_object_lock_upgrade(object) == FALSE) {
2480 /*
2481 * couldn't upgrade, so explicitly take the lock
2482 * exclusively and go relookup the page since we
2483 * will have dropped the object lock and
2484 * a different thread could have inserted
2485 * a page at this offset
2486 * no need for a full retry since we're
2487 * at the top level of the object chain
2488 */
2489 vm_object_lock(object);
2490
2491 continue;
2492 }
2493 }
2494 m->busy = TRUE;
2495
2496 vm_map_unlock_read(map);
2497 if (real_map != map)
2498 vm_map_unlock(real_map);
2499
2500 vm_page_decrypt(m, 0);
2501
2502 assert(m->busy);
2503 PAGE_WAKEUP_DONE(m);
2504
2505 vm_object_unlock(cur_object);
2506 /*
2507 * Retry from the top, in case anything
2508 * changed while we were decrypting...
2509 */
2510 goto RetryFault;
2511 }
2512 ASSERT_PAGE_DECRYPTED(m);
2513
2514 if (m->object->code_signed && map != kernel_map &&
2515 (!m->cs_validated || m->wpmapped)) {
2516 /*
2517 * We might need to validate this page
2518 * against its code signature, so we
2519 * want to hold the VM object exclusively.
2520 */
2521 if (object != cur_object) {
2522 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2523 vm_object_unlock(object);
2524 vm_object_unlock(cur_object);
2525
2526 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2527
2528 vm_map_unlock_read(map);
2529 if (real_map != map)
2530 vm_map_unlock(real_map);
2531
2532 goto RetryFault;
2533 }
2534
2535 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2536
2537 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2538
2539 if (vm_object_lock_upgrade(object) == FALSE) {
2540 /*
2541 * couldn't upgrade, so explicitly take the lock
2542 * exclusively and go relookup the page since we
2543 * will have dropped the object lock and
2544 * a different thread could have inserted
2545 * a page at this offset
2546 * no need for a full retry since we're
2547 * at the top level of the object chain
2548 */
2549 vm_object_lock(object);
2550
2551 continue;
2552 }
2553 }
2554 }
2555 /*
2556 * Two cases of map in faults:
2557 * - At top level w/o copy object.
2558 * - Read fault anywhere.
2559 * --> must disallow write.
2560 */
2561
2562 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
2563 if ((fault_type & VM_PROT_WRITE) == 0) {
2564 /*
2565 * This is not a "write" fault, so we
2566 * might not have taken the object lock
2567 * exclusively and we might not be able
2568 * to update the "wpmapped" bit in
2569 * vm_fault_enter().
2570 * Let's just grant read access to
2571 * the page for now and we'll
2572 * soft-fault again if we need write
2573 * access later...
2574 */
2575 prot &= ~VM_PROT_WRITE;
2576 }
2577 goto FastPmapEnter;
2578 }
2579
2580 if ((fault_type & VM_PROT_WRITE) == 0) {
2581
2582 prot &= ~VM_PROT_WRITE;
2583
2584 /*
2585 * Set up to map the page...
2586 * mark the page busy, drop
2587 * unneeded object lock
2588 */
2589 if (object != cur_object) {
2590 /*
2591 * don't need the original object anymore
2592 */
2593 vm_object_unlock(object);
2594
2595 /*
2596 * switch to the object that has the new page
2597 */
2598 object = cur_object;
2599 object_lock_type = cur_object_lock_type;
2600 }
2601 FastPmapEnter:
2602 /*
2603 * prepare for the pmap_enter...
2604 * object and map are both locked
2605 * m contains valid data
2606 * object == m->object
2607 * cur_object == NULL or it's been unlocked
2608 * no paging references on either object or cur_object
2609 */
2610 #if MACH_KDB
2611 if (db_watchpoint_list && (fault_type & VM_PROT_WRITE) == 0)
2612 prot &= ~VM_PROT_WRITE;
2613 #endif
2614 if (caller_pmap) {
2615 kr = vm_fault_enter(m,
2616 caller_pmap,
2617 caller_pmap_addr,
2618 prot,
2619 wired,
2620 change_wiring,
2621 fault_info.no_cache,
2622 &type_of_fault);
2623 } else {
2624 kr = vm_fault_enter(m,
2625 pmap,
2626 vaddr,
2627 prot,
2628 wired,
2629 change_wiring,
2630 fault_info.no_cache,
2631 &type_of_fault);
2632 }
2633
2634 if (need_collapse == TRUE)
2635 vm_object_collapse(object, offset, TRUE);
2636
2637 if (type_of_fault == DBG_PAGEIN_FAULT) {
2638 /*
2639 * evaluate access pattern and update state
2640 * vm_fault_deactivate_behind depends on the
2641 * state being up to date
2642 */
2643 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
2644
2645 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
2646 }
2647 /*
2648 * That's it, clean up and return.
2649 */
2650 if (m->busy)
2651 PAGE_WAKEUP_DONE(m);
2652
2653 vm_object_unlock(object);
2654
2655 vm_map_unlock_read(map);
2656 if (real_map != map)
2657 vm_map_unlock(real_map);
2658
2659 goto done;
2660 }
2661 /*
2662 * COPY ON WRITE FAULT
2663 *
2664 * If objects match, then
2665 * object->copy must not be NULL (else control
2666 * would be in previous code block), and we
2667 * have a potential push into the copy object
2668 * which we can't cope with here.
2669 */
2670 if (cur_object == object) {
2671 /*
2672 * must take the slow path to
2673 * deal with the copy push
2674 */
2675 break;
2676 }
2677 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
2678
2679 /*
2680 * This is now a shadow based copy on write
2681 * fault -- it requires a copy up the shadow
2682 * chain.
2683 *
2684 * Allocate a page in the original top level
2685 * object. Give up if allocate fails. Also
2686 * need to remember current page, as it's the
2687 * source of the copy.
2688 *
2689 * at this point we hold locks on both
2690 * object and cur_object... no need to take
2691 * paging refs or mark pages BUSY since
2692 * we don't drop either object lock until
2693 * the page has been copied and inserted
2694 */
2695 cur_m = m;
2696 m = vm_page_grab();
2697
2698 if (m == VM_PAGE_NULL) {
2699 /*
2700 * no free page currently available...
2701 * must take the slow path
2702 */
2703 break;
2704 }
2705 /*
2706 * Now do the copy. Mark the source page busy...
2707 *
2708 * NOTE: This code holds the map lock across
2709 * the page copy.
2710 */
2711 vm_page_copy(cur_m, m);
2712 vm_page_insert(m, object, offset);
2713 m->dirty = TRUE;
2714
2715 /*
2716 * Now cope with the source page and object
2717 */
2718 if (object->ref_count > 1 && cur_m->pmapped)
2719 pmap_disconnect(cur_m->phys_page);
2720
2721 need_collapse = TRUE;
2722
2723 if (!cur_object->internal &&
2724 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
2725 /*
2726 * The object from which we've just
2727 * copied a page is most probably backed
2728 * by a vnode. We don't want to waste too
2729 * much time trying to collapse the VM objects
2730 * and create a bottleneck when several tasks
2731 * map the same file.
2732 */
2733 if (cur_object->copy == object) {
2734 /*
2735 * Shared mapping or no COW yet.
2736 * We can never collapse a copy
2737 * object into its backing object.
2738 */
2739 need_collapse = FALSE;
2740 } else if (cur_object->copy == object->shadow &&
2741 object->shadow->resident_page_count == 0) {
2742 /*
2743 * Shared mapping after a COW occurred.
2744 */
2745 need_collapse = FALSE;
2746 }
2747 }
2748 vm_object_unlock(cur_object);
2749
2750 if (need_collapse == FALSE)
2751 vm_fault_collapse_skipped++;
2752 vm_fault_collapse_total++;
2753
2754 type_of_fault = DBG_COW_FAULT;
2755 VM_STAT_INCR(cow_faults);
2756 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2757 current_task()->cow_faults++;
2758
2759 goto FastPmapEnter;
2760
2761 } else {
2762 /*
2763 * No page at cur_object, cur_offset... m == NULL
2764 */
2765 if (cur_object->pager_created) {
2766 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
2767 /*
2768 * May have to talk to a pager...
2769 * take the slow path.
2770 */
2771 break;
2772 }
2773 /*
2774 * existence map present and indicates
2775 * that the pager doesn't have this page
2776 */
2777 }
2778 if (cur_object->shadow == VM_OBJECT_NULL) {
2779 /*
2780 * Zero fill fault. Page gets
2781 * inserted into the original object.
2782 */
2783 if (cur_object->shadow_severed) {
2784
2785 if (object != cur_object)
2786 vm_object_unlock(cur_object);
2787 vm_object_unlock(object);
2788
2789 vm_map_unlock_read(map);
2790 if (real_map != map)
2791 vm_map_unlock(real_map);
2792
2793 kr = KERN_MEMORY_ERROR;
2794 goto done;
2795 }
2796 if (VM_PAGE_ZFILL_THROTTLED()) {
2797 /*
2798 * drop all of our locks...
2799 * wait until the free queue is
2800 * pumped back up and then
2801 * redrive the fault
2802 */
2803 if (object != cur_object)
2804 vm_object_unlock(cur_object);
2805 vm_object_unlock(object);
2806 vm_map_unlock_read(map);
2807 if (real_map != map)
2808 vm_map_unlock(real_map);
2809
2810 if (vm_page_wait((change_wiring) ?
2811 THREAD_UNINT :
2812 THREAD_ABORTSAFE))
2813 goto RetryFault;
2814
2815 kr = KERN_ABORTED;
2816 goto done;
2817 }
2818 if (vm_backing_store_low) {
2819 /*
2820 * we are protecting the system from
2821 * backing store exhaustion...
2822 * must take the slow path if we're
2823 * not privileged
2824 */
2825 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
2826 break;
2827 }
2828 if (cur_object != object) {
2829 vm_object_unlock(cur_object);
2830
2831 cur_object = object;
2832 }
2833 if (object_lock_type == OBJECT_LOCK_SHARED) {
2834
2835 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2836
2837 if (vm_object_lock_upgrade(object) == FALSE) {
2838 /*
2839 * couldn't upgrade so do a full retry on the fault
2840 * since we dropped the object lock which
2841 * could allow another thread to insert
2842 * a page at this offset
2843 */
2844 vm_map_unlock_read(map);
2845 if (real_map != map)
2846 vm_map_unlock(real_map);
2847
2848 goto RetryFault;
2849 }
2850 }
2851 m = vm_page_alloc(object, offset);
2852
2853 if (m == VM_PAGE_NULL) {
2854 /*
2855 * no free page currently available...
2856 * must take the slow path
2857 */
2858 break;
2859 }
2860
2861 /*
2862 * Now zero fill page...
2863 * the page is probably going to
2864 * be written soon, so don't bother
2865 * to clear the modified bit
2866 *
2867 * NOTE: This code holds the map
2868 * lock across the zero fill.
2869 */
2870 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
2871
2872 goto FastPmapEnter;
2873 }
2874 /*
2875 * On to the next level in the shadow chain
2876 */
2877 cur_offset += cur_object->shadow_offset;
2878 new_object = cur_object->shadow;
2879
2880 /*
2881 * take the new_object's lock with the indicated state
2882 */
2883 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
2884 vm_object_lock_shared(new_object);
2885 else
2886 vm_object_lock(new_object);
2887
2888 if (cur_object != object)
2889 vm_object_unlock(cur_object);
2890
2891 cur_object = new_object;
2892
2893 continue;
2894 }
2895 }
2896 /*
2897 * Cleanup from fast fault failure. Drop any object
2898 * lock other than original and drop map lock.
2899 */
2900 if (object != cur_object)
2901 vm_object_unlock(cur_object);
2902
2903 /*
2904 * must own the object lock exclusively at this point
2905 */
2906 if (object_lock_type == OBJECT_LOCK_SHARED) {
2907 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2908
2909 if (vm_object_lock_upgrade(object) == FALSE) {
2910 /*
2911 * couldn't upgrade, so explicitly
2912 * take the lock exclusively
2913 * no need to retry the fault at this
2914 * point since "vm_fault_page" will
2915 * completely re-evaluate the state
2916 */
2917 vm_object_lock(object);
2918 }
2919 }
2920
2921 handle_copy_delay:
2922 vm_map_unlock_read(map);
2923 if (real_map != map)
2924 vm_map_unlock(real_map);
2925
2926 /*
2927 * Make a reference to this object to
2928 * prevent its disposal while we are messing with
2929 * it. Once we have the reference, the map is free
2930 * to be diddled. Since objects reference their
2931 * shadows (and copies), they will stay around as well.
2932 */
2933 vm_object_reference_locked(object);
2934 vm_object_paging_begin(object);
2935
2936 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
2937
2938 error_code = 0;
2939
2940 kr = vm_fault_page(object, offset, fault_type,
2941 (change_wiring && !wired),
2942 &prot, &result_page, &top_page,
2943 &type_of_fault,
2944 &error_code, map->no_zero_fill,
2945 FALSE, &fault_info);
2946
2947 /*
2948 * if kr != VM_FAULT_SUCCESS, then the paging reference
2949 * has been dropped and the object unlocked... the ref_count
2950 * is still held
2951 *
2952 * if kr == VM_FAULT_SUCCESS, then the paging reference
2953 * is still held along with the ref_count on the original object
2954 *
2955 * if m != NULL, then the object it belongs to
2956 * is returned locked with a paging reference
2957 *
2958 * if top_page != NULL, then it's BUSY and the
2959 * object it belongs to has a paging reference
2960 * but is returned unlocked
2961 */
2962 if (kr != VM_FAULT_SUCCESS) {
2963 /*
2964 * we didn't succeed, lose the object reference immediately.
2965 */
2966 vm_object_deallocate(object);
2967
2968 /*
2969 * See why we failed, and take corrective action.
2970 */
2971 switch (kr) {
2972 case VM_FAULT_MEMORY_SHORTAGE:
2973 if (vm_page_wait((change_wiring) ?
2974 THREAD_UNINT :
2975 THREAD_ABORTSAFE))
2976 goto RetryFault;
2977 /*
2978 * fall thru
2979 */
2980 case VM_FAULT_INTERRUPTED:
2981 kr = KERN_ABORTED;
2982 goto done;
2983 case VM_FAULT_RETRY:
2984 goto RetryFault;
2985 case VM_FAULT_MEMORY_ERROR:
2986 if (error_code)
2987 kr = error_code;
2988 else
2989 kr = KERN_MEMORY_ERROR;
2990 goto done;
2991 }
2992 }
2993 m = result_page;
2994
2995 if (m != VM_PAGE_NULL) {
2996 assert((change_wiring && !wired) ?
2997 (top_page == VM_PAGE_NULL) :
2998 ((top_page == VM_PAGE_NULL) == (m->object == object)));
2999 }
3000
3001 /*
3002 * What to do with the resulting page from vm_fault_page
3003 * if it doesn't get entered into the physical map:
3004 */
3005 #define RELEASE_PAGE(m) \
3006 MACRO_BEGIN \
3007 PAGE_WAKEUP_DONE(m); \
3008 vm_page_lockspin_queues(); \
3009 if (!m->active && !m->inactive && !m->throttled)\
3010 vm_page_activate(m); \
3011 vm_page_unlock_queues(); \
3012 MACRO_END
3013
3014 /*
3015 * We must verify that the maps have not changed
3016 * since our last lookup.
3017 */
3018 if (m != VM_PAGE_NULL) {
3019 old_copy_object = m->object->copy;
3020 vm_object_unlock(m->object);
3021 } else
3022 old_copy_object = VM_OBJECT_NULL;
3023
3024 /*
3025 * no object locks are held at this point
3026 */
3027 if ((map != original_map) || !vm_map_verify(map, &version)) {
3028 vm_object_t retry_object;
3029 vm_object_offset_t retry_offset;
3030 vm_prot_t retry_prot;
3031
3032 /*
3033 * To avoid trying to write_lock the map while another
3034 * thread has it read_locked (in vm_map_pageable), we
3035 * do not try for write permission. If the page is
3036 * still writable, we will get write permission. If it
3037 * is not, or has been marked needs_copy, we enter the
3038 * mapping without write permission, and will merely
3039 * take another fault.
3040 */
3041 map = original_map;
3042 vm_map_lock_read(map);
3043
3044 kr = vm_map_lookup_locked(&map, vaddr,
3045 fault_type & ~VM_PROT_WRITE,
3046 OBJECT_LOCK_EXCLUSIVE, &version,
3047 &retry_object, &retry_offset, &retry_prot,
3048 &wired,
3049 &fault_info,
3050 &real_map);
3051 pmap = real_map->pmap;
3052
3053 if (kr != KERN_SUCCESS) {
3054 vm_map_unlock_read(map);
3055
3056 if (m != VM_PAGE_NULL) {
3057 /*
3058 * retake the lock so that
3059 * we can drop the paging reference
3060 * in vm_fault_cleanup and do the
3061 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3062 */
3063 vm_object_lock(m->object);
3064
3065 RELEASE_PAGE(m);
3066
3067 vm_fault_cleanup(m->object, top_page);
3068 } else {
3069 /*
3070 * retake the lock so that
3071 * we can drop the paging reference
3072 * in vm_fault_cleanup
3073 */
3074 vm_object_lock(object);
3075
3076 vm_fault_cleanup(object, top_page);
3077 }
3078 vm_object_deallocate(object);
3079
3080 goto done;
3081 }
3082 vm_object_unlock(retry_object);
3083
3084 if ((retry_object != object) || (retry_offset != offset)) {
3085
3086 vm_map_unlock_read(map);
3087 if (real_map != map)
3088 vm_map_unlock(real_map);
3089
3090 if (m != VM_PAGE_NULL) {
3091 /*
3092 * retake the lock so that
3093 * we can drop the paging reference
3094 * in vm_fault_cleanup and do the
3095 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3096 */
3097 vm_object_lock(m->object);
3098
3099 RELEASE_PAGE(m);
3100
3101 vm_fault_cleanup(m->object, top_page);
3102 } else {
3103 /*
3104 * retake the lock so that
3105 * we can drop the paging reference
3106 * in vm_fault_cleanup
3107 */
3108 vm_object_lock(object);
3109
3110 vm_fault_cleanup(object, top_page);
3111 }
3112 vm_object_deallocate(object);
3113
3114 goto RetryFault;
3115 }
3116 /*
3117 * Check whether the protection has changed or the object
3118 * has been copied while we left the map unlocked.
3119 */
3120 prot &= retry_prot;
3121 }
3122 if (m != VM_PAGE_NULL) {
3123 vm_object_lock(m->object);
3124
3125 if (m->object->copy != old_copy_object) {
3126 /*
3127 * The copy object changed while the top-level object
3128 * was unlocked, so take away write permission.
3129 */
3130 prot &= ~VM_PROT_WRITE;
3131 }
3132 } else
3133 vm_object_lock(object);
3134
3135 /*
3136 * If we want to wire down this page, but no longer have
3137 * adequate permissions, we must start all over.
3138 */
3139 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3140
3141 vm_map_verify_done(map, &version);
3142 if (real_map != map)
3143 vm_map_unlock(real_map);
3144
3145 if (m != VM_PAGE_NULL) {
3146 RELEASE_PAGE(m);
3147
3148 vm_fault_cleanup(m->object, top_page);
3149 } else
3150 vm_fault_cleanup(object, top_page);
3151
3152 vm_object_deallocate(object);
3153
3154 goto RetryFault;
3155 }
3156 if (m != VM_PAGE_NULL) {
3157 /*
3158 * Put this page into the physical map.
3159 * We had to do the unlock above because pmap_enter
3160 * may cause other faults. The page may be on
3161 * the pageout queues. If the pageout daemon comes
3162 * across the page, it will remove it from the queues.
3163 */
3164 if (caller_pmap) {
3165 kr = vm_fault_enter(m,
3166 caller_pmap,
3167 caller_pmap_addr,
3168 prot,
3169 wired,
3170 change_wiring,
3171 fault_info.no_cache,
3172 &type_of_fault);
3173 } else {
3174 kr = vm_fault_enter(m,
3175 pmap,
3176 vaddr,
3177 prot,
3178 wired,
3179 change_wiring,
3180 fault_info.no_cache,
3181 &type_of_fault);
3182 }
3183 if (kr != KERN_SUCCESS) {
3184 /* abort this page fault */
3185 vm_map_verify_done(map, &version);
3186 if (real_map != map)
3187 vm_map_unlock(real_map);
3188 PAGE_WAKEUP_DONE(m);
3189 vm_fault_cleanup(m->object, top_page);
3190 vm_object_deallocate(object);
3191 goto done;
3192 }
3193 } else {
3194
3195 vm_map_entry_t entry;
3196 vm_map_offset_t laddr;
3197 vm_map_offset_t ldelta, hdelta;
3198
3199 /*
3200 * do a pmap block mapping from the physical address
3201 * in the object
3202 */
3203
3204 #ifdef ppc
3205 /* While we do not worry about execution protection in */
3206 /* general, certain pages may have instruction execution */
3207 /* disallowed. We will check here, and if not allowed */
3208 /* to execute, we return with a protection failure. */
3209
3210 if ((fault_type & VM_PROT_EXECUTE) &&
3211 (!pmap_eligible_for_execute((ppnum_t)(object->shadow_offset >> 12)))) {
3212
3213 vm_map_verify_done(map, &version);
3214
3215 if (real_map != map)
3216 vm_map_unlock(real_map);
3217
3218 vm_fault_cleanup(object, top_page);
3219 vm_object_deallocate(object);
3220
3221 kr = KERN_PROTECTION_FAILURE;
3222 goto done;
3223 }
3224 #endif /* ppc */
3225
3226 if (real_map != map)
3227 vm_map_unlock(real_map);
3228
3229 if (original_map != map) {
3230 vm_map_unlock_read(map);
3231 vm_map_lock_read(original_map);
3232 map = original_map;
3233 }
3234 real_map = map;
3235
3236 laddr = vaddr;
3237 hdelta = 0xFFFFF000;
3238 ldelta = 0xFFFFF000;
3239
3240 while (vm_map_lookup_entry(map, laddr, &entry)) {
3241 if (ldelta > (laddr - entry->vme_start))
3242 ldelta = laddr - entry->vme_start;
3243 if (hdelta > (entry->vme_end - laddr))
3244 hdelta = entry->vme_end - laddr;
3245 if (entry->is_sub_map) {
3246
3247 laddr = (laddr - entry->vme_start)
3248 + entry->offset;
3249 vm_map_lock_read(entry->object.sub_map);
3250
3251 if (map != real_map)
3252 vm_map_unlock_read(map);
3253 if (entry->use_pmap) {
3254 vm_map_unlock_read(real_map);
3255 real_map = entry->object.sub_map;
3256 }
3257 map = entry->object.sub_map;
3258
3259 } else {
3260 break;
3261 }
3262 }
3263
3264 if (vm_map_lookup_entry(map, laddr, &entry) &&
3265 (entry->object.vm_object != NULL) &&
3266 (entry->object.vm_object == object)) {
3267
3268 if (caller_pmap) {
3269 /*
3270 * Set up a block mapped area
3271 */
3272 pmap_map_block(caller_pmap,
3273 (addr64_t)(caller_pmap_addr - ldelta),
3274 (((vm_map_offset_t) (entry->object.vm_object->shadow_offset)) +
3275 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3276 ((ldelta + hdelta) >> 12), prot,
3277 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3278 } else {
3279 /*
3280 * Set up a block mapped area
3281 */
3282 pmap_map_block(real_map->pmap,
3283 (addr64_t)(vaddr - ldelta),
3284 (((vm_map_offset_t)(entry->object.vm_object->shadow_offset)) +
3285 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12,
3286 ((ldelta + hdelta) >> 12), prot,
3287 (VM_WIMG_MASK & (int)object->wimg_bits), 0);
3288 }
3289 }
3290 }
3291
3292 /*
3293 * Unlock everything, and return
3294 */
3295 vm_map_verify_done(map, &version);
3296 if (real_map != map)
3297 vm_map_unlock(real_map);
3298
3299 if (m != VM_PAGE_NULL) {
3300 PAGE_WAKEUP_DONE(m);
3301
3302 vm_fault_cleanup(m->object, top_page);
3303 } else
3304 vm_fault_cleanup(object, top_page);
3305
3306 vm_object_deallocate(object);
3307
3308 #undef RELEASE_PAGE
3309
3310 kr = KERN_SUCCESS;
3311 done:
3312 thread_interrupt_level(interruptible_state);
3313
3314 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3315 (int)((uint64_t)vaddr >> 32),
3316 (int)vaddr,
3317 kr,
3318 type_of_fault,
3319 0);
3320
3321 return (kr);
3322 }
3323
3324 /*
3325 * vm_fault_wire:
3326 *
3327 * Wire down a range of virtual addresses in a map.
3328 */
3329 kern_return_t
3330 vm_fault_wire(
3331 vm_map_t map,
3332 vm_map_entry_t entry,
3333 pmap_t pmap,
3334 vm_map_offset_t pmap_addr)
3335 {
3336
3337 register vm_map_offset_t va;
3338 register vm_map_offset_t end_addr = entry->vme_end;
3339 register kern_return_t rc;
3340
3341 assert(entry->in_transition);
3342
3343 if ((entry->object.vm_object != NULL) &&
3344 !entry->is_sub_map &&
3345 entry->object.vm_object->phys_contiguous) {
3346 return KERN_SUCCESS;
3347 }
3348
3349 /*
3350 * Inform the physical mapping system that the
3351 * range of addresses may not fault, so that
3352 * page tables and such can be locked down as well.
3353 */
3354
3355 pmap_pageable(pmap, pmap_addr,
3356 pmap_addr + (end_addr - entry->vme_start), FALSE);
3357
3358 /*
3359 * We simulate a fault to get the page and enter it
3360 * in the physical map.
3361 */
3362
3363 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3364 if ((rc = vm_fault_wire_fast(
3365 map, va, entry, pmap,
3366 pmap_addr + (va - entry->vme_start)
3367 )) != KERN_SUCCESS) {
3368 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3369 (pmap == kernel_pmap) ?
3370 THREAD_UNINT : THREAD_ABORTSAFE,
3371 pmap, pmap_addr + (va - entry->vme_start));
3372 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3373 }
3374
3375 if (rc != KERN_SUCCESS) {
3376 struct vm_map_entry tmp_entry = *entry;
3377
3378 /* unwire wired pages */
3379 tmp_entry.vme_end = va;
3380 vm_fault_unwire(map,
3381 &tmp_entry, FALSE, pmap, pmap_addr);
3382
3383 return rc;
3384 }
3385 }
3386 return KERN_SUCCESS;
3387 }
3388
3389 /*
3390 * vm_fault_unwire:
3391 *
3392 * Unwire a range of virtual addresses in a map.
3393 */
3394 void
3395 vm_fault_unwire(
3396 vm_map_t map,
3397 vm_map_entry_t entry,
3398 boolean_t deallocate,
3399 pmap_t pmap,
3400 vm_map_offset_t pmap_addr)
3401 {
3402 register vm_map_offset_t va;
3403 register vm_map_offset_t end_addr = entry->vme_end;
3404 vm_object_t object;
3405 struct vm_object_fault_info fault_info;
3406
3407 object = (entry->is_sub_map)
3408 ? VM_OBJECT_NULL : entry->object.vm_object;
3409
3410 /*
3411 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
3412 * do anything since such memory is wired by default. So we don't have
3413 * anything to undo here.
3414 */
3415
3416 if (object != VM_OBJECT_NULL && object->phys_contiguous)
3417 return;
3418
3419 fault_info.interruptible = THREAD_UNINT;
3420 fault_info.behavior = entry->behavior;
3421 fault_info.user_tag = entry->alias;
3422 fault_info.lo_offset = entry->offset;
3423 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
3424 fault_info.no_cache = entry->no_cache;
3425
3426 /*
3427 * Since the pages are wired down, we must be able to
3428 * get their mappings from the physical map system.
3429 */
3430
3431 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3432
3433 if (pmap) {
3434 pmap_change_wiring(pmap,
3435 pmap_addr + (va - entry->vme_start), FALSE);
3436 }
3437 if (object == VM_OBJECT_NULL) {
3438 (void) vm_fault(map, va, VM_PROT_NONE,
3439 TRUE, THREAD_UNINT, pmap, pmap_addr);
3440 } else {
3441 vm_prot_t prot;
3442 vm_page_t result_page;
3443 vm_page_t top_page;
3444 vm_object_t result_object;
3445 vm_fault_return_t result;
3446
3447 fault_info.cluster_size = end_addr - va;
3448
3449 do {
3450 prot = VM_PROT_NONE;
3451
3452 vm_object_lock(object);
3453 vm_object_paging_begin(object);
3454 XPR(XPR_VM_FAULT,
3455 "vm_fault_unwire -> vm_fault_page\n",
3456 0,0,0,0,0);
3457 result = vm_fault_page(
3458 object,
3459 entry->offset + (va - entry->vme_start),
3460 VM_PROT_NONE, TRUE,
3461 &prot, &result_page, &top_page,
3462 (int *)0,
3463 NULL, map->no_zero_fill,
3464 FALSE, &fault_info);
3465 } while (result == VM_FAULT_RETRY);
3466
3467 /*
3468 * If this was a mapping to a file on a device that has been forcibly
3469 * unmounted, then we won't get a page back from vm_fault_page(). Just
3470 * move on to the next one in case the remaining pages are mapped from
3471 * different objects. During a forced unmount, the object is terminated
3472 * so the alive flag will be false if this happens. A forced unmount
3473 * will occur when an external disk is unplugged before the user does an
3474 * eject, so we don't want to panic in that situation.
3475 */
3476
3477 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
3478 continue;
3479
3480 if (result != VM_FAULT_SUCCESS)
3481 panic("vm_fault_unwire: failure");
3482
3483 result_object = result_page->object;
3484
3485 if (deallocate) {
3486 assert(result_page->phys_page !=
3487 vm_page_fictitious_addr);
3488 pmap_disconnect(result_page->phys_page);
3489 VM_PAGE_FREE(result_page);
3490 } else {
3491 vm_page_lockspin_queues();
3492 vm_page_unwire(result_page);
3493 vm_page_unlock_queues();
3494 PAGE_WAKEUP_DONE(result_page);
3495 }
3496 vm_fault_cleanup(result_object, top_page);
3497 }
3498 }
3499
3500 /*
3501 * Inform the physical mapping system that the range
3502 * of addresses may fault, so that page tables and
3503 * such may be unwired themselves.
3504 */
3505
3506 pmap_pageable(pmap, pmap_addr,
3507 pmap_addr + (end_addr - entry->vme_start), TRUE);
3508
3509 }
3510
3511 /*
3512 * vm_fault_wire_fast:
3513 *
3514 * Handle common case of a wire down page fault at the given address.
3515 * If successful, the page is inserted into the associated physical map.
3516 * The map entry is passed in to avoid the overhead of a map lookup.
3517 *
3518 * NOTE: the given address should be truncated to the
3519 * proper page address.
3520 *
3521 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
3522 * a standard error specifying why the fault is fatal is returned.
3523 *
3524 * The map in question must be referenced, and remains so.
3525 * Caller has a read lock on the map.
3526 *
3527 * This is a stripped version of vm_fault() for wiring pages. Anything
3528 * other than the common case will return KERN_FAILURE, and the caller
3529 * is expected to call vm_fault().
3530 */
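/*
 * [Editor's sketch] The fast/slow split described above, as seen from
 * the caller's side.  This mirrors the per-page loop in vm_fault_wire()
 * earlier in this file; the variable names follow that loop.
 */
#if 0	/* illustrative only -- not compiled */
	kr = vm_fault_wire_fast(map, va, entry, pmap,
				pmap_addr + (va - entry->vme_start));
	if (kr != KERN_SUCCESS) {
		/* uncommon case: take the full fault path, wiring the page */
		kr = vm_fault(map, va, VM_PROT_NONE, TRUE,
			      (pmap == kernel_pmap) ?
					THREAD_UNINT : THREAD_ABORTSAFE,
			      pmap, pmap_addr + (va - entry->vme_start));
	}
#endif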
3531 kern_return_t
3532 vm_fault_wire_fast(
3533 __unused vm_map_t map,
3534 vm_map_offset_t va,
3535 vm_map_entry_t entry,
3536 pmap_t pmap,
3537 vm_map_offset_t pmap_addr)
3538 {
3539 vm_object_t object;
3540 vm_object_offset_t offset;
3541 register vm_page_t m;
3542 vm_prot_t prot;
3543 thread_t thread = current_thread();
3544 int type_of_fault;
3545 kern_return_t kr;
3546
3547 VM_STAT_INCR(faults);
3548
3549 if (thread != THREAD_NULL && thread->task != TASK_NULL)
3550 thread->task->faults++;
3551
3552 /*
3553 * Recovery actions
3554 */
3555
3556 #undef RELEASE_PAGE
3557 #define RELEASE_PAGE(m) { \
3558 PAGE_WAKEUP_DONE(m); \
3559 vm_page_lockspin_queues(); \
3560 vm_page_unwire(m); \
3561 vm_page_unlock_queues(); \
3562 }
3563
3564
3565 #undef UNLOCK_THINGS
3566 #define UNLOCK_THINGS { \
3567 vm_object_paging_end(object); \
3568 vm_object_unlock(object); \
3569 }
3570
3571 #undef UNLOCK_AND_DEALLOCATE
3572 #define UNLOCK_AND_DEALLOCATE { \
3573 UNLOCK_THINGS; \
3574 vm_object_deallocate(object); \
3575 }
3576 /*
3577 * Give up and have caller do things the hard way.
3578 */
3579
3580 #define GIVE_UP { \
3581 UNLOCK_AND_DEALLOCATE; \
3582 return(KERN_FAILURE); \
3583 }
3584
3585
3586 /*
3587 * If this entry is not directly to a vm_object, bail out.
3588 */
3589 if (entry->is_sub_map)
3590 return(KERN_FAILURE);
3591
3592 /*
3593 * Find the backing store object and offset into it.
3594 */
3595
3596 object = entry->object.vm_object;
3597 offset = (va - entry->vme_start) + entry->offset;
3598 prot = entry->protection;
3599
3600 /*
3601 * Make a reference to this object to prevent its
3602 * disposal while we are messing with it.
3603 */
3604
3605 vm_object_lock(object);
3606 vm_object_reference_locked(object);
3607 vm_object_paging_begin(object);
3608
3609 /*
3610 * INVARIANTS (through entire routine):
3611 *
3612 * 1) At all times, we must either have the object
3613 * lock or a busy page in some object to prevent
3614 * some other thread from trying to bring in
3615 * the same page.
3616 *
3617 * 2) Once we have a busy page, we must remove it from
3618 * the pageout queues, so that the pageout daemon
3619 * will not grab it away.
3620 *
3621 */
3622
3623 /*
3624 * Look for page in top-level object. If it's not there or
3625 * there's something going on, give up.
3626 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
3627 * decrypt the page before wiring it down.
3628 */
3629 m = vm_page_lookup(object, offset);
3630 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
3631 (m->unusual && ( m->error || m->restart || m->absent))) {
3632
3633 GIVE_UP;
3634 }
3635 ASSERT_PAGE_DECRYPTED(m);
3636
3637 if (m->fictitious &&
3638 m->phys_page == vm_page_guard_addr) {
3639 /*
3640 * Guard pages are fictitious pages and are never
3641 * entered into a pmap, so let's say it's been wired...
3642 */
3643 kr = KERN_SUCCESS;
3644 goto done;
3645 }
3646
3647 /*
3648 * Wire the page down now. All bail outs beyond this
3649 * point must unwire the page.
3650 */
3651
3652 vm_page_lockspin_queues();
3653 vm_page_wire(m);
3654 vm_page_unlock_queues();
3655
3656 /*
3657 * Mark page busy for other threads.
3658 */
3659 assert(!m->busy);
3660 m->busy = TRUE;
3661 assert(!m->absent);
3662
3663 /*
3664 * Give up if the page is being written and there's a copy object
3665 */
3666 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
3667 RELEASE_PAGE(m);
3668 GIVE_UP;
3669 }
3670
3671 /*
3672 * Put this page into the physical map.
3673 */
3674 type_of_fault = DBG_CACHE_HIT_FAULT;
3675 kr = vm_fault_enter(m,
3676 pmap,
3677 pmap_addr,
3678 prot,
3679 TRUE,
3680 FALSE,
3681 FALSE,
3682 &type_of_fault);
3683
3684 done:
3685 /*
3686 * Unlock everything, and return
3687 */
3688
3689 PAGE_WAKEUP_DONE(m);
3690 UNLOCK_AND_DEALLOCATE;
3691
3692 return kr;
3693
3694 }
3695
3696 /*
3697 * Routine: vm_fault_copy_cleanup
3698 * Purpose:
3699 * Release a page used by vm_fault_copy.
3700 */
3701
3702 void
3703 vm_fault_copy_cleanup(
3704 vm_page_t page,
3705 vm_page_t top_page)
3706 {
3707 vm_object_t object = page->object;
3708
3709 vm_object_lock(object);
3710 PAGE_WAKEUP_DONE(page);
3711 vm_page_lockspin_queues();
3712 if (!page->active && !page->inactive && !page->throttled)
3713 vm_page_activate(page);
3714 vm_page_unlock_queues();
3715 vm_fault_cleanup(object, top_page);
3716 }
3717
3718 void
3719 vm_fault_copy_dst_cleanup(
3720 vm_page_t page)
3721 {
3722 vm_object_t object;
3723
3724 if (page != VM_PAGE_NULL) {
3725 object = page->object;
3726 vm_object_lock(object);
3727 vm_page_lockspin_queues();
3728 vm_page_unwire(page);
3729 vm_page_unlock_queues();
3730 vm_object_paging_end(object);
3731 vm_object_unlock(object);
3732 }
3733 }
3734
3735 /*
3736 * Routine: vm_fault_copy
3737 *
3738 * Purpose:
3739 * Copy pages from one virtual memory object to another --
3740 * neither the source nor destination pages need be resident.
3741 *
3742 * Before actually copying a page, the version associated with
3743 * the destination address map will be verified.
3744 *
3745 * In/out conditions:
3746 * The caller must hold a reference, but not a lock, to
3747 * each of the source and destination objects and to the
3748 * destination map.
3749 *
3750 * Results:
3751 * Returns KERN_SUCCESS if no errors were encountered in
3752 * reading or writing the data. Returns MACH_SEND_INTERRUPTED if
3753 * the operation was interrupted (only possible if the
3754 * "interruptible" argument is asserted). Other return values
3755 * indicate a permanent error in copying the data.
3756 *
3757 * The actual amount of data copied will be returned in the
3758 * "copy_size" argument. In the event that the destination map
3759 * verification failed, this amount may be less than the amount
3760 * requested.
3761 */
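/*
 * [Editor's sketch] A hedged example of the INOUT "copy_size" contract
 * described above.  "len", "src_object", "dst_object", "dst_map" and
 * "dst_version" are illustrative names; the version would come from an
 * earlier lookup on the destination map.
 */
#if 0	/* illustrative only -- not compiled */
	vm_map_size_t	copy_size = len;	/* bytes requested */
	kern_return_t	copy_kr;

	copy_kr = vm_fault_copy(src_object, src_offset,
				&copy_size,		/* INOUT: bytes copied */
				dst_object, dst_offset,
				dst_map, &dst_version,
				THREAD_ABORTSAFE);
	if (copy_kr != KERN_SUCCESS || copy_size < len) {
		/*
		 * either a permanent error, an interruption, or the
		 * destination map changed under us; only copy_size
		 * bytes were actually copied.
		 */
	}
#endif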
3762 kern_return_t
3763 vm_fault_copy(
3764 vm_object_t src_object,
3765 vm_object_offset_t src_offset,
3766 vm_map_size_t *copy_size, /* INOUT */
3767 vm_object_t dst_object,
3768 vm_object_offset_t dst_offset,
3769 vm_map_t dst_map,
3770 vm_map_version_t *dst_version,
3771 int interruptible)
3772 {
3773 vm_page_t result_page;
3774
3775 vm_page_t src_page;
3776 vm_page_t src_top_page;
3777 vm_prot_t src_prot;
3778
3779 vm_page_t dst_page;
3780 vm_page_t dst_top_page;
3781 vm_prot_t dst_prot;
3782
3783 vm_map_size_t amount_left;
3784 vm_object_t old_copy_object;
3785 kern_return_t error = 0;
3786
3787 vm_map_size_t part_size;
3788 struct vm_object_fault_info fault_info_src;
3789 struct vm_object_fault_info fault_info_dst;
3790
3791 /*
3792 * In order not to confuse the clustered pageins, align
3793 * the different offsets on a page boundary.
3794 */
3795
3796 #define RETURN(x) \
3797 MACRO_BEGIN \
3798 *copy_size -= amount_left; \
3799 MACRO_RETURN(x); \
3800 MACRO_END
3801
3802 amount_left = *copy_size;
3803
3804 fault_info_src.interruptible = interruptible;
3805 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
3806 fault_info_src.user_tag = 0;
3807 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
3808 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
3809 fault_info_src.no_cache = FALSE;
3810
3811 fault_info_dst.interruptible = interruptible;
3812 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
3813 fault_info_dst.user_tag = 0;
3814 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
3815 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
3816 fault_info_dst.no_cache = FALSE;
3817
3818 do { /* while (amount_left > 0) */
3819 /*
3820 * There may be a deadlock if both source and destination
3821 * pages are the same. To avoid this deadlock, the copy must
3822 * start by getting the destination page in order to apply
3823 * COW semantics if any.
3824 */
3825
3826 RetryDestinationFault: ;
3827
3828 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
3829
3830 vm_object_lock(dst_object);
3831 vm_object_paging_begin(dst_object);
3832
3833 fault_info_dst.cluster_size = amount_left;
3834
3835 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
3836 switch (vm_fault_page(dst_object,
3837 vm_object_trunc_page(dst_offset),
3838 VM_PROT_WRITE|VM_PROT_READ,
3839 FALSE,
3840 &dst_prot, &dst_page, &dst_top_page,
3841 (int *)0,
3842 &error,
3843 dst_map->no_zero_fill,
3844 FALSE, &fault_info_dst)) {
3845 case VM_FAULT_SUCCESS:
3846 break;
3847 case VM_FAULT_RETRY:
3848 goto RetryDestinationFault;
3849 case VM_FAULT_MEMORY_SHORTAGE:
3850 if (vm_page_wait(interruptible))
3851 goto RetryDestinationFault;
3852 /* fall thru */
3853 case VM_FAULT_INTERRUPTED:
3854 RETURN(MACH_SEND_INTERRUPTED);
3855 case VM_FAULT_MEMORY_ERROR:
3856 if (error)
3857 return (error);
3858 else
3859 return(KERN_MEMORY_ERROR);
3860 }
3861 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
3862
3863 old_copy_object = dst_page->object->copy;
3864
3865 /*
3866 * There exists the possibility that the source and
3867 * destination page are the same. But we can't
3868 * easily determine that now. If they are the
3869 * same, the call to vm_fault_page() for the
3870 * destination page will deadlock. To prevent this we
3871 * wire the page so we can drop busy without having
3872 * the page daemon steal the page. We clean up the
3873 * top page but keep the paging reference on the object
3874 * holding the dest page so it doesn't go away.
3875 */
3876
3877 vm_page_lockspin_queues();
3878 vm_page_wire(dst_page);
3879 vm_page_unlock_queues();
3880 PAGE_WAKEUP_DONE(dst_page);
3881 vm_object_unlock(dst_page->object);
3882
3883 if (dst_top_page != VM_PAGE_NULL) {
3884 vm_object_lock(dst_object);
3885 VM_PAGE_FREE(dst_top_page);
3886 vm_object_paging_end(dst_object);
3887 vm_object_unlock(dst_object);
3888 }
3889
3890 RetrySourceFault: ;
3891
3892 if (src_object == VM_OBJECT_NULL) {
3893 /*
3894 * No source object. We will just
3895 * zero-fill the page in dst_object.
3896 */
3897 src_page = VM_PAGE_NULL;
3898 result_page = VM_PAGE_NULL;
3899 } else {
3900 vm_object_lock(src_object);
3901 src_page = vm_page_lookup(src_object,
3902 vm_object_trunc_page(src_offset));
3903 if (src_page == dst_page) {
3904 src_prot = dst_prot;
3905 result_page = VM_PAGE_NULL;
3906 } else {
3907 src_prot = VM_PROT_READ;
3908 vm_object_paging_begin(src_object);
3909
3910 fault_info_src.cluster_size = amount_left;
3911
3912 XPR(XPR_VM_FAULT,
3913 "vm_fault_copy(2) -> vm_fault_page\n",
3914 0,0,0,0,0);
3915 switch (vm_fault_page(
3916 src_object,
3917 vm_object_trunc_page(src_offset),
3918 VM_PROT_READ, FALSE,
3919 &src_prot,
3920 &result_page, &src_top_page,
3921 (int *)0, &error, FALSE,
3922 FALSE, &fault_info_src)) {
3923
3924 case VM_FAULT_SUCCESS:
3925 break;
3926 case VM_FAULT_RETRY:
3927 goto RetrySourceFault;
3928 case VM_FAULT_MEMORY_SHORTAGE:
3929 if (vm_page_wait(interruptible))
3930 goto RetrySourceFault;
3931 /* fall thru */
3932 case VM_FAULT_INTERRUPTED:
3933 vm_fault_copy_dst_cleanup(dst_page);
3934 RETURN(MACH_SEND_INTERRUPTED);
3935 case VM_FAULT_MEMORY_ERROR:
3936 vm_fault_copy_dst_cleanup(dst_page);
3937 if (error)
3938 return (error);
3939 else
3940 return(KERN_MEMORY_ERROR);
3941 }
3942
3943
3944 assert((src_top_page == VM_PAGE_NULL) ==
3945 (result_page->object == src_object));
3946 }
3947 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
3948 vm_object_unlock(result_page->object);
3949 }
3950
3951 if (!vm_map_verify(dst_map, dst_version)) {
3952 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3953 vm_fault_copy_cleanup(result_page, src_top_page);
3954 vm_fault_copy_dst_cleanup(dst_page);
3955 break;
3956 }
3957
3958 vm_object_lock(dst_page->object);
3959
3960 if (dst_page->object->copy != old_copy_object) {
3961 vm_object_unlock(dst_page->object);
3962 vm_map_verify_done(dst_map, dst_version);
3963 if (result_page != VM_PAGE_NULL && src_page != dst_page)
3964 vm_fault_copy_cleanup(result_page, src_top_page);
3965 vm_fault_copy_dst_cleanup(dst_page);
3966 break;
3967 }
3968 vm_object_unlock(dst_page->object);
3969
3970 /*
3971 * Copy the page, and note that it is dirty
3972 * immediately.
3973 */
3974
3975 if (!page_aligned(src_offset) ||
3976 !page_aligned(dst_offset) ||
3977 !page_aligned(amount_left)) {
3978
3979 vm_object_offset_t src_po,
3980 dst_po;
3981
3982 src_po = src_offset - vm_object_trunc_page(src_offset);
3983 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
3984
3985 if (dst_po > src_po) {
3986 part_size = PAGE_SIZE - dst_po;
3987 } else {
3988 part_size = PAGE_SIZE - src_po;
3989 }
3990 if (part_size > (amount_left)){
3991 part_size = amount_left;
3992 }
3993
3994 if (result_page == VM_PAGE_NULL) {
3995 vm_page_part_zero_fill(dst_page,
3996 dst_po, part_size);
3997 } else {
3998 vm_page_part_copy(result_page, src_po,
3999 dst_page, dst_po, part_size);
4000 if(!dst_page->dirty){
4001 vm_object_lock(dst_object);
4002 dst_page->dirty = TRUE;
4003 vm_object_unlock(dst_page->object);
4004 }
4005
4006 }
4007 } else {
4008 part_size = PAGE_SIZE;
4009
4010 if (result_page == VM_PAGE_NULL)
4011 vm_page_zero_fill(dst_page);
4012 else{
4013 vm_page_copy(result_page, dst_page);
4014 if(!dst_page->dirty){
4015 vm_object_lock(dst_object);
4016 dst_page->dirty = TRUE;
4017 vm_object_unlock(dst_page->object);
4018 }
4019 }
4020
4021 }
4022
4023 /*
4024 * Unlock everything, and return
4025 */
4026
4027 vm_map_verify_done(dst_map, dst_version);
4028
4029 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4030 vm_fault_copy_cleanup(result_page, src_top_page);
4031 vm_fault_copy_dst_cleanup(dst_page);
4032
4033 amount_left -= part_size;
4034 src_offset += part_size;
4035 dst_offset += part_size;
4036 } while (amount_left > 0);
4037
4038 RETURN(KERN_SUCCESS);
4039 #undef RETURN
4040
4041 /*NOTREACHED*/
4042 }
4043
4044 #if VM_FAULT_CLASSIFY
4045 /*
4046 * Temporary statistics gathering support.
4047 */
4048
4049 /*
4050 * Statistics arrays:
4051 */
4052 #define VM_FAULT_TYPES_MAX 5
4053 #define VM_FAULT_LEVEL_MAX 8
4054
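/* Fault counts, indexed by fault type and by shadow-chain depth. */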
4055 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4056
4057 #define VM_FAULT_TYPE_ZERO_FILL 0
4058 #define VM_FAULT_TYPE_MAP_IN 1
4059 #define VM_FAULT_TYPE_PAGER 2
4060 #define VM_FAULT_TYPE_COPY 3
4061 #define VM_FAULT_TYPE_OTHER 4
4062
4063
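/*
 * Classify a fault on the given object/offset by walking the shadow
 * chain: zero-fill, map-in, pager, copy, or other, bucketed by the
 * shadow depth at which the lookup stopped.  Results accumulate in
 * vm_fault_stats[type][level].
 */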
4064 void
4065 vm_fault_classify(vm_object_t object,
4066 vm_object_offset_t offset,
4067 vm_prot_t fault_type)
4068 {
4069 int type, level = 0;
4070 vm_page_t m;
4071
4072 while (TRUE) {
4073 m = vm_page_lookup(object, offset);
4074 if (m != VM_PAGE_NULL) {
4075 if (m->busy || m->error || m->restart || m->absent) {
4076 type = VM_FAULT_TYPE_OTHER;
4077 break;
4078 }
4079 if (((fault_type & VM_PROT_WRITE) == 0) ||
4080 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4081 type = VM_FAULT_TYPE_MAP_IN;
4082 break;
4083 }
4084 type = VM_FAULT_TYPE_COPY;
4085 break;
4086 }
4087 else {
4088 if (object->pager_created) {
4089 type = VM_FAULT_TYPE_PAGER;
4090 break;
4091 }
4092 if (object->shadow == VM_OBJECT_NULL) {
4093 type = VM_FAULT_TYPE_ZERO_FILL;
4094 break;
4095 }
4096
4097 offset += object->shadow_offset;
4098 object = object->shadow;
4099 level++;
4100 continue;
4101 }
4102 }
4103
4104 if (level >= VM_FAULT_LEVEL_MAX)
4105 level = VM_FAULT_LEVEL_MAX - 1; /* clamp into the last bucket */
4106
4107 vm_fault_stats[type][level] += 1;
4108
4109 return;
4110 }
4111
4112 /* cleanup routine to call from debugger */
4113
4114 void
4115 vm_fault_classify_init(void)
4116 {
4117 int type, level;
4118
4119 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4120 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4121 vm_fault_stats[type][level] = 0;
4122 }
4123 }
4124
4125 return;
4126 }
4127 #endif /* VM_FAULT_CLASSIFY */
4128
4129
4130 extern int cs_validation;
4131
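/*
 * Validate the code signature of a single page: map the page into
 * the kernel, fetch the code-signing blobs from the backing vnode
 * pager, check the page's hash, and record the result in the page's
 * cs_validated/cs_tainted bits.  The caller must hold the page's
 * VM object lock.
 */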
4132 void
4133 vm_page_validate_cs(
4134 vm_page_t page)
4135 {
4136 vm_object_t object;
4137 vm_object_offset_t offset;
4138 vm_map_offset_t koffset;
4139 vm_map_size_t ksize;
4140 vm_offset_t kaddr;
4141 kern_return_t kr;
4142 memory_object_t pager;
4143 void *blobs;
4144 boolean_t validated, tainted;
4145 boolean_t busy_page;
4146
4147 vm_object_lock_assert_held(page->object);
4148
4149 if (!cs_validation) {
4150 return;
4151 }
4152
4153 if (page->cs_validated && !page->cs_tainted && page->wpmapped) {
4154 vm_object_lock_assert_exclusive(page->object);
4155
4156 /*
4157 * This page has already been validated and found to
4158 * be valid. However, it was mapped for "write" access
4159 * sometime in the past, so we have to check if it was
4160 * modified. If so, it needs to be revalidated.
4161 * If the page was already found to be "tainted", no
4162 * need to re-validate.
4163 */
4164 if (!page->dirty) {
4165 vm_cs_query_modified++;
4166 page->dirty = pmap_is_modified(page->phys_page);
4167 }
4168 if (page->dirty) {
4169 /*
4170 * The page is dirty, so let's clear its
4171 * "validated" bit and re-validate it.
4172 */
4173 if (cs_debug) {
4174 printf("CODESIGNING: vm_page_validate_cs: "
4175 "page %p obj %p off 0x%llx "
4176 "was modified\n",
4177 page, page->object, page->offset);
4178 }
4179 page->cs_validated = FALSE;
4180 vm_cs_validated_dirtied++;
4181 }
4182 }
4183
4184 if (page->cs_validated) {
4185 return;
4186 }
4187
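/*
 * The page still needs (re)validation; from here on its state is
 * modified, so the object lock must be held exclusively.
 */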
4188 vm_object_lock_assert_exclusive(page->object);
4189
4190 vm_cs_validates++;
4191
4192 object = page->object;
4193 assert(object->code_signed);
4194 offset = page->offset;
4195
4196 busy_page = page->busy;
4197 if (!busy_page) {
4198 /* keep page busy while we map (and unlock) the VM object */
4199 page->busy = TRUE;
4200 }
4201
4202 /*
4203 * Take a paging reference on the VM object
4204 * to protect it from collapse or bypass,
4205 * and keep it from disappearing too.
4206 */
4207 vm_object_paging_begin(object);
4208
4209 /* map the page in the kernel address space */
4210 koffset = 0;
4211 ksize = PAGE_SIZE_64;
4212 kr = vm_paging_map_object(&koffset,
4213 page,
4214 object,
4215 offset,
4216 &ksize,
4217 FALSE); /* can't unlock object ! */
4218 if (kr != KERN_SUCCESS) {
4219 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4220 }
4221 kaddr = CAST_DOWN(vm_offset_t, koffset);
4222
4223 /*
4224 * Since we get here to validate a page that was brought in by
4225 * the pager, we know that this pager is all set up and ready
4226 * by now.
4227 */
4228 assert(!object->internal);
4229 assert(object->pager != NULL);
4230 assert(object->pager_ready);
4231
4232 if (!object->alive || object->terminating || object->pager == NULL) {
4233 /*
4234 * The object is dead or being terminated, or it no longer has
4235 * a pager, so we can't validate the data...
4236 */
4237 goto out;
4238 }
4239
4240 pager = object->pager;
4241 assert(pager != NULL);
4242
4243 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4244 if (kr != KERN_SUCCESS) {
4245 blobs = NULL;
4246 }
4247
4248 /* verify the SHA1 hash for this page */
4249 validated = cs_validate_page(blobs,
4250 offset + object->paging_offset,
4251 (const void *)kaddr,
4252 &tainted);
4253
4254 assert(page->busy);
4255 assert(object == page->object);
4256 vm_object_lock_assert_exclusive(object);
4257
4258 page->cs_validated = validated;
4259 if (validated) {
4260 page->cs_tainted = tainted;
4261 }
4262
4263 out:
4264 if (!busy_page) {
4265 PAGE_WAKEUP_DONE(page);
4266 }
4267 if (koffset != 0) {
4268 /* unmap the page from the kernel address space */
4269 vm_paging_unmap_object(object, koffset, koffset + ksize);
4270 koffset = 0;
4271 ksize = 0;
4272 kaddr = 0;
4273 }
4274 vm_object_paging_end(object);
4275 }